In [None]:
import numpy as np
import pickle
import pandas as pd

from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

Source: https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb


First, load the data set:

In [None]:
# load the data from the csv file
df = pd.read_csv("/content/drive/MyDrive/1Jupyter/SCRIPTIE/data.csv")

# load the mlb files back in, to get the classes and transform functions
with open("/content/drive/MyDrive/1Jupyter/SCRIPTIE/dom_mlb.pkl", "rb") as f:
    dom_mlb = pickle.load(f)

with open("/content/drive/MyDrive/1Jupyter/SCRIPTIE/sub_mlb.pkl", "rb") as f:
    sub_mlb = pickle.load(f)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Next, we want to tokenize all elements in X:

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
df["tokenized_text"] = df["text"].apply(lambda x: tokenizer(x, add_special_tokens=False))
print(type(df["tokenized_text"][2]))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors


<class 'transformers.tokenization_utils_base.BatchEncoding'>


After tokenizing, we need to split the text up into chunks of 512, so the sentence bert model can generate its embeddings.

In [33]:
def chunk_text(tokens, chunk_size=512):
  if len(tokens["input_ids"]) <= chunk_size:
    return [tokenizer.decode(tokens["input_ids"])]

  chunks = []
  for i in range(0, len(tokens["input_ids"]), chunk_size):
      chunk = {k: t[i:i + chunk_size] for k, t in tokens.items()}
      chunks.append(tokenizer.decode(chunk["input_ids"]))

  return chunks

In [34]:
df["chunked_text"] = df["tokenized_text"].apply(lambda x: chunk_text(x, 512))

The texts are successfully split into chunks with a maximum length of 512, so now let's generate the embeddings. We used an SBert model for this, but

In [35]:
bertmodel = SentenceTransformer("all-mpnet-base-v2")



In [44]:
# this code generates the dataframe of embeddings. Per document, it encodes every chunk, and averages it into the final embedding by the number of words per chunk
df["embeddings"] = df["chunked_text"].apply(
    lambda x: np.average([bertmodel.encode(i) for i in x],
                         weights = [len(i) for i in x],
                         axis = 0))

In [45]:
print(df["embeddings"])
print(len(df["embeddings"][0]))

0      [0.052875560662682466, 0.006566362972984408, 0...
1      [-0.014265622943639755, 0.07552964240312576, 0...
2      [0.061845727549616696, 0.0131503199399182, 0.0...
3      [0.0640306249429087, -0.030271484169680164, 8....
4      [-0.012554287910461426, 0.020935364067554474, ...
                             ...                        
621    [-0.016552619636058807, -0.0051891556940972805...
622    [-0.004387491492365124, 0.05862265675502308, -...
623    [0.023696497082710266, -0.02204945497214794, 0...
624    [0.015810714544066278, 0.04745909140383168, 0....
625    [0.03934312239289284, 0.003352779895067215, 0....
Name: embeddings, Length: 626, dtype: object
768


Lastly, lets save the embeddings now that we have them. The embeddings are saved as a pickle file.

In [74]:
with open("/content/drive/MyDrive/1Jupyter/SCRIPTIE/embeddings.pkl", "wb") as f:
    pickle.dump(df["embeddings"], f)

In [75]:
with open("/content/drive/MyDrive/1Jupyter/SCRIPTIE/embeddings.pkl", "rb") as f:
    df["embeddings"] = pickle.load(f)

Now that all of the texts are embedded, let's try predicting something. So let's predict the dominant classes with a logistic regression classifier.

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# when training on sub categories, some simply do not have enough data in the 600 point dataset... so for now only train it on the dominant categories
def logistic_regression_classifier(X_train, y_train, X_test):
    lr = MultiOutputClassifier(LogisticRegression(class_weight="balanced", solver= "liblinear", max_iter=100))
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)
    return y_pred


Now, lets try to make some predictions using the bert embeddings:

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = np.vstack(df["embeddings"].values)
y = df[dom_mlb.classes_].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

y_pred = logistic_regression_classifier(X_train=X_train,y_train=y_train,X_test=X_test)

print(classification_report(y_true=y_test,y_pred=y_pred,target_names=dom_mlb.classes_))

                                                        precision    recall  f1-score   support

                          CC: Amplifying Climate Fears       0.54      1.00      0.70         7
                      CC: Climate change is beneficial       0.00      0.00      0.00         0
              CC: Controversy about green technologies       0.20      1.00      0.33         1
                     CC: Criticism of climate movement       0.44      1.00      0.62         4
                     CC: Criticism of climate policies       0.40      0.80      0.53         5
         CC: Criticism of institutions and authorities       0.40      1.00      0.57         6
                        CC: Downplaying climate change       0.22      1.00      0.36         2
       CC: Green policies are geopolitical instruments       0.00      0.00      0.00         0
 CC: Hidden plots by secret schemes of powerful groups       0.29      0.67      0.40         3
          CC: Questioning the measureme

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Not amazing, but at least we now know the embedding is working as it should!