In [24]:
import numpy as np
import pickle
import pandas as pd

from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

Source: https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb


First, load the data set:

In [25]:
# load the data from the csv file
df = pd.read_csv("/content/drive/MyDrive/1Jupyter/SCRIPTIE/data.csv")

# load the mlb files back in, to get the classes and transform functions
with open("/content/drive/MyDrive/1Jupyter/SCRIPTIE/dom_mlb.pkl", "rb") as f:
    dom_mlb = pickle.load(f)

with open("/content/drive/MyDrive/1Jupyter/SCRIPTIE/sub_mlb.pkl", "rb") as f:
    sub_mlb = pickle.load(f)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Next, we want to tokenize all elements in X:

In [26]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
df["tokenized_text"] = df["text"].apply(lambda x: tokenizer(x, add_special_tokens=False))
print(type(df["tokenized_text"][2]))

Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors


<class 'transformers.tokenization_utils_base.BatchEncoding'>


After tokenizing, we need to split the text up into chunks of 512, so the sentence bert model can generate its embeddings.

In [27]:
def chunk_text(tokens, chunk_size=512):
  if len(tokens["input_ids"]) <= chunk_size:
    #print(tokens["input_ids"], "no chunk needed")
    return [tokenizer.decode(tokens["input_ids"])]

  chunks = []
  for i in range(0, len(tokens["input_ids"]), chunk_size):
      chunk = {k: t[i:i + chunk_size] for k, t in tokens.items()}
      chunks.append(tokenizer.decode(chunk["input_ids"]))

  return chunks

In [28]:
chunked_text = df["tokenized_text"].apply(lambda x: chunk_text(x, 512))
print(chunked_text)

0      [putin honours army unit blamed for bucha mass...
1      [europe putin thanks us journalist tucker carl...
2      [russia has a clear plan to resolve the confli...
3      [first war of tiktok era sees tragedy, humor a...
4      [ukraine's president zelenskyy to address mexi...
                             ...                        
621    [united kingdom : the country has been invaded...
622    [" the fight against climate change is not on ...
623    [eu pulls out of hungarian foreign ministers'm...
624    [the new bonfires of the inquisition 1. the re...
625    [russia is giving up on kharkiv after ‘ failur...
Name: tokenized_text, Length: 626, dtype: object


In [29]:
bertmodel = SentenceTransformer("all-mpnet-base-v2")



In [30]:
# maybe find a way to weigh the mean with the length of the text?
df["embeddings"] = chunked_text.apply(lambda x: np.mean([bertmodel.encode(i) for i in x], axis = 0))

In [31]:
print(df["embeddings"])

0      [0.05744616, 0.008171466, 0.039495505, -0.0038...
1      [-0.014265623, 0.07552964, 0.0051879473, 0.003...
2      [0.061988987, 0.007321683, 0.01204624, 0.01062...
3      [0.0643855, -0.030406523, -0.00027157273, -0.0...
4      [-0.012554288, 0.020935364, 0.0065615587, 0.02...
                             ...                        
621    [-0.01655262, -0.0051891557, -0.0009252665, -0...
622    [-0.005790931, 0.07960883, 0.0007383786, 0.038...
623    [0.023696497, -0.022049455, 0.0069245356, 0.00...
624    [0.008318179, 0.040885136, 0.016485114, 0.0215...
625    [0.039343122, 0.00335278, 0.018099703, 0.00574...
Name: embeddings, Length: 626, dtype: object


Now that all of the texts are embedded, we want to predict something. So let's predict the dominant classes with a logistic regression classifier. The lr classifier is as follows:

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# when training on sub categories, some simply do not have enough data in the 600 point dataset... so for now only train it on the dominant categories
def logistic_regression_classifier(X_train, y_train, X_test):
    lr = MultiOutputClassifier(LogisticRegression(class_weight="balanced", solver= "liblinear", max_iter=100))
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)
    return y_pred


Now, lets try to make some predictions using the bert embeddings:

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = np.vstack(df["embeddings"].values)
y = df[dom_mlb.classes_].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

y_pred = logistic_regression_classifier(X_train=X_train,y_train=y_train,X_test=X_test)

print(classification_report(y_true=y_test,y_pred=y_pred,target_names=dom_mlb.classes_))

                                                        precision    recall  f1-score   support

                          CC: Amplifying Climate Fears       0.56      0.97      0.71        39
                      CC: Climate change is beneficial       0.00      0.00      0.00         2
              CC: Controversy about green technologies       0.29      0.50      0.36         8
                     CC: Criticism of climate movement       0.34      0.86      0.49        14
                     CC: Criticism of climate policies       0.24      1.00      0.38        17
         CC: Criticism of institutions and authorities       0.39      0.97      0.55        29
                        CC: Downplaying climate change       0.18      1.00      0.31         7
       CC: Green policies are geopolitical instruments       0.00      0.00      0.00         1
 CC: Hidden plots by secret schemes of powerful groups       0.11      0.75      0.19         4
          CC: Questioning the measureme

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Let's see how well the model performs when utilizing LOOCV:

In [37]:
from sklearn.model_selection import LeaveOneOut
from collections import Counter

# function used by the majority classifier and the logistic regression with LOOCV to find the most commonly occuring label in a data set
def find_majority(y_train):
    y_tuples = [tuple(y) for y in y_train]
    max_y = Counter(y_tuples).most_common(1)
    max_y = np.array(max_y[0][0])
    return max_y

# Does the logistic regression n times, if the data set is quite small
def logistic_regression_loocv(X, y, majority_ensemble = False, penalty ="l2", solver="liblinear", max_iter = 100):
    # initialize the cross validation
    cv = LeaveOneOut()
    cv.get_n_splits(X)

    # output arrays of the predicted label for that instance of y and the true label, so we can evaluate it later
    labels_pred = []
    labels_true = []

    # Most common label = Other
    max_y = find_majority(y)

    for i, (train_index, test_index) in enumerate(cv.split(X)):
        # Get the train and test instances for this fold
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]

        # LR doesn't work if one of the classes has no positive instances, skip if this happens
        valid_y = np.any(y_train != 0, axis=0)
        if not np.all(valid_y):
            print(f"Skipped fold {i}, labels invalid")
            continue

        # If not invalid, train the classifier
        lr = MultiOutputClassifier(LogisticRegression(class_weight="balanced", solver=solver, max_iter=max_iter, penalty=penalty))
        lr.fit(X_train,y_train)

        # Make prediction for the test instance
        y_pred = lr.predict(X_test)

        if(majority_ensemble):
            if np.all(y_pred == 0):
                y_pred = max_y

        labels_pred.append(y_pred)
        labels_true.append(y_test)

    # good format for classification report
    labels_pred = np.vstack(labels_pred)
    labels_true = np.vstack(labels_true)
    # print("Predicted Labels: ", labels_pred)
    # print("True Labels: ", labels_true)

    return labels_pred, labels_true

In [38]:
X = np.vstack(df["embeddings"].values)
y = df[dom_mlb.classes_].values

y_pred, y_true = logistic_regression_loocv(X,y,majority_ensemble=True)
print(classification_report(y_true = y_true, y_pred=y_pred, target_names=dom_mlb.classes_))

                                                        precision    recall  f1-score   support

                          CC: Amplifying Climate Fears       0.59      0.96      0.73        70
                      CC: Climate change is beneficial       0.08      0.67      0.14         3
              CC: Controversy about green technologies       0.20      0.80      0.31        10
                     CC: Criticism of climate movement       0.30      0.91      0.45        23
                     CC: Criticism of climate policies       0.31      0.89      0.46        38
         CC: Criticism of institutions and authorities       0.41      0.95      0.57        56
                        CC: Downplaying climate change       0.22      0.87      0.35        15
       CC: Green policies are geopolitical instruments       0.00      0.00      0.00         2
 CC: Hidden plots by secret schemes of powerful groups       0.18      0.73      0.29        11
          CC: Questioning the measureme