In [1]:
import numpy as np
import pickle
import pandas as pd
import os

from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

Source: https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb


First, load the data set:

In [2]:
# Read the dataset from the CSV file into a DataFrame.
df = pd.read_csv("../data/newdata.csv")

# Restore the pickled "mlb" objects so their classes and transform functions are
# available again (presumably fitted multi-label binarizers - TODO confirm).
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open("../pkl_files/dom_mlb.pkl", "rb") as dom_file, \
        open("../pkl_files/sub_mlb.pkl", "rb") as sub_file:
    dom_mlb = pickle.load(dom_file)
    sub_mlb = pickle.load(sub_file)


Next, we tokenize every document in the `text` column:

In [3]:
# Tokenize every document in df["text"] without adding special tokens and keep the
# resulting encoding next to the text. The "sequence length is longer than the
# specified maximum" warning is expected at this point: the texts are only
# tokenized here and are split into chunks in a later step.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
df["tokenized_text"] = df["text"].apply(
    lambda text: tokenizer(text, add_special_tokens=False)
)
# Sanity check: show what kind of object the tokenizer produced for one row.
print(type(df.loc[2, "tokenized_text"]))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors


<class 'transformers.tokenization_utils_base.BatchEncoding'>


After tokenizing, we need to split each text into chunks of at most 512 tokens, so that the Sentence-BERT models can generate embeddings for them.

In [4]:
def chunk_text(tokens, chunk_size=512, decode=None):
    """Split one tokenized document into decoded text chunks.

    Parameters
    ----------
    tokens : mapping
        Tokenizer output for a single document. Only ``tokens["input_ids"]``
        is read.
    chunk_size : int, default 512
        Maximum number of token ids per chunk. Must be positive.
    decode : callable, optional
        Function that turns a list of token ids back into a string. When
        omitted, ``tokenizer.decode`` of the notebook-level ``tokenizer`` is
        used (looked up at call time).

    Returns
    -------
    list of str
        The decoded chunks in document order. A document that fits in one
        chunk (including one with no tokens at all) yields a one-element list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive. A negative size would otherwise
        make ``range`` empty and silently return no chunks for the document.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    if decode is None:
        # Fall back to the tokenizer created earlier in the notebook.
        decode = tokenizer.decode

    # Only the ids are needed for decoding; the other encoding fields
    # do not have to be sliced.
    input_ids = tokens["input_ids"]

    if len(input_ids) <= chunk_size:
        return [decode(input_ids)]

    return [
        decode(input_ids[start:start + chunk_size])
        for start in range(0, len(input_ids), chunk_size)
    ]

In [5]:
# Split every tokenized document into decoded chunks of at most 512 tokens.
df["chunked_text"] = df["tokenized_text"].apply(chunk_text, chunk_size=512)

The texts are now split into chunks with a maximum length of 512 tokens, so let's generate the embeddings. We use SBERT (Sentence-BERT) models for this; several pretrained models are compared below.

In [6]:
# Generate one embedding per document from its text chunks.
def generate_embeddings(df, bertmodel):
    """Embed every document in ``df["chunked_text"]`` with ``bertmodel``.

    Each document is a list of text chunks. All chunks of a document are
    encoded and combined into a single vector by an average weighted with the
    chunk length in characters (``len(chunk)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"chunked_text"`` column holding a list of strings per row.
    bertmodel : object
        Model with an ``encode`` method that accepts a list of strings and
        returns one vector per string (e.g. a ``SentenceTransformer``).

    Returns
    -------
    pandas.Series
        One embedding vector per row, indexed like ``df``.
    """
    def embed_document(chunks):
        # A single encode() call per document lets the model batch the chunks
        # instead of running one forward pass per chunk.
        chunk_vectors = np.asarray(bertmodel.encode(list(chunks)))
        chunk_lengths = [len(chunk) for chunk in chunks]
        # np.average raises ZeroDivisionError when all weights are zero, which
        # happens for a document whose only chunk is an empty string. Fall back
        # to an unweighted mean in that case.
        weights = chunk_lengths if any(chunk_lengths) else None
        return np.average(chunk_vectors, weights=weights, axis=0)

    return df["chunked_text"].apply(embed_document)

Let's generate embeddings with a few different SBERT models:
https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

In [10]:
import sys

# Redirect stdout while the models are constructed, to reduce output spam.
# NOTE(review): the download progress bars still appear in this cell's recorded
# output, so they are apparently not written through sys.stdout; silencing them
# would need a different mechanism.
original_stdout = sys.stdout
with open(os.devnull, 'w') as devnull:  # closed automatically, no leaked handle
    sys.stdout = devnull
    try:
        # Pretrained sentence-embedding models to compare, keyed by a short name.
        SBERTmodels = {
            "mpnet": SentenceTransformer("all-mpnet-base-v2"),
            "multiqa": SentenceTransformer("multi-qa-mpnet-base-dot-v1"),
            "distilroberta": SentenceTransformer("all-distilroberta-v1"),
            "minilm": SentenceTransformer("all-MiniLM-L12-v2"),
            "paraphrasemultilang": SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
        }
    finally:
        # Always restore stdout, even when a model fails to load; otherwise
        # every later cell in the kernel would print nothing.
        sys.stdout = original_stdout

model.safetensors:  44%|####4     | 493M/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# when training on sub categories, some simply do not have enough data in the 600 point dataset... so for now only train it on the dominant categories
def logistic_regression_classifier(X_train, y_train, X_test):
    """Fit a multi-output logistic regression and predict labels for ``X_test``.

    A ``LogisticRegression`` (balanced class weights, liblinear solver,
    at most 100 iterations) is wrapped in a ``MultiOutputClassifier``, fitted
    on ``X_train`` / ``y_train`` and then used to predict ``X_test``.

    Returns the predictions produced by ``predict`` for ``X_test``.
    """
    base_estimator = LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=100)
    classifier = MultiOutputClassifier(base_estimator)
    # fit() returns the fitted estimator, so the prediction can be chained.
    return classifier.fit(X_train, y_train).predict(X_test)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Compare the SBERT models. For each model: embed every document (each chunk is
# encoded and the chunk vectors are averaged, weighted by chunk length in
# characters), train the classifier on a 90/10 split of the dominant-category
# labels, print the classification report and remember the samples-averaged F1.
f1_scores, embeddings = {}, {}

for model_name in SBERTmodels:
    model = SBERTmodels[model_name]
    embeddings[model_name] = generate_embeddings(df, model)

    # Stack the per-document vectors into a 2-D feature matrix.
    X = np.vstack(embeddings[model_name].values)
    y = df[dom_mlb.classes_].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
    y_pred = logistic_regression_classifier(X_train, y_train, X_test)

    # Arguments shared by the dict report (for the score) and the printed report.
    report_options = {
        "y_true": y_test,
        "y_pred": y_pred,
        "target_names": dom_mlb.classes_,
        "zero_division": 0.0,
    }
    class_report = classification_report(output_dict=True, **report_options)

    print(f"{model_name} embeddings:")
    print(classification_report(**report_options))
    print("----------------------------------------------------------------------------------")

    f1_scores[model_name] = class_report["samples avg"]['f1-score']

Not amazing, but at least we now know the embedding is working as it should!

Lastly, let's save the best embeddings now that we have them. The embeddings are saved as a pickle file. To choose them, we take the model with the highest sample-averaged F1 score, which is the metric we want to optimize for this task.

In [13]:
# Pick the model with the highest samples-averaged F1 score.
# max(f1_scores) on its own compares the dictionary *keys*, i.e. it returns the
# alphabetically greatest model name regardless of the scores;
# key=f1_scores.get ranks the names by their stored score instead.
best_model_name = max(f1_scores, key=f1_scores.get)
bestembeddings = embeddings[best_model_name]
print(f"Best model: {best_model_name} (samples avg F1 = {f1_scores[best_model_name]:.3f})")
print(bestembeddings)

# Persist the winning embeddings so they can be reloaded without re-encoding.
with open("../pkl_files/embeddings.pkl", "wb") as f:
    pickle.dump(bestembeddings, f)

Use the following code to load the saved embeddings file:

In [14]:
# Read the pickled embeddings back from disk and show them.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open("../pkl_files/embeddings.pkl", "rb") as embeddings_file:
    bestembeddings = pickle.load(embeddings_file)

print(bestembeddings)