In [None]:
from embedders.classification.contextual import TransformerSentenceEmbedder
import pandas as pd
import numpy as np
import json

# Load Model and Raw Data
For the model we use the `embedders` library, which makes generating embeddings really easy. (Alternatively, you can use "sentence-transformers", for example.)

We are also using the **kern export format** here, which is a simple JSON file that can be read by pandas directly.

If you're using a CSV from an Excel export instead, just adapt the loading code below!

In [None]:
# identifier of the pretrained transformer model handed to the sentence embedder
model_name = "distilbert-base-cased"
embedder = TransformerSentenceEmbedder(model_name)

In [None]:
path = "labeled_data_v1.json"
# JSON is UTF-8 by specification; pin the encoding so loading does not depend on
# the platform's default locale (which is not UTF-8 on every system)
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# build the working dataframe from the parsed export
df = pd.DataFrame(data)

In [None]:
# quick look at the first five rows of the loaded data
df.head(n=5)

In [None]:
# headline and body together carry the full context of an article, so join them
# into one text: "<headline>. <body>"
headline_with_separator = df["headline"] + ". "
df["merged_texts"] = headline_with_separator + df["body"]

# Embed the texts

In [None]:
# hand the merged texts to the embedder as a plain Python list and keep the
# result as one NumPy array (row i belongs to the i-th text)
merged_texts = df["merged_texts"].values.tolist()
embeddings = np.array(embedder.transform(merged_texts))

In [None]:
# optional cache: persist the embeddings so they don't have to be re-calculated
# (NumPy appends the ".npy" extension to the file name)
np.save(file="embeddings", arr=embeddings)

# Recommendations using Vector Calculations

In [None]:
from scipy.spatial.distance import cdist

In [None]:
# mean embedding of all articles that were manually marked as interesting
is_interesting = df["__Interesting__MANUAL"] == "yes"
interesting_idxs = df.loc[is_interesting].index
interesting_embeddings = embeddings[interesting_idxs]
interesting_vector_avg = interesting_embeddings.mean(axis=0)

In [None]:
# cosine distance between the averaged "interesting" vector and every article
# that has no manual interesting/not-interesting label yet
has_no_interest_label = df["__Interesting__MANUAL"].isnull()
non_labeled_idxs = df.loc[has_no_interest_label].index
# cdist expects 2-D inputs, so the single query vector becomes a 1 x dim matrix;
# [0] then picks the one resulting row of distances
query_matrix = interesting_vector_avg.reshape(1, -1)
dist_to_unlabeled = cdist(query_matrix, embeddings[non_labeled_idxs], metric="cosine")[0]

In [None]:
# positions within the unlabeled subset, ordered by ascending cosine distance
# (smallest distance to the averaged "interesting" vector first)
sorted_unlabeled_idxs = dist_to_unlabeled.argsort()

# the same order expressed as index labels of the original dataframe
sorted_original_idxs = non_labeled_idxs[sorted_unlabeled_idxs]

In [None]:
# the ten unlabeled articles closest to the averaged "interesting" vector
top_10_recommendations = df.loc[sorted_original_idxs[0:10]]
# display the frame itself: a bare .head() would cut the output to five rows
# and hide half of the recommendations
top_10_recommendations

# Predict Topics covered by the article
There are some problems when it comes to classifying topics in this setting. The most dominant one is that we don't know whether the topics we selected are even all the topics that exist; to find out, we'd have to label every data point and also make sure that no topics come up in the future that weren't in the training data. Second, this data is rather unbalanced. We will not deal with these problems here and continue with our baseline use case.

Instead, we will choose the topics that we want to have classified and that have enough support. We then introduce a "catch-all" class, to which we map all other labels.

We will split the data into a train and a test set, train the model, and then evaluate it very quickly. We will not go into too much detail of the whole pipeline (it takes companies months to make sense of their data and models!) as this is not the aim of this workshop.

## Prepare Train and Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# index labels of every article that already carries a manual topic label
has_topic_label = ~df["__Topic__MANUAL"].isnull()
labeled_idxs = df.loc[has_topic_label].index.tolist()

In [None]:
# number of manually labeled articles per topic, to judge which topics have
# enough support for training
df.loc[labeled_idxs, "__Topic__MANUAL"].value_counts()

In [None]:
# topics with enough labeled examples to train on; the strings are matched
# exactly against the manual topic labels
# NOTE(review): "research  and science" contains two consecutive spaces -
# confirm this is the exact spelling of the label in the data
topics = [
    "big tech",
    "research  and science",
    "library/code",
    "social media",
]

In [None]:
# training frame: manually labeled articles, restricted to the selected topics
# (rows whose manual topic is not in `topics` are left out)
labeled_df = df.loc[labeled_idxs].loc[
    lambda frame: frame["__Topic__MANUAL"].isin(topics)
]

In [None]:
# hold out 20% of the labeled articles for evaluation; the split works on index
# labels so features and targets can both be looked up from them afterwards.
# random_state is fixed so the split (and the reported metrics) are reproducible
# across kernel restarts
train_idx, test_idx = train_test_split(
    labeled_df.index.tolist(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# features are the embedding rows of each split, targets the manual topic labels
# NOTE(review): indexing `embeddings` with dataframe labels assumes df has the
# default 0..n-1 index - confirm
X_train, X_test = embeddings[train_idx], embeddings[test_idx]
y_train = df.loc[train_idx, "__Topic__MANUAL"]
y_test = df.loc[test_idx, "__Topic__MANUAL"]

## Train and evaluate a classifier
We can simulate the classification layer of a typical BERT pipeline with a LogisticRegression sklearn model.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# logistic regression with default settings, trained on the train split only
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# per-class metrics on the held-out test split
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

## Predict the topics

In [None]:
# for the final predictions, re-train on all labeled articles of the selected
# topics (train and test split together)
X_labeled = embeddings[labeled_df.index]
y_labeled = labeled_df["__Topic__MANUAL"]
clf = LogisticRegression().fit(X_labeled, y_labeled)

In [None]:
# everything that is not part of the training frame is treated as unlabeled
remaining_rows = df.drop(labeled_df.index)
unlabeled_df = remaining_rows["__Topic__MANUAL"]
unlabeled_idxs = unlabeled_df.index

# embedding rows of those articles, in the same order as unlabeled_idxs
X = embeddings[unlabeled_idxs]

In [None]:
# class probabilities for every unlabeled article: one row per row of X,
# one column per entry of clf.classes_
probs = clf.predict_proba(X=X)

In [None]:
# keep only predictions the classifier is confident about (probability > 0.75).
# pred_idx are row positions within probs / X (not index labels of df),
# pred_class are column positions within clf.classes_
is_confident = probs > 0.75
pred_idx, pred_class = np.where(is_confident)
# translate the class positions into the topic names
pred_class_text = [clf.classes_[class_position] for class_position in pred_class]

In [None]:
# start from the fallback label; it stays on every row that gets neither a
# manual label nor a confident prediction
df["topic"] = "Unknown"
# manual labels of the selected topics are taken over unchanged
df.loc[labeled_df.index, "topic"] = df.loc[labeled_df.index, "__Topic__MANUAL"]
# pred_idx holds row positions within the unlabeled subset (the rows of X), not
# index labels of df; translate them back through unlabeled_idxs so the
# predictions land on the right articles and never overwrite manual labels
df.loc[unlabeled_idxs[pred_idx], "topic"] = pred_class_text

In [None]:
# how many articles ended up in each topic (including "Unknown")
topic_counts = df["topic"].value_counts()
topic_counts

In [None]:
# write the articles together with their final topic to disk
export_columns = [
    'newsletter', 'date', 'headline', 'body',
    '__Interesting__MANUAL', 'merged_texts', 'topic',
]
# quoting=1 is the value of csv.QUOTE_ALL, i.e. every field is quoted;
# index=False keeps the dataframe index out of the file
df[export_columns].to_csv("output.csv", index=False, quoting=1)