Classifiers as a Sentence Encoder

In [28]:
import pathlib
import numpy as np
import pandas as pd

from tokenwiser.pipeline import make_partial_pipeline, make_partial_union, make_concat
from tokenwiser.textprep import SentencePiecePrep, Identity

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier
from sklearn.base import clone

In [30]:
from rich.progress import Progress
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_score, accuracy_score

class Dataset:
    """Train/validation view over one labelled CSV file.

    The CSV is expected to contain a ``split`` column with the values
    ``'train'`` and ``'valid'`` in addition to the text and label columns
    named by the caller.
    """

    def __init__(self, path, text_col, label_col):
        """Load the CSV and precompute the label set and balanced class weights.

        Args:
            path: Location of the CSV file; handed straight to ``pd.read_csv``.
            text_col: Name of the column that holds the input text.
            label_col: Name of the column that holds the target label.
        """
        dataf = pd.read_csv(path)
        # reset_index() without drop=True keeps the old row index as a column.
        self.train = dataf.loc[lambda d: d['split'] == 'train'].reset_index()
        self.valid = dataf.loc[lambda d: d['split'] == 'valid'].reset_index()
        # The label set is taken from the whole file, not only the train rows.
        self.labels = list(dataf[label_col].unique())
        self.text_col = text_col
        self.label_col = label_col
        # compute_class_weight documents `classes` as an ndarray, so hand it
        # an array rather than the plain list kept on the instance.
        weights = compute_class_weight(
            'balanced',
            classes=np.asarray(self.labels),
            y=self.train[label_col],
        )
        # label -> weight, in the same order as self.labels
        self.label_weights = dict(zip(self.labels, weights))

    def batch(self, n=2_000):
        """Draw ``n`` training rows uniformly at random, with replacement.

        Args:
            n: Number of rows to sample.

        Returns:
            A ``(texts, labels)`` pair of equally long lists.
        """
        indices = np.random.randint(len(self.train), size=n)
        subset = self.train.iloc[indices]
        return list(subset[self.text_col]), list(subset[self.label_col])

    def valid_set(self):
        """Return the full validation split as a ``(texts, labels)`` pair of lists."""
        return list(self.valid[self.text_col]), list(self.valid[self.label_col])

class CaaSE:
    """Classifiers as a Sentence Encoder.

    Trains one linear classifier per registered task on top of a shared text
    featurizer. The class probabilities of all tasks, concatenated, serve as
    the embedding of a piece of text.
    """

    def __init__(self, prep):
        """Store the shared featurizer and start with no registered tasks.

        Args:
            prep: Featurizer exposing ``fit(texts)`` (returning an object with
                ``transform``) and ``transform(texts)``.
        """
        self.prep = prep
        self.datasets = {}  # task name -> Dataset
        self.models = {}  # task name -> classifier for that task

    def add_clf_task(self, path, text_col="text", label_col="label", name=None):
        """Register a classification task backed by a CSV file.

        Args:
            path: CSV file to load into a ``Dataset``.
            text_col: Column holding the input text.
            label_col: Column holding the target label.
            name: Task name; defaults to the file name without its extension.

        Returns:
            ``self``, so that registrations can be chained.
        """
        name = name or pathlib.Path(path).stem
        dataset = Dataset(path, text_col, label_col)
        self.datasets[name] = dataset
        # NOTE(review): newer scikit-learn releases renamed this loss to
        # "log_loss" and later dropped "log"; switch when upgrading.
        self.models[name] = SGDClassifier(loss='log', class_weight=dataset.label_weights)
        return self

    def _featurize(self, texts):
        """Fit the featurizer on ``texts`` and return their transformed features."""
        return self.prep.fit(texts).transform(texts)

    def train(self, epochs=10, batch_size=10_000, log_valid=False):
        """Train every registered task with randomly sampled mini-batches.

        Args:
            epochs: Number of batches drawn (and ``partial_fit`` calls) per task.
            batch_size: Number of rows sampled for each batch.
            log_valid: ``False`` to skip validation, or an integer ``k`` to
                recompute validation accuracy and weighted precision every
                ``k`` epochs and show them in the progress bar.
        """
        for name, dataset in self.datasets.items():
            model = self.models[name]
            labels = dataset.labels
            with Progress() as progress:
                description = f"Training [bold]{name}[/bold]..."
                task = progress.add_task(description, total=epochs)
                for epoch in range(1, epochs + 1):
                    text, y = dataset.batch(n=batch_size)
                    X = self._featurize(text)
                    # partial_fit needs the full label set because a sampled
                    # batch may not contain every class.
                    model.partial_fit(X, y, classes=labels)

                    if log_valid and epoch % log_valid == 0:
                        text_valid, y_valid = dataset.valid_set()
                        X_valid = self.prep.transform(text_valid)
                        preds = model.predict(X_valid)
                        pre_score = np.round(precision_score(y_valid, preds, average='weighted'), 4)
                        acc_score = np.round(accuracy_score(y_valid, preds), 4)
                        # The refreshed description stays in place until the
                        # next validation pass overwrites it.
                        description = f"Training [bold]{name}[/bold] acc={acc_score} precision={pre_score}"

                    progress.update(task, advance=1, description=description + f" epoch={epoch}")

    def parse(self, text):
        """Predict the label of ``text`` for every task.

        Args:
            text: A single string.

        Returns:
            Dict mapping task name to the predicted label.
        """
        # The features do not depend on the task, so compute them once.
        X = self._featurize([text])
        return {name: self.models[name].predict(X)[0] for name in self.datasets}

    def parse_proba(self, text):
        """Predict class probabilities of ``text`` for every task.

        Args:
            text: A single string.

        Returns:
            Dict mapping task name to a ``{class: probability}`` dict.
        """
        # The features do not depend on the task, so compute them once.
        X = self._featurize([text])
        result = {}
        for name in self.datasets:
            model = self.models[name]
            probas = model.predict_proba(X)[0]
            result[name] = dict(zip(model.classes_, probas))
        return result

    def emb_(self, text):
        """Return the flat list of class probabilities of ``text`` over all tasks.

        Values follow task registration order and, within a task, the order
        of the model's ``classes_``.
        """
        # Use this instance rather than a module-level encoder object.
        return [p for probas in self.parse_proba(text).values() for p in probas.values()]

    def transform(self, X):
        """Embed every text in ``X``; returns an array with one row per text."""
        return np.array([self.emb_(x) for x in X])

In [31]:
# Featurizer shared by every task: a text-preparation stage followed by a
# union of hashing vectorizers with different table sizes.
# NOTE(review): presumably make_concat joins the raw text (Identity) with its
# SentencePiece-encoded form -- confirm against the tokenwiser docs.
pipe_prep = make_partial_pipeline(
    make_concat(
        Identity(),
        SentencePiecePrep(model_file="pretrained/en.wiki.bpe.vs5000.model"),
    ),
    make_partial_union(
        HashingVectorizer(n_features=997),
        HashingVectorizer(n_features=998),
        HashingVectorizer(n_features=1000),
        HashingVectorizer(n_features=2000, ngram_range=(1, 3)),
        HashingVectorizer(n_features=2001, ngram_range=(1, 3)),
        # NOTE(review): this entry used to repeat n_features=2001, which only
        # duplicated the previous vectorizer's columns; 2002 is assumed to be
        # the intended size -- confirm.
        HashingVectorizer(n_features=2002, ngram_range=(1, 3)),
    )
)

In [32]:
# Encoder that every task below is registered on.
case_study = CaaSE(prep=pipe_prep)

# Label column names used by the optional google-emotions loop below.
emotions = [
    'admiration', 'remorse', 'sadness', 'surprise', 'neutral',
    'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
    'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
]

# Disabled: one extra task per emotion column.
# for emotion in emotions:
#     case_study.add_clf_task(path="data/google-emotions.csv", label_col=emotion, name=f"go-{emotion}")

# Task name -> CSV file, registered in this order.
task_files = {
    "sentiment": "data/tweet_eval-sentiment.csv",
    "emotion": "data/tweet_eval-emotion.csv",
    "dbpedia": "data/dbpedia_14-None.csv",
    "clinc": "data/clinc_oos-plus.csv",
    "emoji": "data/tweet_eval-emoji.csv",
}
for task_name, csv_path in task_files.items():
    case_study.add_clf_task(path=csv_path, name=task_name)

# Leave the encoder as the cell's last expression so the notebook displays it.
case_study

<__main__.CaaSE at 0x7f5512f6ed50>

In [None]:
%%time

# Fit every registered task on 30 sampled batches, refreshing the validation
# metrics shown in the progress bar every 5 epochs.
case_study.train(epochs=30, log_valid=5)

Output()

Output()

Output()

In [32]:
from whatlies import Embedding, EmbeddingSet
from whatlies.transformers import Pca, Umap

In [33]:
# Keep only the text column of the google-emotions file; labels are not used here.
df = pd.read_csv("data/google-emotions.csv")[['text']]

In [34]:
# Deduplicate a random sample of 1000 texts; each one labels its own embedding.
sampled_texts = df.sample(1000)['text']
words = list(set(sampled_texts))

# Embed all texts in one call; row i of the result belongs to words[i].
vectors = case_study.transform(words)
embset = EmbeddingSet(*[Embedding(w, vec) for w, vec in zip(words, vectors)])

In [35]:
# Apply the Pca(2) transformer to the embeddings and plot them interactively without annotations.
embset.transform(Pca(2)).plot_interactive(annot=False)

In [36]:
# Same plot as above, using the Umap(2) transformer instead of Pca(2).
embset.transform(Umap(2)).plot_interactive(annot=False)