# Train a split classifier
Given two texts, train a classifier to predict whether they belong to the same split.

In [None]:
# autoreload mode 2: re-import edited modules before each cell runs,
# so changes to the project's `models` package are picked up without a kernel restart
%load_ext autoreload
%autoreload 2
# load the dotenv extension and run its magic
# (presumably this is where OPENAI_API_KEY, read further down, comes from — confirm)
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import json
import os
import pickle

import cohere
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
from scipy.stats import loguniform, uniform, ttest_ind, mannwhitneyu
from sentence_transformers import SentenceTransformer
import spacy
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from transformers import pipeline

from models.split_model_train import get_labeled_pairs, \
    predict_using_syntactic_features, syntactic_paragraph_features,\
    predict_using_embeddings, predict_using_pairs, predict_using_features_and_embeddings, \
    predict_using_features_and_greedy_embeddings, predict_using_features_and_ensemble
from models.split_utils import get_mpnet_embedder, get_openai_embedder, get_voyageai_embedder
from models.split_model_eval import evaluate, compare, evaluate_embedder


In [None]:
# notebook configuration

# seed passed to the train/test split
random_state = 42
ngram_size = 1  # not referenced in the cells visible in this notebook

# labeled input data
input_dir = '../data/split/labeled/'
input_filename = '2023-09-23.json'

# trained models are written here, named after the current date (YYYY-MM-DD)
output_dir = '../data/split/model/'
today = datetime.now().date().isoformat()

In [None]:
# avoid warning
# (set before any of the models below are loaded)
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

# spacy
# `parser` is later handed to get_labeled_pairs and predict_using_features_and_ensemble
parser = spacy.load("en_core_web_sm")

# cohere (disabled)
# NOTE(review): get_cohere_embedder is not among this notebook's imports
# cohere_api_key = os.environ['COHERE_KEY']
# co = cohere.Client(cohere_api_key)
# cohere_embedder = get_cohere_embedder(co)

# bert-wiki-paragraphs (disabled)
# NOTE(review): get_bert_wiki_paras_scorer is not among this notebook's imports
# pipe = pipeline("text-classification", model="dennlinger/bert-wiki-paragraphs")
# bert_wiki_paras_scorer = get_bert_wiki_paras_scorer(pipe)

# mpnet
mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
mpnet_embedder = get_mpnet_embedder(mpnet)

# openai
# raises KeyError if OPENAI_API_KEY is not set in the environment
openai.api_key = os.environ['OPENAI_API_KEY']
# result is discarded — presumably a quick check that the key works; TODO confirm
# NOTE(review): `openai.Engine` looks like the pre-1.0 client API — confirm the pinned openai version
openai.Engine.list()
openai_embedder = get_openai_embedder(openai)


In [None]:
import torch

# Release cached GPU memory and report CUDA status.
torch.cuda.empty_cache()
print(torch.cuda.is_available())
if torch.cuda.is_available():
    # get_device_properties(0) raises when no CUDA device is present,
    # so the memory report only runs on machines that actually have a GPU
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

In [None]:
# read labeled data
# JSON is UTF-8 by specification; don't depend on the platform's default text encoding
with open(os.path.join(input_dir, input_filename), encoding='utf-8') as f:
    talk_sections = json.load(f)

## Split the data into train and test sections

In [None]:
# hold out 20% of the labeled sections for testing (seeded via random_state)
train_sections, test_sections = train_test_split(
    talk_sections,
    test_size=0.2,
    random_state=random_state,
)
n_train, n_test = len(train_sections), len(test_sections)
print('train', n_train, 'test', n_test)

## First try various unsupervised approaches

In [None]:
# Baseline: group paragraphs using purely syntactic features, such as whether
# a paragraph is a list item, very short, or a quote.
# The aim is to group (a few) paragraphs that should be grouped,
# while never grouping paragraphs that shouldn't be grouped.
syntactic_predictor = predict_using_syntactic_features(syntactic_paragraph_features)
results = evaluate(train_sections, syntactic_predictor, debug=True)
results['metrics']

### Let's try a few embeddings

In [None]:
# display one training section to eyeball the data
train_sections[1]

In [None]:
def random_embedder(paragraphs):
    """Return one random unit-length vector of size 10 per paragraph.

    The paragraph contents are ignored; only their count matters. This serves
    as a no-information baseline when comparing real embedders.
    """
    def _random_unit_vector():
        # components are drawn uniformly from [0, 1), then scaled to norm 1
        vec = np.random.rand(10)
        return vec / np.linalg.norm(vec)

    return [_random_unit_vector() for _ in paragraphs]

In [None]:
# graph random embeddings (baseline)
pos_sims, neg_sims = evaluate_embedder(
    train_sections, results['predictions'], random_embedder
)

# larger ttest is better, smaller mann is better
t_ttest = ttest_ind(neg_sims, pos_sims, equal_var=False).statistic
t_mann = mannwhitneyu(neg_sims, pos_sims, use_continuity=False).statistic
print(t_ttest, t_mann)

# overlay both distributions using 100 evenly spaced bin edges over [0.0, 1.0]
bin_edges = np.linspace(0.0, 1.0, 100)
plt.hist([pos_sims, neg_sims], bin_edges, label=['split', 'same'])
plt.legend(loc='upper right')
plt.show()

In [None]:
# graph openai embeddings
pos_sims, neg_sims = evaluate_embedder(
    train_sections, results['predictions'], openai_embedder
)

# larger ttest is better, smaller mann is better
t_ttest = ttest_ind(neg_sims, pos_sims, equal_var=False).statistic
t_mann = mannwhitneyu(neg_sims, pos_sims, use_continuity=False).statistic
print(t_ttest, t_mann)

neg_mean = sum(neg_sims) / len(neg_sims)
pos_mean = sum(pos_sims) / len(pos_sims)
print('neg mean', neg_mean, 'pos mean', pos_mean)

# histogram restricted to [0.7, 1.0]
bin_edges = np.linspace(0.7, 1.0, 100)
plt.hist([pos_sims, neg_sims], bin_edges, label=['split', 'same'])
plt.legend(loc='upper right')
plt.show()

In [None]:
# graph mpnet embeddings
pos_sims, neg_sims = evaluate_embedder(
    train_sections, results['predictions'], mpnet_embedder
)

# larger ttest is better, smaller mann is better
t_ttest = ttest_ind(neg_sims, pos_sims, equal_var=False).statistic
t_mann = mannwhitneyu(neg_sims, pos_sims, use_continuity=False).statistic
print(t_ttest, t_mann)

neg_mean = sum(neg_sims) / len(neg_sims)
pos_mean = sum(pos_sims) / len(pos_sims)
print('neg mean', neg_mean, 'pos mean', pos_mean)

# histogram over the full [0.0, 1.0] range
bin_edges = np.linspace(0.0, 1.0, 100)
plt.hist([pos_sims, neg_sims], bin_edges, label=['split', 'same'])
plt.legend(loc='upper right')
plt.show()

In [None]:
# BAAI bge-base-en-v1.5 sentence-transformer model
bge = SentenceTransformer('BAAI/bge-base-en-v1.5')


def bge_embedder(paragraphs):
    """Encode the paragraphs with the BGE model, requesting normalized embeddings."""
    embeddings = bge.encode(paragraphs, normalize_embeddings=True)
    return embeddings

In [None]:
# graph bge embeddings
pos_sims, neg_sims = evaluate_embedder(
    train_sections, results['predictions'], bge_embedder
)

# larger ttest is better, smaller mann is better
t_ttest = ttest_ind(neg_sims, pos_sims, equal_var=False).statistic
t_mann = mannwhitneyu(neg_sims, pos_sims, use_continuity=False).statistic
print(t_ttest, t_mann)

# histogram over the full [0.0, 1.0] range
bin_edges = np.linspace(0.0, 1.0, 100)
plt.hist([pos_sims, neg_sims], bin_edges, label=['split', 'same'])
plt.legend(loc='upper right')
plt.show()

In [None]:
# VoyageAI embedder from the project helper
# (called without arguments — presumably it picks up credentials from the environment; TODO confirm)
voyageai_embedder = get_voyageai_embedder()

In [None]:
# graph voyageai embeddings
pos_sims, neg_sims = evaluate_embedder(
    train_sections, results['predictions'], voyageai_embedder
)

# larger ttest is better, smaller mann is better
t_ttest = ttest_ind(neg_sims, pos_sims, equal_var=False).statistic
t_mann = mannwhitneyu(neg_sims, pos_sims, use_continuity=False).statistic
print(t_ttest, t_mann)

# histogram restricted to [0.7, 1.0]
bin_edges = np.linspace(0.7, 1.0, 100)
plt.hist([pos_sims, neg_sims], bin_edges, label=['split', 'same'])
plt.legend(loc='upper right')
plt.show()

In [None]:
# mean of each distribution from the most recent evaluate_embedder run
neg_mean = sum(neg_sims) / len(neg_sims)
pos_mean = sum(pos_sims) / len(pos_sims)
print('neg mean', neg_mean, 'pos mean', pos_mean)

In [None]:
# re-plot the most recent distributions, restricted to [0.7, 1.0]
bin_edges = np.linspace(0.7, 1.0, 100)
plt.hist([pos_sims, neg_sims], bin_edges, label=['split', 'same'])
plt.legend(loc='upper right')
plt.show()

In [None]:
# this takes a relatively long time and isn't as good as openai or mpnet embeddings

# results = evaluate(train_sections, predict_using_pairs(score_bert_wiki_paras_scorer, 0.75),
#                    debug=True)
# results['metrics']

In [None]:
# OpenAI embeddings, threshold 0.78
split_predictor = predict_using_embeddings(openai_embedder, 0.78)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# OpenAI embeddings, threshold 0.80
split_predictor = predict_using_embeddings(openai_embedder, 0.80)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# OpenAI embeddings, threshold 0.83
split_predictor = predict_using_embeddings(openai_embedder, 0.83)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# VoyageAI embeddings, threshold 0.81
split_predictor = predict_using_embeddings(voyageai_embedder, 0.81)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# VoyageAI embeddings, threshold 0.82
split_predictor = predict_using_embeddings(voyageai_embedder, 0.82)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# VoyageAI embeddings, threshold 0.83
split_predictor = predict_using_embeddings(voyageai_embedder, 0.83)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# VoyageAI embeddings, threshold 0.84
split_predictor = predict_using_embeddings(voyageai_embedder, 0.84)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# VoyageAI embeddings, threshold 0.85
split_predictor = predict_using_embeddings(voyageai_embedder, 0.85)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# VoyageAI embeddings, threshold 0.87
split_predictor = predict_using_embeddings(voyageai_embedder, 0.87)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# this is very expensive and isn't any better than openai or mpnet embeddings

# results = evaluate(train_sections, predict_using_embeddings(cohere_embedder, 3100.0),
#                    debug=True)
# results['metrics']

In [None]:
# MPNet embeddings, threshold 0.425
split_predictor = predict_using_embeddings(mpnet_embedder, 0.425)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

### Try syntactic features followed by embeddings
Use syntactic features to group some of the paragraphs,
then use embeddings to segment the rest.

In [None]:
# syntactic features first, then VoyageAI embeddings with threshold 0.83
split_predictor = predict_using_features_and_embeddings(
    syntactic_paragraph_features,
    voyageai_embedder,
    0.83,
)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# compare the labels with the latest predictions
# (the trailing 1 is presumably the index of the section to show — TODO confirm against compare())
compare(train_sections, results['predictions'], 1)

### Greedy with max characters

In [None]:
# greedy variant: syntactic features + VoyageAI embeddings, threshold 0.83
# (2000 is presumably the max characters per split — confirm against the helper)
split_predictor = predict_using_features_and_greedy_embeddings(
    syntactic_paragraph_features,
    voyageai_embedder,
    0.83,
    2000,
)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# compare the labels with the latest predictions
# (the trailing 1 is presumably the index of the section to show — TODO confirm against compare())
compare(train_sections, results['predictions'], 1)

In [None]:
# greedy variant: syntactic features + OpenAI embeddings, threshold 0.80
# (2000 is presumably the max characters per split — confirm against the helper)
split_predictor = predict_using_features_and_greedy_embeddings(
    syntactic_paragraph_features,
    openai_embedder,
    0.80,
    2000,
)
results = evaluate(train_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# compare the labels with the latest predictions
# (the trailing 1 is presumably the index of the section to show — TODO confirm against compare())
compare(train_sections, results['predictions'], 1)

### Run the two leading approaches on test data

In [None]:
# held-out test data: greedy syntactic features + VoyageAI embeddings, threshold 0.83
split_predictor = predict_using_features_and_greedy_embeddings(
    syntactic_paragraph_features,
    voyageai_embedder,
    0.83,
    2000,
)
results = evaluate(test_sections, split_predictor, debug=True)
results['metrics']

In [None]:
# held-out test data: greedy syntactic features + OpenAI embeddings, threshold 0.80
split_predictor = predict_using_features_and_greedy_embeddings(
    syntactic_paragraph_features,
    openai_embedder,
    0.80,
    2000,
)
results = evaluate(test_sections, split_predictor, debug=True)
results['metrics']

### Split paragraphs

In [None]:
# build the predictor used in the following cells: greedy syntactic features + VoyageAI embeddings
# (same arguments as the VoyageAI run on the test data above)
predict = predict_using_features_and_greedy_embeddings(syntactic_paragraph_features, voyageai_embedder, 0.83, 2000)

In [None]:
# display the first test section; it is the worked example in the cells that follow
test_sections[0]

In [None]:
# rebuild the section's text by joining its paragraph texts with blank lines
paragraph_texts = (paragraph['text'] for paragraph in test_sections[0]['paragraphs'])
page_content = "\n\n".join(paragraph_texts)

In [None]:
from models.split_utils import get_paragraph_texts_and_ids

# break the page content back into (text, id) pairs and show how many there are
paragraph_texts_and_ids = get_paragraph_texts_and_ids(page_content)
len(paragraph_texts_and_ids)

In [None]:
# list every paragraph as "<id> <text>"
for paragraph_text, paragraph_id in paragraph_texts_and_ids:
    print(paragraph_id, paragraph_text)

In [None]:
# keep only the text of each (text, id) pair
paragraphs = [text_and_id[0] for text_and_id in paragraph_texts_and_ids]

In [None]:
# run the predictor over the paragraph texts
splits = predict(paragraphs)

In [None]:
# predicted splits for the example section
print(splits)

In [None]:
# actual: the 'split' label stored on each paragraph of the same test section
print([paragraph['split'] for paragraph in test_sections[0]['paragraphs']])

In [None]:
from models.split_utils import get_split_texts_and_ids

# combine the paragraph (text, id) pairs with the predicted splits
split_texts_and_ids = get_split_texts_and_ids(paragraph_texts_and_ids, splits)

In [None]:
# print each split as "<id> <text>", followed by a separator line
# (`_id` rather than `id`: the loop variable lives in the notebook's global
# namespace and would otherwise shadow the builtin id() for every later cell)
for text, _id in split_texts_and_ids:
    print(_id, text)
    print('*****')

## Train a custom segmentation classifier
Use scores from the top segmentation approaches above,
as well as token and sentence counts, to train an ensemble classifier.

**This is a lot of effort and isn't as good as using VoyageAI with a threshold of 0.83**

In [None]:
# build the labeled training rows (presumably one per paragraph pair) and load them into a DataFrame
# NOTE(review): the second embedder here is voyageai_embedder, but the ensemble
# evaluation cells and the final all-data fit pass mpnet_embedder in that position —
# confirm which embedder the classifier's features are meant to come from
pairs = get_labeled_pairs(train_sections, openai_embedder, voyageai_embedder, parser, syntactic_paragraph_features)
pair_df = pd.DataFrame(pairs)

In [None]:
# preview the first 10 rows
pair_df.head(10)

In [None]:
# features are every column except the label; the label is the target
y = pair_df['label']
X = pair_df.drop(columns=['label'])

# standardized copy of the features (column names kept) for the hyperparameter search
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)

In [None]:
# tune hyperparameters: LR or SVM
# The notebook's random_state seeds both the saga solver and the parameter
# sampling, so repeated runs of this cell give the same candidates and scores.

# Logistic Regression (elastic-net penalty; l1_ratio and C are searched)
clf = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000,
                         random_state=random_state)
params = {
    'l1_ratio': uniform(0.0, 1.0),
    'C': loguniform(1e-2, 1e3),
}

# SVM
# clf = LinearSVC(dual=False, max_iter=10000)
# params = {
#     'C': loguniform(1e-6, 1e1),
# }

# 100 sampled candidates, 10-fold CV, scored by F1.
# refit=False: only the best parameters are needed here; the final model is fit separately.
search = RandomizedSearchCV(
    clf,
    params,
    n_iter=100,
    scoring='f1',
    refit=False,
    verbose=1,
    n_jobs=8,
    cv=10,
    random_state=random_state)
search.fit(X_scaled, y)

In [None]:
# best parameter combination found by the search and its mean cross-validated F1
print(search.best_params_)
print(search.best_score_)

In [None]:
# train clf over all training data

# wrap SVM in calibrated classifier CV to get probabilities
# svm = LinearSVC(dual=False, max_iter=10000, **search.best_params_)
# clf = CalibratedClassifierCV(svm, cv=10)
# clf.fit(X, y)

# logistic regression configured with the best parameters from the search;
# scaling sits inside the pipeline, so it is fit on the raw (unscaled) features
best_params = search.best_params_
final_lr = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000, **best_params)
clf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', final_lr),
])

clf.fit(X, y)

In [None]:
# inspect the fitted model: input feature names, then the logistic regression's
# coefficients and intercept (both apply to the scaled features)
print(clf.feature_names_in_)
print(clf['classifier'].coef_)
print(clf['classifier'].intercept_)

In [None]:
# evaluate ensemble on training data
ensemble_predictor = predict_using_features_and_ensemble(
    syntactic_paragraph_features,
    openai_embedder,
    mpnet_embedder,
    parser,
    clf,
    0.55,
)
results = evaluate(train_sections, ensemble_predictor, debug=True)
results['metrics']

In [None]:
# evaluate ensemble on test data
ensemble_predictor = predict_using_features_and_ensemble(
    syntactic_paragraph_features,
    openai_embedder,
    mpnet_embedder,
    parser,
    clf,
    0.55,
)
results = evaluate(test_sections, ensemble_predictor, debug=True)
results['metrics']

## Train over all data and save

In [None]:
# build labeled pairs from every section (train + test) for the final fit
# NOTE(review): the training cell earlier builds its pairs with voyageai_embedder as the
# second embedder, while this call uses mpnet_embedder — confirm which one is intended
all_pairs = get_labeled_pairs(talk_sections, openai_embedder, mpnet_embedder, parser, syntactic_paragraph_features)
# use all_pairs here: the previous pd.DataFrame(pairs) silently reused the
# train-only pairs, so the "all data" model never saw the held-out sections
all_pair_df = pd.DataFrame(all_pairs)

In [None]:
# features are every column except the label; the label is the target
y = all_pair_df['label']
X = all_pair_df.drop(columns=['label'])

In [None]:
# refit the same pipeline (scaler + logistic regression) on the X and y defined just above
clf.fit(X, y)

In [None]:
# inspect the refitted model: input feature names, then the logistic regression's
# coefficients and intercept (both apply to the scaled features)
print(clf.feature_names_in_)
print(clf['classifier'].coef_)
print(clf['classifier'].intercept_)

In [None]:
# save ensemble clf as <output_dir>/<YYYY-MM-DD>.pkl
# create the model directory first; open(..., 'wb') fails if it does not exist
os.makedirs(output_dir, exist_ok=True)
filename = os.path.join(output_dir, f"{today}.pkl")
with open(filename, 'wb') as f:
    pickle.dump(clf, f)

In [None]:
# evaluate ensemble on all data
ensemble_predictor = predict_using_features_and_ensemble(
    syntactic_paragraph_features,
    openai_embedder,
    mpnet_embedder,
    parser,
    clf,
    0.55,
)
results = evaluate(talk_sections, ensemble_predictor, debug=True)
results['metrics']