#### Basic Setup

In [None]:
# install libraries
# I assume that standard libraries such as torch, pandas, numpy etc are installed
!pip install -q transformers
!pip install -q sentence-transformers
# if no gpu is available use install faiss-cpu instead
!pip install -q faiss-gpu

In [None]:
from inference_model import InferenceModel
from utils.misc import get_random_sents, store
from utils.metrics import polarity_score as PS, semantic_similarity_score as SSS
from utils.example_generation import generate_training_examples
from utils.visualizations import cosine_pca_2d

In [None]:
# --- dataset size ---
# Upper bound on the number of sentences used in the experiments; if it exceeds
# the size of the dataset, the entire dataset is used.
N_SENTENCES = 100_000

# --- training example generation ---
# Number of sentences sampled as origins for example generation.
N_TRAIN_SENTS = 5_000
# Number of neighbours queried per origin sentence during example generation.
N_TRAIN_QUERY = 64

# --- evaluation ---
# Number of neighbours queried per sentence when a model is evaluated.
K_EVAL = 64

#### Load the dataset and create the sarcasm lookup

In [None]:
# load the dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

headlines_df = pd.read_csv("data/dataset.csv")
# one plain tuple per row; the unpacking further down requires exactly two fields per row
lst = list(headlines_df.itertuples(index=False, name=None))

# reproducible shuffle: fix the global numpy seed right before shuffling
np.random.seed(0)
np.random.shuffle(lst)

# keep at most N_SENTENCES rows (the slice is a no-op for smaller datasets)
lst = lst[:N_SENTENCES]
sents, labels = zip(*lst)

X_train, X_test, y_train, y_test = train_test_split(sents, labels, test_size=0.5, random_state=42)

# rebuild global lists for lookups: the train part comes first, the test part starts at test_offset
test_offset = len(X_train)
sents = X_train + X_test
labels = y_train + y_test

# class counts per split and overall
n_train, n_test, n_total = len(y_train), len(y_test), len(labels)
train_sarc, test_sarc, total_sarc = np.sum(y_train), np.sum(y_test), np.sum(labels)
train_nonsarc = n_train - train_sarc
test_nonsarc = n_test - test_sarc
sarc_perc = total_sarc / n_total * 100
nonsarc_perc = 100 - sarc_perc

print("Data Overview:")

# tab-separated overview table; the first and last row start with an empty label column
print("\t", "train", "test", "total", "in %", sep="\t")
print("sarcastic", train_sarc, test_sarc, total_sarc, f"{sarc_perc:.2f}", sep="\t")
print("non-sarcastic", train_nonsarc, test_nonsarc, n_total - total_sarc, f"{nonsarc_perc:.2f}", sep="\t")
print("\t", n_train, n_test, n_total, "100.0", sep="\t")
print("")
print(f"Sarcasm % in train data: {(train_sarc / n_train * 100):.2f}")
print(f"Sarcasm % in test data: {(test_sarc / n_test * 100):.2f}")

In [None]:
# create lookups

# sentence -> position in the global `sents` list (for duplicated sentences the last position wins)
sent_to_index = {}
for index, sent in enumerate(sents):
    sent_to_index[sent] = index

def is_sarcastic(sent_or_index):
    """
        Return the label stored in the global `labels` list for a sentence.

        Args:
            sent_or_index: either the sentence itself (str) or its position in the
                global list (anything `int()` accepts, e.g. int or numpy.int64)

        Raises:
            KeyError: if a sentence is passed that is not part of `sent_to_index`
    """
    if isinstance(sent_or_index, str):
        index = sent_to_index[sent_or_index]
    else:
        index = int(sent_or_index)
    return labels[index]

#### Load reference model

In [None]:
# the baseline sentence embedding model (all-MiniLM-L6-v2)
# the same checkpoint is the starting point for each fine-tuning attempt;
# this instance stays untouched and backs the reference inference models
from sentence_transformers import SentenceTransformer
standard_miniLM = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# the baseline inference models: both wrap the untouched baseline embedding model,
# one is built over the train sentences and one over the test sentences
reference_model_train = InferenceModel(
    name="Reference Model (train)",
    model=standard_miniLM,
    sentences=X_train,
)
reference_model_test = InferenceModel(
    name="Reference Model (test)",
    model=standard_miniLM,
    sentences=X_test,
)

#### Defining the models

In [None]:
# define models with their respective loss functions

from sentence_transformers.losses import TripletLoss, TripletDistanceMetric, \
    MultipleNegativesRankingLoss, ContrastiveLoss, OnlineContrastiveLoss

# One entry per model variant that is fine-tuned and evaluated. Keys of an entry:
#   "name":                  unique display name, also used as dictionary key later on
#   "loss":                  the loss class, instantiated with the baseline model
#   "loss_params":           (optional) keyword arguments forwarded to the loss constructor
#   "train_example_dropout": dropout value handed to the training example generation
emb_models_setup = [
    {
        "name": "Triplet(lambda=5)",
        "loss": TripletLoss,
        "loss_params": {
            "distance_metric": TripletDistanceMetric.COSINE,
            "triplet_margin": 5 # the default value
        },
        "train_example_dropout": 0.98,
    },
    {
        "name": "Triplet(lambda=2)",
        "loss": TripletLoss,
        "loss_params": {
            "distance_metric": TripletDistanceMetric.COSINE,
            "triplet_margin": 2
        },
        "train_example_dropout": 0.98,
    },
    {
        "name": "Triplet(lambda=0.5)",
        "loss": TripletLoss,
        "loss_params": {
            "distance_metric": TripletDistanceMetric.COSINE,
            "triplet_margin": 0.5
        },
        "train_example_dropout": 0.98,
    },
    {
        "name": "Triplet(lambda=0.1)",
        "loss": TripletLoss,
        "loss_params": {
            "distance_metric": TripletDistanceMetric.COSINE,
            "triplet_margin": 0.1
        },
        "train_example_dropout": 0.98,
    },
    {
        "name": "Triplet(lambda=0.05)",
        "loss": TripletLoss,
        # must be "loss_params" like in every other entry: any other key is never read,
        # which would silently train this model with the default loss arguments
        "loss_params": {
            "distance_metric": TripletDistanceMetric.COSINE,
            "triplet_margin": 0.05
        },
        "train_example_dropout": 0.98,
    },
    {
        "name": "Triplet(lambda=0.01)",
        "loss": TripletLoss,
        "loss_params": {
            "distance_metric": TripletDistanceMetric.COSINE,
            "triplet_margin": 0.01
        },
        "train_example_dropout": 0.98,
    },
    {
        # no "loss_params": the loss is built with its default arguments
        "name": "MultipleNegatives",
        "loss": MultipleNegativesRankingLoss,
        "train_example_dropout": 0.7
    },
    {
        "name": "Contrastive(lambda=0.5)",
        "loss": ContrastiveLoss,
        "loss_params": {
            "margin": 0.5 # the default value
        },
        "train_example_dropout": 0.8,
    },
    {
        "name": "Contrastive(lambda=0.2)",
        "loss": ContrastiveLoss,
        "loss_params": {
            "margin": 0.2
        },
        "train_example_dropout": 0.8,
    },
    {
        "name": "OnlineContrastive(lambda=0.5)",
        "loss": OnlineContrastiveLoss,
        "loss_params": {
            "margin": 0.5 # the default value
        },
        "train_example_dropout": 0.8,
    }
]

In [None]:
# prepare embedding models by initializing with baseline model and setup of loss functions
import warnings

# the only keys of a setup entry that are read below
_KNOWN_SETUP_KEYS = {"name", "loss", "loss_params", "train_example_dropout"}

def get_baseline():
    """Load a fresh instance of the baseline embedding model, so that no two setups share a model object."""
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

emb_base_models = {}
for model in emb_models_setup:
    # a misspelled key (e.g. for the loss parameters) is never read, so the loss would be
    # built with its default arguments without any notice -> make that visible
    unknown_keys = sorted(set(model) - _KNOWN_SETUP_KEYS)
    if unknown_keys:
        warnings.warn(
            f"Setup entry '{model.get('name')}' contains unrecognized keys {unknown_keys}; they are ignored."
        )

    base_model = get_baseline()

    # the loss wraps the model it is supposed to train; "loss_params" is optional
    loss_params = model.get("loss_params", {})
    loss = model["loss"](model=base_model, **loss_params)

    dropout = model["train_example_dropout"]
    emb_base_models[model["name"]] = {
        "loss": loss,
        "model": base_model,
        "train_examples": None,  # filled in by the training data generation
        "train_example_dropout": dropout
    }

In [None]:
# names of the models that take part in training and evaluation
# (comment out a line to leave that model out)
training_selection = [
    'Triplet(lambda=5)',
    'Triplet(lambda=2)',
    'Triplet(lambda=0.5)',
    'Triplet(lambda=0.1)',
    'Triplet(lambda=0.05)',
    'Triplet(lambda=0.01)',
    'MultipleNegatives',
    'Contrastive(lambda=0.5)',
    'Contrastive(lambda=0.2)',
    'OnlineContrastive(lambda=0.5)'
]

# keep the order of emb_base_models and drop everything that was not selected
emb_models = {}
for name, setup in emb_base_models.items():
    if name in training_selection:
        emb_models[name] = setup

f"Training and Evaluation of {len(emb_models.items())} embeddings -> indexing models"

#### Helper functions for the main train/eval loop and the loop's implementation

In [None]:
from tqdm.notebook import tqdm

def split_similar_sents_by_polarity(sents, n_query=64):
    """
        Retrieve the neighbours of every input sentence and split them by class.

        For each sentence in `sents`, `n_query` neighbours are queried from
        `reference_model_train` and partitioned with `is_sarcastic` into sarcastic and
        non-sarcastic ones. Neighbours equal to the query sentence itself are dropped.

        Args:
            sents: sequence of sentences to fetch neighbours for
            n_query: number of neighbours requested per sentence

        Returns:
            a list with one dictionary per input sentence (in input order) with the keys
            "sentence", "label", "sarc" and "non_sarc"; empty if `sents` is empty
    """
    n = len(sents)
    if n == 0:
        # nothing to query, and np.concatenate would fail on an empty list of batches
        return []

    batch_size = 128

    # fetch similar sentences batch-wise; stepping by batch_size never yields an empty
    # trailing batch, even if n is an exact multiple of batch_size
    batch_results = []
    for start in tqdm(range(0, n, batch_size), desc="Fetch similar sentences", leave=False):
        batch = sents[start:start + batch_size]
        batch_results.append(reference_model_train.query(batch, k=n_query))

    similar_sents = np.concatenate(batch_results)

    # split by polarity
    split_data = []
    for i in tqdm(range(n), desc="Split by class", leave=False):
        sentence = sents[i]

        sarc, non_sarc = [], []
        for neighbour in similar_sents[i]:
            # make sure that we do not include the sentence itself in the retrieved sentences
            if neighbour != sentence:
                (sarc if is_sarcastic(neighbour) else non_sarc).append(neighbour)

        split_data.append({
            "sentence": sentence,
            "label": is_sarcastic(sentence),
            "sarc": sarc,
            "non_sarc": non_sarc
        })

    return split_data

In [None]:
def generate_training_data(emb_models, num_examples=N_TRAIN_SENTS, n_query=N_TRAIN_QUERY, random_seed=None):
    """
        Generate fresh training examples for every model in `emb_models` (in place).

        Samples `num_examples` origin sentences from `X_train`, fetches their neighbours
        split by class and lets `generate_training_examples` build the examples per model.
        The result is stored under the "train_examples" key of each model's entry.

        Args:
            emb_models: dictionary mapping model name -> model entry (a dictionary)
            num_examples: number of origin sentences to sample
            n_query: number of neighbours queried per origin sentence
            random_seed: if it is an int, the global numpy seed is set to it before
                sampling; it is forwarded to `generate_training_examples` in any case
    """
    if isinstance(random_seed, int):
        np.random.seed(random_seed)

    origin_sents = get_random_sents(X_train, num_examples)
    neighbours_by_class = split_similar_sents_by_polarity(origin_sents, n_query=n_query)

    for model_name in emb_models:
        entry = emb_models[model_name]
        # entries without an explicit dropout use 0
        examples = generate_training_examples(
            neighbours_by_class,
            model_name,
            dropout=entry.get("train_example_dropout", 0),
            random_seed=random_seed,
        )
        entry["train_examples"] = examples

        print(f"Model '{model_name}' now has {len(examples)} training examples.")

In [None]:
from torch.utils.data import DataLoader

def train_models(emb_models, n_epochs):
    """
        Train every model in `emb_models` on its current training examples.

        Args:
            emb_models: dictionary mapping model name -> entry with the keys
                "train_examples", "loss" and "model"
            n_epochs: number of epochs passed to each model's `fit` call
    """
    for model_name in emb_models:
        entry = emb_models[model_name]
        print(f"Training model '{model_name}'")

        # a single training objective: shuffled batches of 64 examples paired with the model's loss
        objective = (
            DataLoader(entry["train_examples"], shuffle=True, batch_size=64),
            entry["loss"],
        )
        entry["model"].fit(train_objectives=[objective], epochs=n_epochs)

In [None]:
def evaluate_model_on_sentence(sentence, model, reference_model, k=K_EVAL, use_discounting=False):
    """
        Score the suggestions `model` retrieves for a single sentence.

        Args:
            sentence: the query sentence
            model: the model that is queried for `k` suggestions
            reference_model: handed to the semantic similarity score together with the suggestions
            k: number of suggestions to query
            use_discounting: forwarded to both score functions

        Returns:
            tuple (polarity_score, similarity_score)
    """
    y = is_sarcastic(sentence)

    # the query sentence itself does not count as a suggestion
    retrieved = model.query(sentence, k=k)
    model_suggestions = list(filter(lambda s: s != sentence, retrieved))
    ys_model = list(map(is_sarcastic, model_suggestions))

    return (
        PS(y, ys_model, use_discounting),
        SSS(sentence, model_suggestions, reference_model, use_discounting),
    )

In [None]:
def evaluate_model(model, sents, k=K_EVAL, use_discounting=False):
    """
        Evaluate `model` on a list of sentences.

        Args:
            model: the model to evaluate
            sents: the sentences used as queries
            k: number of suggestions queried per sentence
            use_discounting: forwarded to the per-sentence evaluation

        Returns:
            tuple (polarity_score, similarity_score, model_score), each the mean over all
            sentences; `similarity_score` is measured with `reference_model_test`,
            `model_score` with the evaluated model itself
    """
    def _mean_scores(reference):
        # mean polarity and mean similarity over all sentences, similarity judged by `reference`
        per_sentence = [
            evaluate_model_on_sentence(sent, model, reference, k=k, use_discounting=use_discounting)
            for sent in sents
        ]
        polarity_column, similarity_column = zip(*per_sentence)
        return np.mean(polarity_column), np.mean(similarity_column)

    polarity_score, similarity_score = _mean_scores(reference_model_test)

    # "overfitting" indicator: how similar does the model itself consider its suggestions?
    # A high value combined with a low similarity_score indicates that the model
    # only tries to satisfy the loss function and no longer cares about similarity.
    _, model_score = _mean_scores(model)

    return polarity_score, similarity_score, model_score

In [None]:
def evaluate_baseline(test_sents, k, use_discounting):
    """
        Evaluate the reference model (`reference_model_test`) against itself.

        Args:
            test_sents: the sentences used as queries
            k: number of suggestions queried per sentence
            use_discounting: forwarded to the per-sentence evaluation

        Returns:
            dictionary with the mean "polarity_score" and "similarity_score"
    """
    per_sentence = [
        evaluate_model_on_sentence(
            sent, reference_model_test, reference_model_test, k=k, use_discounting=use_discounting
        )
        for sent in test_sents
    ]
    polarity_column, similarity_column = zip(*per_sentence)

    return {
        "polarity_score": np.mean(polarity_column),
        "similarity_score": np.mean(similarity_column)
    }

In [None]:
def get_raw_snapshot(model, test_sents, k):
    """
        Record what `model` suggests for every test sentence, together with 2D projections.

        Args:
            model: the model that is queried and used for embedding
            test_sents: the sentences used as queries
            k: number of suggestions queried per sentence

        Returns:
            a list with one dictionary per test sentence with the keys "sentence",
            "embedding_2d", "suggestions" and "suggestion_embeddings_2d"
    """
    # all queries are issued up front, the embedding happens afterwards
    model_suggestions = [model.query(sent, k=k) for sent in test_sents]

    raw_data = []
    for sent, suggestions in zip(test_sents, model_suggestions):
        emb = model.embed(sent)
        suggestion_embs = model.embed(suggestions)

        # stack the suggestions first and the query sentence last, then reduce everything
        # to 2 dimensions at once so that the stored snapshot takes less disk space
        stacked = np.concatenate((suggestion_embs, [emb]), axis=0)
        projected = cosine_pca_2d(stacked)

        snapshot = {
            "sentence": sent,
            "embedding_2d": projected[-1].tolist(),
            "suggestions": suggestions,
            "suggestion_embeddings_2d": projected[:-1].tolist()
        }
        raw_data.append(snapshot)

    return raw_data

In [None]:
# The main train/evaluation loop is implemented in this function

def evaluation(
      emb_models, 
      iters: int = 10, 
      measure_every: int = 1, 
      use_discounting=False, 
      initial_train_data = None,
      new_train_data_per_iter=False
    ):
    """
        Run the main train/eval loop and record the metrics of every model.

        Each iteration optionally generates training data, trains all models for
        `measure_every` epochs, rebuilds their inference models over `X_test` and
        scores them on 100 test sentences sampled once with a fixed seed.

        Args:
            emb_models: dictionary mapping model name -> model entry
            iters: number of train/eval iterations
            measure_every: number of epochs trained per iteration
            use_discounting: forwarded to all score computations
            initial_train_data: only checked for truthiness; if truthy, no training data
                is generated before the first iteration (presumably the entries are
                expected to carry training examples already -- TODO confirm)
            new_train_data_per_iter: if truthy, training data is regenerated in every
                iteration after the first one

        Returns:
            tuple (baseline_metrics, model_records); `model_records` maps each model
            name to a list of records, starting with the baseline record for epoch 0
    """
    print(f"Evaluation of {len(emb_models)} embeddings (and their resulting inference models) started.")

    test_sents = get_random_sents(X_test, n=100, random_seed=0)
    k = K_EVAL

    # baseline performance, shared as the epoch-0 record of every model
    baseline_metrics = evaluate_baseline(test_sents, k, use_discounting)
    baseline_raw_data = get_raw_snapshot(reference_model_test, test_sents, k)

    model_records = {}
    for model_name in emb_models.keys():
        model_records[model_name] = [{
            "epoch": 0,
            "polarity_score": baseline_metrics["polarity_score"],
            "similarity_score": baseline_metrics["similarity_score"],
            "overfitting_indicator": None,
            "raw_data": baseline_raw_data
        }]

    banner = "#"*50

    print("Starting train/eval loop")
    for i in tqdm(range(iters), desc="Train/Eval iterations"):
        if i > 0:
            print("\n")
        print(banner)
        print(f"### Evaluation Iteration: {i+1}")
        print(banner)

        # first iteration: generate data unless initial data was announced;
        # later iterations: generate data only if requested per iteration
        needs_new_data = (not initial_train_data) if i == 0 else new_train_data_per_iter
        if needs_new_data:
            print("Generating training data.")
            generate_training_data(emb_models, random_seed=i)

        # train the models for "measure_every" epochs
        print("Training models")
        train_models(emb_models, n_epochs=measure_every)

        # build all resulting inference models before any of them is evaluated
        print("Build inference models.")
        inference_models = {}
        for model_name, params in emb_models.items():
            inference_models[model_name] = InferenceModel(name=model_name, model=params["model"], sentences=X_test)

        # evaluate the trained models
        epoch = (i+1) * measure_every

        print("@"*10, f"model scoring after {epoch} epochs", "@"*10)
        for model_name, model in tqdm(inference_models.items(), leave=False, desc="Evaluate models"):
            print(f"Model '{model_name}':")

            # compute scores
            polarity_score, similarity_score, model_score = evaluate_model(model, test_sents, k=k, use_discounting=use_discounting)
            raw_data = get_raw_snapshot(model, test_sents, k)

            record = {
                "epoch": epoch,
                "polarity_score": polarity_score,
                "similarity_score": similarity_score,
                "overfitting_indicator": model_score,
                "raw_data": raw_data
            }
            model_records[model_name].append(record)

            print("=== Computed Metrics ===")
            print(f"Polarity Score: {polarity_score:.4f}")
            print(f"Similarity Score: {similarity_score:.4f}")
            print(f"Overfitting Indicator: {model_score:.4f} compared to {baseline_metrics['similarity_score']:.4f} in reference model")

    return baseline_metrics, model_records

#### Evaluate models

In [None]:
# number of train/eval iterations
N_ITERS = 10
# number of epochs the models are trained per iteration (i.e. per set of examples)
MEASURE_EVERY = 1

# fresh training data in every iteration, discounted scores
evaluation_settings = dict(
    emb_models=emb_models,
    iters=N_ITERS,
    measure_every=MEASURE_EVERY,
    use_discounting=True,
    new_train_data_per_iter=True,
)
baseline_metrics, model_records = evaluation(**evaluation_settings)

In [None]:
from pathlib import Path

# id that prefixes the stored files of this evaluation run
# NOTE(review): the id is hard-coded -- change it per run, otherwise the same file names are used again
eval_id = "0001"

# make sure the output folder exists before anything is stored
Path("./records").mkdir(parents=True, exist_ok=True)

store(f"./records/{eval_id}_baseline_metrics.json", baseline_metrics)
store(f"./records/{eval_id}_model_records.json", model_records)