# TASK CLASSIFICATION
* bring answers to the same task closer to each other
* bring answers to different tasks further away from each other

### evaluation
1. Classification (assign a response to the closest task centroid)
    * [x] classification report
    * [x] average distance to the true task centroid
    * [x] average distance to the other task centroids
2. Vector space properties (organize all responses train+test and get an average)
    * [x] Average centroid distance between the task centroids
    * [x] Calinski-Harabasz cluster score for tasks

# LOADING LIBRARIES

In [3]:
import eval_utils
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn import metrics
from sentence_transformers import SentenceTransformer, losses, InputExample

2023-05-22 07:32:57.037422: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
# Sanity check: load the pre-trained Finnish BERT checkpoint as a SentenceTransformer
# and let the notebook display the resulting module stack. The instance is not
# assigned to a name, so it is only used for this display; the training loop
# loads its own copies of the model.
SentenceTransformer("TurkuNLP/bert-base-finnish-cased-v1")

Some weights of the model checkpoint at /home/voskobe1/.cache/torch/sentence_transformers/TurkuNLP_bert-base-finnish-cased-v1 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

# SETTING UP THE SAMPLING FUNCTION

In [None]:
def make_cls_pairs_hard(dataset_df, label_column, embed_column, transcript_column, n=1, tasks_to_drop=None, random_state=None):
    """
    Creates a list of InputExamples with positive and hard negative pairs for contrastive learning.

    For every row of the (filtered) dataset:
      * one positive pair is built with a randomly drawn *other* row that has the same label
        (skipped when no such row exists);
      * up to ``n`` negative pairs are built with the rows of a different label whose
        embeddings have the smallest cosine distance to the row's own embedding
        ("hard" negatives).

    Parameters
    ----------
    dataset_df : pd.DataFrame
        Dataframe containing the dataset. It is not modified.
    label_column : str
        Name of the column containing the class labels.
    embed_column : str
        Name of the column containing the embeddings (one vector per row).
    transcript_column : str
        Name of the column containing the transcripts.
    n : int, optional, default: 1
        Number of hard negative pairs to be selected for each sample.
    tasks_to_drop : list, optional, default: None
        Values of the ``'task'`` column to exclude before pairs are built.
        ``None`` (or an empty list) keeps every row. Note that the filter is always
        applied to the ``'task'`` column, independently of ``label_column``.
    random_state : int, optional, default: None
        Seed for the positive-pair sampling. ``None`` keeps the previous behaviour
        (pandas draws from NumPy's global random state).

    Returns
    -------
    train_examples : list
        List of InputExamples: all positive pairs (label=1) followed by all
        negative pairs (label=0).
    """
    excluded_tasks = tasks_to_drop if tasks_to_drop is not None else []

    # drop=True keeps a clean positional index without adding a stray 'index' column.
    df = dataset_df[~dataset_df['task'].isin(excluded_tasks)].reset_index(drop=True)
    if len(df) == 0:
        return []

    # One shared generator so that consecutive draws differ but the whole run is reproducible.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Loop-invariant views of the data, extracted once instead of once per row.
    labels = df[label_column]
    transcripts = df[transcript_column].to_numpy()
    embeds = np.vstack(df[embed_column].tolist())
    positions = np.arange(len(df))

    all_positive_pairs = []
    all_negative_pairs = []

    for pos in positions:
        sample = transcripts[pos]
        same_label = (labels == labels.iloc[pos]).to_numpy()

        # random positive pair: same label, but never the sample itself
        pos_mask = same_label & (positions != pos)
        if pos_mask.any():
            pos_sample = df.loc[pos_mask, transcript_column].sample(n=1, random_state=rng).iloc[0]
            all_positive_pairs.append((sample, pos_sample))

        # hard negative pairs: rows with another label that lie closest to the sample
        neg_mask = ~same_label
        if neg_mask.any():
            neg_embeds = embeds[neg_mask]
            anchor = np.tile(embeds[pos], (len(neg_embeds), 1))
            neg_distances = metrics.pairwise.paired_cosine_distances(neg_embeds, anchor)

            closest_negs = np.argsort(neg_distances)[:n]  # positions of the n closest negatives
            for neg_sample in transcripts[neg_mask][closest_negs]:
                all_negative_pairs.append((sample, neg_sample))

    train_examples = []
    for anchor_text, positive_text in all_positive_pairs:
        train_examples.append(InputExample(texts=[anchor_text, positive_text], label=1))
    for anchor_text, negative_text in all_negative_pairs:
        train_examples.append(InputExample(texts=[anchor_text, negative_text], label=0))

    return train_examples

# THE TRAINING LOOP

In [None]:
# --- experiment configuration -------------------------------------------
model_name = "TurkuNLP/bert-base-finnish-cased-v1"  # pre-trained checkpoint that gets fine-tuned
model_folder = "FIN_TASK_MODELS_SIAM"  # output folder for the saved models
num_epochs = 5
tasks_to_drop = []  # tasks left out of training (they are still part of the test folds)

# --- data ----------------------------------------------------------------
df = pd.read_csv("../../data/finnish_average.csv")

In [None]:
# One row per test sample, one column of predictions per evaluated model.
results_df = pd.DataFrame()

# Accumulators for the untrained (baseline) model, filled fold by fold.
all_predictions_pre, all_true, all_samples = [], [], []
all_true_dist_pre = all_other_dist_pre = 0

In [None]:
# Cross-validated fine-tuning.
# For every epoch and every value of df['split'], one model is trained on all other
# splits and evaluated on the held-out split. Epoch 0 starts from the pre-trained
# checkpoint (and also scores that untrained model as a baseline); every later epoch
# continues from the checkpoint the same split saved in the previous epoch.
splits = df['split'].unique()
n_folds = len(splits)  # used to turn the per-fold distance sums into averages

for e in range(num_epochs):
    
    print("epoch "+str(e))
    # results per training fold
    all_predictions = []
    all_true_dist = 0
    all_other_dist = 0
    
    for split in splits:
        print("----------------------------")
        print("split {}".format(split))
        #===========================================
        # SET UP
        # make saving path
        model_path = model_folder+"/epoch{}_split{}".format(e,split)

        train_df = df[df['split']!=split].reset_index(drop=True)
        test_df = df[df['split']==split].reset_index(drop=True)

        if e == 0:
            # adding values to the results_df
            # (collected only once: the folds are visited in the same order in every epoch)
            split_true = test_df['task'].tolist()
            all_true+=split_true

            split_sample = test_df['sample'].tolist()
            all_samples+=split_sample

            # load untrained model
            model = SentenceTransformer(model_name, device=device)

            # embeddings of the untrained model; they are stored on `df` as well, so the
            # train/test frames sliced from `df` in later epochs carry this column too
            pre_emb_dict = eval_utils.get_embed_dict(df['clean_transcript'].unique().tolist(), model.eval())
            train_df['pre_training_embeds'] = [pre_emb_dict[sent] for sent in train_df['clean_transcript']]
            test_df['pre_training_embeds'] = [pre_emb_dict[sent] for sent in test_df['clean_transcript']]
            df['pre_training_embeds'] = [pre_emb_dict[sent] for sent in df['clean_transcript']]

            # baseline: classify the held-out fold with the untrained embeddings
            pre_y_pred_task, y_true_task, pre_average_true_dist, pre_average_other_dist = eval_utils.one_shot_cls('pre_training_embeds',
                                                                                                                  train_df,
                                                                                                                  test_df,
                                                                                                                  'task')
            all_predictions_pre+=pre_y_pred_task
            all_true_dist_pre+=pre_average_true_dist
            all_other_dist_pre+=pre_average_other_dist

        else:
            # load the model trained in the previous epoch
            pre_model_path = model_folder+"/epoch{}_split{}".format(e-1,split)
            model = SentenceTransformer(pre_model_path, device=device)
    
        #===========================================
        # TRAINING
        # hard negatives are always mined with the *untrained* model's embeddings
        # ('pre_training_embeds'), also in later epochs
        train_examples = make_cls_pairs_hard(train_df, "task", 'pre_training_embeds', "clean_transcript", n=5, tasks_to_drop=tasks_to_drop)
        print("PAIRS TO LEARN: ", len(train_examples))
        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

        train_loss = losses.ContrastiveLoss(model=model)

        # one pass over the pairs per outer epoch; the checkpoint is what the next epoch resumes from
        model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=50)
        model.save(model_path)
        #===========================================
        # POST-TRAINING EVAL by FOLD

        emb_dict = eval_utils.get_embed_dict(df['clean_transcript'].unique().tolist(), model.eval())
        train_df['post_training_embeds'] = [emb_dict[sent] for sent in train_df['clean_transcript']]
        test_df['post_training_embeds'] = [emb_dict[sent] for sent in test_df['clean_transcript']]

        y_pred_task, y_true_task, average_true_dist, average_other_dist = eval_utils.one_shot_cls('post_training_embeds',
                                                                                                  train_df,
                                                                                                  test_df,
                                                                                                  'task')

        all_predictions+=y_pred_task
        all_true_dist+=average_true_dist
        all_other_dist+=average_other_dist

    #===========================================
    # POST-TRAINING EVAL by EPOCH
    # The distance accumulators hold the SUM of the per-fold averages, so they are
    # divided by the number of folds before printing; otherwise the reported value
    # would grow with the number of splits instead of being an average distance.
    if e==0:
        results_df['samples']=all_samples
        results_df['true']=all_true
        results_df['pre']=all_predictions_pre
        results_df['epoch0']=all_predictions
        
        print("--------------------------------------")
        print("PRE TRAINING")
        print("--------------------------------------")
        eval_utils.compute_task_scores(df, 'task','pre_training_embeds')
        print("DIST TO TRUE: ", all_true_dist_pre / n_folds)
        print("DIST TO OTHER: ", all_other_dist_pre / n_folds)
        print(metrics.classification_report(all_true, all_predictions_pre))
        eval_utils.plot_n_random_tasks(df, 'task', 'pre_training_embeds', n=10)
        
        
    else:
        results_df['epoch'+str(e)]=all_predictions
        
    print("--------------------------------------")
    print("POST EPOCH {}".format(e))
    print("--------------------------------------")
    print("DIST TO TRUE: ", all_true_dist / n_folds)
    print("DIST TO OTHER: ", all_other_dist / n_folds)
    print(metrics.classification_report(all_true, all_predictions))
    
    # some metrics for the last fold model
    # (`emb_dict` still holds the embeddings of the model trained on the last split)
    df['last_fold_training_embeds'] = [emb_dict[sent] for sent in df['clean_transcript']]
    eval_utils.compute_task_scores(df, 'task','last_fold_training_embeds')
    eval_utils.plot_n_random_tasks(df, 'task', 'last_fold_training_embeds', n=10)
    
    # written after every epoch so partial results survive an interrupted run
    results_df.to_csv(model_folder+"/results_cls.csv", index=False)
    # NOTE: an export of `cluster_df` used to follow here, but no `cluster_df` is ever
    # created in this notebook, so on a fresh kernel it raised a NameError at the end
    # of the first epoch. Re-add the export once such a frame is actually built.