In [None]:
!pip install -q transformers
!pip install -q datasets
!pip install -q sentence_transformers
!pip install -q umap
!pip install -q umap-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
import torch
import random
import glob
import itertools
from datasets import Dataset
from scipy import stats
from torch.utils.data import DataLoader
from sklearn import metrics
from collections import Counter
from sentence_transformers import models, SentenceTransformer, losses, evaluation, InputExample

In [None]:
import sys
sys.path.append('/content/drive/MyDrive')
import eval_utils

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [None]:
def make_cls_pairs_cosine_similarity(df, score_column, transcript_column, score_interval=[1,3], tasks_to_drop=[]):
    """Build within-task transcript pairs labelled with a score-based similarity.

    Every unordered pair of *different* transcripts that share the same value
    in the ``task`` column produces one ``InputExample``. Its label is::

        1.0 - abs(score_a - score_b) / (score_interval[1] - score_interval[0])

    so two transcripts with equal scores get 1.0 and two transcripts whose
    scores differ by the full interval width get 0.0. A score difference
    larger than the interval width yields a negative label (no clamping).

    Pairs are de-duplicated by transcript text, in either order. If the same
    transcript text occurs in several rows, only the first pairing encountered
    is kept, using the scores of the rows seen first.

    Args:
        df: DataFrame containing a ``task`` column plus ``score_column`` and
            ``transcript_column``.
        score_column: Name of the numeric score column.
        transcript_column: Name of the column holding the text of each sample.
        score_interval: Two-element sequence ``[low, high]`` giving the score
            range used to normalise score differences. Not mutated.
        tasks_to_drop: Task values whose rows are excluded. Not mutated.

    Returns:
        list of ``InputExample`` objects, in the order pairs are first
        encountered when walking ``df`` from top to bottom.

    Raises:
        ValueError: If ``score_interval`` does not have a positive width.
    """
    # Use the caller-supplied interval. The previous implementation read a
    # module-level ``interval`` here, silently ignoring this parameter.
    max_diff = score_interval[1] - score_interval[0]
    if max_diff <= 0:
        raise ValueError(
            "score_interval must satisfy high > low, got {!r}".format(score_interval)
        )

    # Rows with a missing task never match any task (NaN != NaN), so they can
    # be removed up front together with the explicitly dropped tasks.
    task_values = df['task']
    df = df[~task_values.isin(tasks_to_drop) & task_values.notna()]

    # Group (transcript, score) records by task once, instead of re-filtering
    # the whole frame for every row. Row order within each task is preserved.
    members_by_task = {}
    for task, text, score in zip(df['task'], df[transcript_column], df[score_column]):
        members_by_task.setdefault(task, []).append((text, score))

    train_examples = []
    processed_pairs = set()

    for sample_task, sample, sample_score in zip(df['task'], df[transcript_column], df[score_column]):
        for pair_sample, pair_score in members_by_task[sample_task]:
            # Never pair a transcript with an identical transcript.
            if pair_sample == sample:
                continue

            # Skip pairs already emitted in either orientation.
            if (sample, pair_sample) in processed_pairs or (pair_sample, sample) in processed_pairs:
                continue

            # Map the absolute score gap onto a similarity target.
            score_diff = abs(sample_score - pair_score)
            cosine_similarity_label = 1.0 - (score_diff / max_diff)

            train_examples.append(InputExample(texts=[sample, pair_sample], label=cosine_similarity_label))
            processed_pairs.add((sample, pair_sample))

    return train_examples

In [None]:
# ---------------------------------------------------------------------------
# Data loading and configuration
# ---------------------------------------------------------------------------
df = pd.read_csv("drive/MyDrive/swedish_average.csv")
range_min = int(df['cefr_mean'].min())
range_max = int(df['cefr_mean'].max())
# Score range used to normalise score differences into similarity labels.
# Note it is computed on the full file, before the row filter below.
interval = [range_min, range_max]
num_bins = range_max-range_min+1
df['cefr_bins'] = eval_utils.get_hist_bin(df['cefr_mean'].tolist(), range_min, range_max)


df['cefr_round'] = [eval_utils.school_round(x) for x in df['cefr_mean']]
# Drop rows whose rounded score is 1 or 6. The explicit .copy() makes `df` an
# independent frame, so the embedding columns assigned inside the loop do not
# trigger pandas' SettingWithCopyWarning.
df = df[~df['cefr_round'].isin([1,6])].copy()


# Path template of the starting checkpoint for each split.
model_name = "drive/MyDrive/SWE_TASK_MODELS_SIAM/epoch4_split{}"

# Task values to exclude when building training pairs (forwarded to
# make_cls_pairs_cosine_similarity below).
tasks_to_drop = []
#tasks_to_drop = [1,13,3]
#tasks_to_drop = [23, 27, 15]

criterion_column = 'cefr_mean' #'ta_facets'
bin_column = 'cefr_round' #'ta_bins_r'
folder_name = "SWE_CEFR_MODELS_SIAM_COSINE_TASK"

# One row per test sample; one prediction column per epoch (plus 'pre').
results_df = pd.DataFrame()
all_true = []
all_samples = []

# ---------------------------------------------------------------------------
# Cross-validated fine-tuning: each outer iteration trains every split's model
# for one more epoch, resuming from the checkpoint saved in the previous one.
# ---------------------------------------------------------------------------
for e in range(5):
  print("epoch "+str(e))
  all_predictions = []
  pred_before_training = []
  for split in df['split'].unique():
    print("----------------------------")
    print("split {}".format(split))

    # make saving path
    model_path = "drive/MyDrive/"+folder_name+"/epoch{}_split{}".format(e,split)

    # Hold out the current split for testing; train on all other splits.
    train_df = df[df['split']!=split].reset_index(drop=True)
    test_df = df[df['split']==split].reset_index(drop=True)

    if e == 0:
      # Ground truth and sample ids are collected once, in split order, so
      # they line up with the per-epoch prediction lists.
      split_true = test_df[criterion_column].tolist()
      all_true+=split_true

      split_sample = test_df['sample'].tolist()
      all_samples+=split_sample

      # load the starting checkpoint for this split (not yet trained here)
      model = SentenceTransformer(model_name.format(split), device=device)

      # Baseline embeddings and predictions before any fine-tuning.
      pre_emb_dict = eval_utils.get_embed_dict(df['clean_transcript'].unique().tolist(), model)
      train_df['pre_training_embeds'] = [pre_emb_dict[sent] for sent in train_df['clean_transcript']]
      test_df['pre_training_embeds'] = [pre_emb_dict[sent] for sent in test_df['clean_transcript']]
      df['pre_training_embeds'] = [pre_emb_dict[sent] for sent in df['clean_transcript']]


      y_pred_ta = eval_utils.get_bert_n_closest_score(train_df,
                                                      test_df,
                                                      "pre_training_embeds",
                                                      criterion_column)
      pred_before_training+=y_pred_ta

    else:
      # load the model trained in the previous epoch for this split
      pre_model_path = "drive/MyDrive/"+folder_name+"/epoch{}_split{}".format(e-1,split)
      model = SentenceTransformer(pre_model_path, device=device)

    # NOTE(review): these embeddings do not appear to be read again before
    # training — confirm whether this pass is still needed.
    emb_dict = eval_utils.get_embed_dict(train_df['clean_transcript'].unique().tolist(), model)
    train_df['pre_training_embeds'] = [emb_dict[sent] for sent in train_df['clean_transcript']]
    train_examples = make_cls_pairs_cosine_similarity(train_df, criterion_column, "clean_transcript",
                                                      score_interval=interval,
                                                      tasks_to_drop=tasks_to_drop)
    #random.shuffle(train_examples)
    #train_examples = train_examples[:200]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    train_loss = losses.CosineSimilarityLoss(model=model)

    # One pass over the pairs, then checkpoint so the next epoch can resume.
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=50)
    model.save(model_path)

    # Re-embed every transcript with the updated model.
    emb_dict = eval_utils.get_embed_dict(df['clean_transcript'].unique().tolist(), model)
    train_df['post_training_embeds'] = [emb_dict[sent] for sent in train_df['clean_transcript']]
    test_df['post_training_embeds'] = [emb_dict[sent] for sent in test_df['clean_transcript']]
    # Overwritten on every split: after the inner loop `df` holds the
    # embeddings produced by the last split's model.
    df['post_training_embeds'] = [emb_dict[sent] for sent in df['clean_transcript']]

    y_pred_ta = eval_utils.get_bert_n_closest_score(train_df,
                                                    test_df,
                                                    "post_training_embeds",
                                                    criterion_column)

    all_predictions+=y_pred_ta


  if e==0:
    results_df['samples']=all_samples
    results_df['true']=all_true
    results_df['pre']=pred_before_training
    results_df['epoch0']=all_predictions

    print("PRE TRAINING 1NN")
    eval_utils.evaluate_cls(all_true, pred_before_training)
    print('----------')
    eval_utils.evaluate_reg(all_true, pred_before_training, "pre")
    eval_utils.compute_bin_scores(df, 'pre_training_embeds', bin_column)
    eval_utils.plot_subtask(df, 'task', 1, 'pre_training_embeds', criterion_column)


  else:
    results_df['epoch'+str(e)]=all_predictions

  print("POST TRAINING 1NN")
  eval_utils.evaluate_cls(all_true, all_predictions)
  print('----------')
  eval_utils.evaluate_reg(all_true, all_predictions, "sbert 1nn epoch {}".format(e))
  print('----------')
  eval_utils.compute_task_scores(df, 'task','post_training_embeds')
  print('----------')
  eval_utils.compute_bin_scores(df, 'post_training_embeds', bin_column)
  eval_utils.plot_n_random_tasks(df, 'task', 'post_training_embeds', n=10)
  eval_utils.plot_subtask(df, 'task', 1, 'post_training_embeds', criterion_column)
  #-----------------------------------------

  # Persist after every epoch so partial results survive an interrupted run.
  results_df.to_csv("drive/MyDrive/"+folder_name+"/results_cls.csv", index=False)