In [None]:
!pip install -q transformers
!pip install -q datasets
!pip install -q sentence_transformers
!pip install -q umap
!pip install -q umap-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m108.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
import torch
import os
from scipy import stats
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn import metrics
from collections import Counter
from sentence_transformers import models, SentenceTransformer
from transformers import PreTrainedModel, BertConfig, BertModel, AutoTokenizer, AutoModel,AutoConfig, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
import sys
sys.path.append('/content/drive/MyDrive')
import eval_utils

In [None]:
class BertForSequenceClassificationMeanPooling(PreTrainedModel):
  """Sequence classifier built on a mask-aware mean of the encoder's token states.

  The wrapped encoder's ``last_hidden_state`` is averaged over the positions
  selected by the attention mask, passed through dropout, and projected to
  ``num_labels`` logits by a single linear layer.
  """

  def __init__(self, bert_model, num_labels):
      """Wrap ``bert_model`` and attach a dropout + linear classification head.

      Args:
          bert_model: encoder module; its ``config`` is reused as this model's
              config and its ``config.hidden_size`` sizes the linear head.
          num_labels: number of output classes (also written to the config).
      """
      super().__init__(bert_model.config)
      self.bert = bert_model
      self.num_labels = num_labels
      self.config.num_labels = num_labels
      self.dropout = torch.nn.Dropout(0.1)
      self.classifier = torch.nn.Linear(bert_model.config.hidden_size, num_labels)

  def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
      """Run the encoder, mean-pool its token states and classify.

      Args:
          input_ids: token ids forwarded to the encoder.
          attention_mask: optional mask selecting the positions that take part
              in the mean; when omitted every position is used.
          token_type_ids: optional segment ids forwarded to the encoder.
          labels: optional class indices; when given, a cross-entropy loss is
              added to the result.

      Returns:
          dict with ``"logits"`` and, if ``labels`` was provided, ``"loss"``.
      """
      if attention_mask is None:
          # The signature allows omitting the mask, but the pooling below
          # needs one: fall back to attending to every position.
          attention_mask = torch.ones_like(input_ids)

      outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
      last_hidden_state = outputs.last_hidden_state

      # Broadcast the mask over the hidden dimension so masked-out positions
      # contribute zero to the sum.
      input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
      sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
      # Clamp the denominator so an all-zero mask cannot divide by zero.
      sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
      mean_embeddings = sum_embeddings / sum_mask

      pooled_output = self.dropout(mean_embeddings)
      logits = self.classifier(pooled_output)

      result = {"logits": logits}
      if labels is not None:
          loss_fct = torch.nn.CrossEntropyLoss()
          loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
          result["loss"] = loss

      return result

  @classmethod
  def from_my_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
      """Rebuild a model from a directory holding a config and ``pytorch_model.bin``.

      ``num_labels`` is recovered from the first dimension of the saved
      ``classifier.weight``. ``*model_args`` and ``**kwargs`` are accepted but
      not used.

      Returns:
          A new instance of this class with the saved weights loaded; it is
          left on the CPU for the caller to move.
      """
      print("loading pre-trained")
      # Load the configuration from the saved model
      config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
      # Load the saved state_dict. map_location="cpu" lets a checkpoint written
      # on a GPU be restored on a CPU-only runtime.
      # NOTE: torch.load unpickles the file, so only point it at trusted checkpoints.
      state_dict = torch.load(f"{pretrained_model_name_or_path}/pytorch_model.bin", map_location="cpu")
      # Derive num_labels from the state_dict
      num_labels = state_dict["classifier.weight"].size(0)
      # Initialize the BERT model
      bert_model = AutoModel.from_pretrained(pretrained_model_name_or_path, config=config)
      # Create a new instance of the class
      model = cls(bert_model, num_labels)
      # Update the model weights
      model.load_state_dict(state_dict)

      return model

In [None]:
# Use the GPU when torch reports one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def preprocess_function(examples):
    """Tokenize the ``clean_transcript`` field of a batch of examples.

    Uses the module-level ``bert_tokenizer`` and truncates to 512 tokens.
    No padding is applied here: the notebook pads per batch with
    ``DataCollatorWithPadding``, so padding inside a batched ``map`` would only
    stretch every example to the longest sequence of its whole map chunk.
    """
    return bert_tokenizer(examples['clean_transcript'], truncation=True, max_length=512)

In [None]:
def make_split_dataset(df, split, label_column):
  """Build tokenized train/test datasets for one held-out split.

  Rows whose ``split`` equals ``split`` form the test set; all other rows are
  shuffled and form the training set. Labels are ``df[label_column] - 1``.
  The caller's DataFrame is left unmodified.

  Args:
      df: frame with ``sample``, ``clean_transcript``, ``split`` and
          ``label_column`` columns.
      split: value of the ``split`` column to hold out.
      label_column: name of the column the labels are derived from.

  Returns:
      ``(train_dataset, test_dataset)`` with the ``clean_transcript`` and
      ``split`` columns removed after tokenization.
  """
  # Derive the label on a private copy so the caller's frame does not
  # silently gain (or have overwritten) a 'label' column.
  labelled = df[['sample', 'clean_transcript', 'split']].copy()
  labelled['label'] = df[label_column] - 1
  labelled = labelled[['sample', 'clean_transcript', 'label', 'split']]

  test_df = labelled[labelled['split'] == split].reset_index(drop=True)
  dataset_test = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)

  train_df = labelled[labelled['split'] != split]
  train_df = train_df.sample(frac=1).reset_index(drop=True)
  dataset_train = Dataset.from_pandas(train_df).map(preprocess_function, batched=True)

  new_dataset_train = dataset_train.remove_columns(['clean_transcript', 'split'])
  new_dataset_test = dataset_test.remove_columns(['clean_transcript', 'split'])

  return new_dataset_train, new_dataset_test


In [None]:
def make_split_dataset_no_task(df, split, label_column, tasks_to_exclude=(1, 3, 13)):
  """Build train/test datasets for one held-out split, dropping tasks from training.

  The test set is every row whose ``split`` equals ``split``. The training set
  is every other row whose ``task`` is not in ``tasks_to_exclude``, shuffled.
  Labels are ``df[label_column] - 1``. The caller's DataFrame is left
  unmodified.

  Args:
      df: frame with ``sample``, ``clean_transcript``, ``split``, ``task`` and
          ``label_column`` columns.
      split: value of the ``split`` column to hold out.
      label_column: name of the column the labels are derived from.
      tasks_to_exclude: task ids removed from the training rows only. Defaults
          to ``(1, 3, 13)``, the ids this function previously hard-coded;
          ``None`` selects the same default.

  Returns:
      ``(train_dataset, test_dataset)`` without the ``clean_transcript``
      column (and without ``split`` for the test set).
  """
  if tasks_to_exclude is None:
    tasks_to_exclude = (1, 3, 13)

  # Derive the label on a private copy: avoids writing into the caller's
  # frame and into a filtered slice of it.
  labelled = df[['sample', 'clean_transcript', 'split', 'task']].copy()
  labelled['label'] = df[label_column] - 1

  test_df = labelled.loc[labelled['split'] == split,
                         ['sample', 'clean_transcript', 'label', 'split']].reset_index(drop=True)
  dataset_test = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)

  keep_for_training = (labelled['split'] != split) & ~labelled['task'].isin(list(tasks_to_exclude))
  train_df = labelled.loc[keep_for_training, ['sample', 'clean_transcript', 'label']]
  train_df = train_df.sample(frac=1).reset_index(drop=True)
  dataset_train = Dataset.from_pandas(train_df).map(preprocess_function, batched=True)

  new_dataset_train = dataset_train.remove_columns(['clean_transcript'])
  new_dataset_test = dataset_test.remove_columns(['clean_transcript', 'split'])

  return new_dataset_train, new_dataset_test

In [None]:
# Shared Trainer settings: every Trainer built in this notebook runs a single
# epoch with these values.
training_args = TrainingArguments(
    output_dir="yo",
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    warmup_steps=0,
    # batching
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    # evaluation / logging cadence
    evaluation_strategy="epoch",
    logging_steps=100,
    # checkpointing by the Trainer is switched off; weights are saved explicitly elsewhere
    save_strategy="no",
    save_total_limit=1,
)

In [None]:
# --- experiment configuration -------------------------------------------
folder_name = "SWE_TA_MODELS_MEAN_TASK"          # output folder under drive/MyDrive
criterion_column = 'task_completion_mean'  # alternative: 'ta_facets'
bin_column = 'ta_bins'                     # alternative: 'ta_bins_r'
# Path template of the per-split starting checkpoints ({} is the split id).
model_name = "drive/MyDrive/SWE_TASK_MODELS_SIAM/epoch4_split{}"

# --- data and tokenization ----------------------------------------------
df = pd.read_csv("drive/MyDrive/swedish_average.csv")
bert_tokenizer = AutoTokenizer.from_pretrained("drive/MyDrive/SWE_TASK_MODELS_SIAM/epoch4_split0")
data_collator = DataCollatorWithPadding(tokenizer=bert_tokenizer)

# --- accumulators filled by the training loop ---------------------------
results_df = pd.DataFrame()
all_true_cls, all_true_reg, all_samples = [], [], []

# Ten rounds of leave-one-split-out fine-tuning. In every round each split is
# held out once: the model for that split is trained for one more epoch
# (starting from the previous round's saved weights), saved, and then scored
# on the held-out rows both by its classifier head and by a nearest-neighbour
# lookup over its mean-pooled embeddings.
for e in range(10):
  print("epoch "+str(e))
  all_predictions_cls = []
  all_predictions_knn = []

  for split in df['split'].unique():
    print("----------------------------")
    print("split {}".format(split))

    # Where this round's weights for this split are written.
    model_path = "epoch{}_split{}".format(e,split)
    saving_path = "drive/MyDrive/"+folder_name+"/"+model_path

    train_df = df[df['split']!=split].reset_index(drop=True)
    test_df = df[df['split']==split].reset_index(drop=True)

    dataset_train, dataset_test = make_split_dataset(df, split, bin_column)

    if e == 0:
      # Ground truth is collected once, in the same split order in which the
      # predictions of every round are concatenated.
      all_true_cls += [x-1 for x in test_df[bin_column].tolist()]
      all_true_reg += test_df[criterion_column].tolist()
      all_samples += test_df['sample'].tolist()

      # First round: split-specific encoder checkpoint plus a new classification head.
      bert_model = BertModel.from_pretrained(model_name.format(split))
      model = BertForSequenceClassificationMeanPooling(bert_model, num_labels=3)
    else:
      # Later rounds: continue from the weights saved for this split in the previous round.
      pre_model_path = "drive/MyDrive/"+folder_name+"/epoch{}_split{}".format(e-1,split)
      model = BertForSequenceClassificationMeanPooling.from_my_pretrained(pre_model_path)
    model.to(device)

    trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=dataset_train,
      eval_dataset=dataset_test,
      tokenizer=bert_tokenizer,
      data_collator=data_collator)

    trainer.train()
    # from_my_pretrained reads "<dir>/pytorch_model.bin" in the next round, so
    # force the .bin format: recent transformers releases otherwise write
    # model.safetensors by default and the reload would not find the file.
    model.save_pretrained(saving_path, safe_serialization=False)

    # Classifier-head predictions on the held-out split.
    model.eval()
    dataloader = DataLoader(dataset_test, batch_size=32, collate_fn=data_collator)

    split_predictions = []
    with torch.no_grad():
      for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)["logits"]
        split_predictions += torch.argmax(logits, dim=1).tolist()

    all_predictions_cls+=split_predictions

    # Embedding-based predictions: embed every transcript with the fine-tuned
    # encoder and score the held-out rows against the training rows.
    mean_embed_dict = eval_utils.get_mean_dict(df['clean_transcript'], model.bert, bert_tokenizer, device)
    train_df['post_training_embeds'] = [mean_embed_dict[sent] for sent in train_df['clean_transcript']]
    test_df['post_training_embeds'] = [mean_embed_dict[sent] for sent in test_df['clean_transcript']]
    # Overwritten on every split, so after the inner loop df carries the
    # embeddings produced by the last split's model.
    df['post_training_embeds'] = [mean_embed_dict[sent] for sent in df['clean_transcript']]

    y_pred_ta_split = eval_utils.get_bert_n_closest_score(train_df,
                                                          test_df,
                                                          "post_training_embeds",
                                                          criterion_column)
    all_predictions_knn+=y_pred_ta_split

  if e==0:
    # Identifier and ground-truth columns are written once, before the first
    # round's predictions.
    results_df['samples']=all_samples
    results_df['true_bins']=all_true_cls
    results_df['true_scores']=all_true_reg
  results_df['epoch'+str(e)+"_bins"]=all_predictions_cls
  results_df['epoch'+str(e)+"_scores"]=all_predictions_knn

  print("POST TRAINING CLS")
  # Bins are stored zero-based; shift back by one before evaluating.
  eval_utils.evaluate_cls([x+1 for x in all_true_cls], [x+1 for x in all_predictions_cls])
  eval_utils.evaluate_reg([x+1 for x in all_true_cls], [x+1 for x in all_predictions_cls], "sbert mean cls epoch {}".format(e))

  print("POST TRAINING 1NN")
  eval_utils.evaluate_cls(all_true_reg, all_predictions_knn)
  print('----------')
  eval_utils.evaluate_reg(all_true_reg, all_predictions_knn, "sbert mean 1nn epoch {}".format(e))
  print('----------')
  eval_utils.compute_bin_scores(df, 'post_training_embeds', bin_column)
  eval_utils.plot_n_random_tasks(df, 'task', 'post_training_embeds', n=10)
  eval_utils.plot_subtask(df, 'task', 1, 'post_training_embeds', criterion_column)

  # Persist after every round so partial results survive an interrupted run.
  results_df.to_csv("drive/MyDrive/"+folder_name+"/results_cls.csv", index=False)