## Installing Sentence Transsformer and other models/frameworks

In [None]:
!pip install sentence_transformers

# Kindly add all your installations and versions if any in this cell.

## Importing necessary libraries. 
In the final version all imports should be stricly enlisted here.

In [1]:
import pandas as pd
import numpy as np
import spacy
from scipy import stats
from sklearn import linear_model

import warnings
warnings.filterwarnings("ignore")

from sentence_transformers import SentenceTransformer, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

import torch 
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm
2022-11-28 23:15:58.572680: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-28 23:15:58.979572: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-28 23:16:00.167356: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-28 23:16:00.167485: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7

## Load dataset: 7 marks
1 Download and unzip the dataset from this link http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz  **1 mark**

2 Complete the code in `read_sts_csv()`. **4.5 marks**

3 Create 3 dataframes one each for train, dev and val and print their final shapes. **1.5 marks**

In [2]:
def read_sts_csv(dataset_type="train", columns=['source', 'type', 'year', 'id', 'score', 'sent_a', 'sent_b']):
  path = './Stsbenchmark/stsbenchmark/' + "sts-"+ dataset_type + ".csv"
  """
  Take the input path and return the dataframe
  """

  return pd.read_csv(path, sep='\\t', names=columns, on_bad_lines = 'skip')

  

df_train = read_sts_csv('train') # create the train, dev and test dataframes
df_test = read_sts_csv('test')
df_dev = read_sts_csv('dev')

## Hyperparameters: 5 Marks
Update this cell with you choosen parameters except, NUM_EPOCHS

In [3]:
import fasttext.util

NON_CONEXTUAL_MODEL_TYPE = fasttext.load_model('cc.en.300.bin')

CONEXTUAL_MODEL_TYPE = models.Transformer('distilroberta-base')

HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL = SentenceTransformer('all-MiniLM-L6-v2')# USE THE HUGGAING FACE VERSION OF SENTENCE_TRANSFORMER_TYPE
# INPUT_PATH = <INPUT_FOLDER_PATH>
BATCH_SIZE = 16
OUT_DIM_DENSE = 256
NUM_EPOCHS = 3 ## THIS IS FIXED DO NOT CHANGE

# You are free to add your own hyperparameters as well.

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## CONFIGURATION 1: Non-contextual Embeddings + ML Regression: 8 marks
1 Load the non-contextual embedding model in variable `non_cont_model1`. **1 marks**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model1()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model1`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and dev set predictions using trained `model1`. **1.5 mark**



In [4]:
def get_feature_model1(data_frame):
  """
  Input a data frame and return the embedding vectors for the each sentence column using non_cont_model1,
  Return 2 matrices each of shape (#_samples, #size_of_word_emb).
  """
  df_sent = data_frame[["sent_a", "sent_b"]]
  list_A, list_B = [], []
  for row in range(len(df_sent)):
    list_A.append(NON_CONEXTUAL_MODEL_TYPE.get_sentence_vector(df_sent.loc[row, "sent_a"]))
    list_B.append(NON_CONEXTUAL_MODEL_TYPE.get_sentence_vector(df_sent.loc[row, "sent_b"]))

  return (np.array(list_A), np.array(list_B))  

 
## Non contextual language model
feature_1_train, feature_2_train = get_feature_model1(df_train)
X_train_tup, Ytrain = np.array(list(zip(feature_1_train, feature_2_train))), np.array(df_train["score"])
X_train = np.reshape(X_train_tup, (len(df_train), 600)) # as fastext return sentence vectors of size 300, np.reshape
                                                        # concatenates sent_a and sent_b to 600 length vector 

In [5]:
# Initiate a regression model and train it.
import numpy as np
from scipy import stats
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
clf = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
clf.fit(X_train, Ytrain)

In [6]:
# # Print spearman correlation on the predicted output of the dev set.
feature_1_dev, feature_2_dev = get_feature_model1(df_dev)
X_dev_tup, Ydev = np.array(list(zip(feature_1_dev, feature_2_dev))), np.array(df_dev["score"])
X_dev = np.reshape(X_dev_tup, (len(df_dev), 600))
Y_pred_dev = clf.predict(X_dev)
spear_dev = stats.spearmanr(Ydev, Y_pred_dev)
print("Spearman Score on dev: " + str(spear_dev[0]))

Spearman Score on dev: 0.28387182507452213


In [7]:
# # Print spearman correlation on the predicted output of the test sets.
feature_1_test, feature_2_test = get_feature_model1(df_test)
X_test_tup, Ytest = np.array(list(zip(feature_1_test, feature_2_test))), np.array(df_test["score"])
X_test = np.reshape(X_test_tup, (len(df_test), 600))
Y_pred_test = clf.predict(X_test)
spear_test = stats.spearmanr(Ytest, Y_pred_test)
print("Spearman Score on test: " + str(spear_test[0]))

Spearman Score on test: 0.47763114660927947


## CONFIGURATION 2: Contextual Embeddings + ML Regression: 7 marks
1 Load the contextual embedding model in variable `non_cont_model2`. **1 marks**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model2()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model2`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model2`. **1.5 mark**

Useful references: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [10]:
def get_feature_model2(data_frame: pd.DataFrame):
  """
  Input a data frame and return the embedding vectors for the each sentence column using model2,
  Return 2 matrices each of shape (#_samples, #size_of_word_emb).
  """
  
  sent_list_A = data_frame["sent_a"].to_list()
  sent_list_B = data_frame["sent_b"].to_list()

  embeddings1 = HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL.encode(sent_list_A, convert_to_tensor=True)
  embeddings2 = HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL.encode(sent_list_B, convert_to_tensor=True)
  embeddings1 = embeddings1.cpu()
  embeddings2 = embeddings2.cpu()
  return (np.array(embeddings1), np.array(embeddings2))



feature_1_train_cont, feature_2_train_cont = get_feature_model2(df_train)
X_train_cont_tup, Y_train_cont = np.array(list(zip(feature_1_train_cont, feature_2_train_cont))), np.array(df_train["score"])


In [11]:
## Support vector regressor model created
X_train_cont = np.reshape(X_train_cont_tup, (len(df_train), (384*2)))   # each embedding is of size 384
clf_cont = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
clf.fit(X_train_cont, Y_train_cont)

In [12]:
# print spearman score on dev set
feature_1_dev_cont, feature_2_dev_cont = get_feature_model2(df_dev)
X_dev_tup_cont, Ydev = np.array(list(zip(feature_1_dev_cont, feature_2_dev_cont))), np.array(df_dev["score"])
X_dev_cont = np.reshape(X_dev_tup_cont, (len(df_dev), 768))
Y_pred_dev_cont = clf.predict(X_dev_cont)
spear_dev = stats.spearmanr(Ydev, Y_pred_dev_cont)
print("Spearman Score on dev: " + str(spear_dev[0]))

Spearman Score on dev: 0.4694088468776907


In [13]:
# print spearman score on test set
feature_1_test_cont, feature_2_test_cont = get_feature_model2(df_test)
X_test_tup_cont, Ytest = np.array(list(zip(feature_1_test_cont, feature_2_test_cont))), np.array(df_test["score"])
X_test_cont = np.reshape(X_test_tup_cont, (len(df_test), 768))
Y_pred_test_cont = clf.predict(X_test_cont)
spear_test = stats.spearmanr(Ytest, Y_pred_test_cont)
print("Spearman Score on test: " + str(spear_test[0]))

Spearman Score on test: 0.576822197624574


## CONFIGURATION 3: Fine-Tune a Contextual Embeddings Model: 18 marks
1 Prepare data samples to be for the DL model to consume. Add the code in the `form_data()`. **4 marks**

3 Create the data loader, one each for train/dev/test data_input sample set obtained from `form_input_example()`. **1.5 marks**

4 Initiate `model3` consisting of **atleast** the following 3 components - `base_LM`, a `pooling_layer` and a `dense_layer`. Use appropriate activation function in dense. **Atleast** one layer of `base_LM` should be set to trainable. **5 marks**

6 Initiate the `loss`. **0.5 marks**

7 Fit the `model3`. Use `NUM_EPOCHS = 2`. **MAX_NUM_EPOCHS allowed will be 3**. **2 marks** 

8 Complete the `get_model_predicts()` to obtain predicted scores for input sentence pairs. **3.5 marks** 

9 Print the correlation scores on the dev and test set predictions. **1.5 mark**

Useful References: https://huggingface.co/blog/how-to-train-sentence-transformers 

In [14]:
if torch.cuda.is_available():
    dev = "cuda:0"
    device = torch.device(dev) 
else:
    print("No gpu found!!!!!!!!!")    

In [15]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sentence_transformers import losses
from tqdm.auto import tqdm
from sentence_transformers import models
from torch import nn
from datetime import datetime
import math
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import  spearmanr


train_samples = []
dev_samples = []
test_samples = []


def form_data(data_frame, type):
  """
  Input a data frame and return the dataloder.
  """

  for row in range(len(data_frame)):
    score = float(data_frame.loc[row, "score"])/5
    inp_ex = InputExample(texts = [data_frame.loc[row, "sent_a"], data_frame.loc[row, "sent_b"]], label=score)
    if type == 'train':
      train_samples.append(inp_ex)
    if type == 'test':
      test_samples.append(inp_ex)
    if type == 'dev':
      dev_samples.append(inp_ex)
  if type == 'train':
    return DataLoader(train_samples, shuffle=True, batch_size=BATCH_SIZE)
  else:
    return None



def get_model_predicts(train_example, trained_model):
  """
  Input the dataset list and return a list of cosine similarity scores. Use the fitted final_trainable_model for obtaining encodings.
  """
  sentences1 = []
  sentences2 = []
  scores = []

  for example in train_example:
    sentences1.append(example.texts[0])
    sentences2.append(example.texts[1])
    scores.append(example.label)

  
  embedding1 = trained_model.encode(sentences1, batch_size=BATCH_SIZE, convert_to_numpy=True)
  embedding2 = trained_model.encode(sentences2, batch_size=BATCH_SIZE, convert_to_numpy=True)
  


  cosine_scores = 1 - (paired_cosine_distances(embedding1, embedding2))

  
  return cosine_scores

In [16]:
# create the dataloader for train data
dataloader_train = form_data(df_train, 'train')
form_data(df_test, 'test')
form_data(df_dev, 'dev')

In [17]:
# model intialization
# model_save_path = 'output/training_stsbenchmark_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

layer_pooling = models.Pooling(CONEXTUAL_MODEL_TYPE.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, 
                               pooling_mode_cls_token=False, pooling_mode_max_tokens=False)

layer_dense = models.Dense(in_features=layer_pooling.get_sentence_embedding_dimension(), out_features=OUT_DIM_DENSE, activation_function=nn.Tanh())
# Activation used for dense layer is tanh

model3 = SentenceTransformer(modules=[CONEXTUAL_MODEL_TYPE, layer_pooling, layer_dense])

loss = losses.CosineSimilarityLoss(model = model3)

In [18]:
model3.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 768, 'out_features': 256, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [13]:
# fit the model
model3.fit(train_objectives=[(dataloader_train, loss)], epochs=NUM_EPOCHS)

Iteration: 100%|██████████| 347/347 [00:33<00:00, 10.51it/s]
Iteration: 100%|██████████| 347/347 [00:32<00:00, 10.71it/s]
Iteration: 100%|██████████| 347/347 [00:35<00:00,  9.84it/s]
Epoch: 100%|██████████| 3/3 [01:40<00:00, 33.57s/it]


In [17]:
# prediction of spearman score on train data
cosine_scores = get_model_predicts(train_samples, model3)
spearman_scores, _ = spearmanr(df_train['score'], cosine_scores)
print(spearman_scores)

0.8212583479361734


In [18]:
# prediction of spearman score dev data
cosine_scores = get_model_predicts(dev_samples, model3)
spearman_scores, _ = spearmanr(df_dev['score'], cosine_scores)
print(spearman_scores)

0.8572441472850028


In [19]:
# prediction of spearman score on test data
cosine_scores = get_model_predicts(test_samples, model3)
spearman_scores, _ = spearmanr(df_test['score'], cosine_scores)
print(spearman_scores)

0.8182538548138503


In [20]:
# Code to freeze last but one layers

# for idx, (name, param) in enumerate(model.named_parameters()):
#     if idx <= 83:
#         param.requires_grad = False


# for idx, (name, param) in enumerate(model3.named_parameters()):
#     if param.requires_grad == True:
#         print(name)

0.auto_model.embeddings.word_embeddings.weight
0.auto_model.embeddings.position_embeddings.weight
0.auto_model.embeddings.token_type_embeddings.weight
0.auto_model.embeddings.LayerNorm.weight
0.auto_model.embeddings.LayerNorm.bias
0.auto_model.encoder.layer.0.attention.self.query.weight
0.auto_model.encoder.layer.0.attention.self.query.bias
0.auto_model.encoder.layer.0.attention.self.key.weight
0.auto_model.encoder.layer.0.attention.self.key.bias
0.auto_model.encoder.layer.0.attention.self.value.weight
0.auto_model.encoder.layer.0.attention.self.value.bias
0.auto_model.encoder.layer.0.attention.output.dense.weight
0.auto_model.encoder.layer.0.attention.output.dense.bias
0.auto_model.encoder.layer.0.attention.output.LayerNorm.weight
0.auto_model.encoder.layer.0.attention.output.LayerNorm.bias
0.auto_model.encoder.layer.0.intermediate.dense.weight
0.auto_model.encoder.layer.0.intermediate.dense.bias
0.auto_model.encoder.layer.0.output.dense.weight
0.auto_model.encoder.layer.0.output.dens