In [1]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

### Getting our Data

In [2]:
# Locations of the training and validation splits.
# NOTE(review): these are absolute paths tied to one machine's /workspace layout;
# adjust them (or derive them from a configurable data directory) before re-running elsewhere.
train_path = '/workspace/grasp-data-hometask-semantic-similarity-master/data/train.data'
val_path = '/workspace/grasp-data-hometask-semantic-similarity-master/data/dev.data'

In [3]:
import pandas as pd
# read_csv is told there is no header row (header=None), so the column names are supplied here.
col_names = [
    'Topic_Id', 'Topic_Name',
    'Sent_1', 'Sent_2',
    'Label',
    'Sent_1_tag', 'Sent_2_tag',
]
# Tab-separated file, one record per '\n'-terminated line.
train_df = pd.read_csv(
    train_path,
    sep='\t',
    lineterminator='\n',
    header=None,
    names=col_names,
)
train_df.head(3)

Unnamed: 0,Topic_Id,Topic_Name,Sent_1,Sent_2,Label,Sent_1_tag,Sent_2_tag
0,4,1st QB,EJ Manuel the 1st QB to go in this draft,But my bro from the 757 EJ Manuel is the 1st Q...,"(5, 0)",EJ/B-person/NNP/B-NP/O Manuel/I-person/NNP/B-V...,But/O/CC/O/O my/O/PRP$/B-NP/O bro/O/NN/I-NP/O ...
1,4,1st QB,EJ Manuel the 1st QB to go in this draft,Can believe EJ Manuel went as the 1st QB in th...,"(5, 0)",EJ/B-person/NNP/B-NP/O Manuel/I-person/NNP/B-V...,Can/O/MD/B-VP/O believe/O/VB/I-VP/B-EVENT EJ/B...
2,4,1st QB,EJ Manuel the 1st QB to go in this draft,EJ MANUEL IS THE 1ST QB what,"(3, 2)",EJ/B-person/NNP/B-NP/O Manuel/I-person/NNP/B-V...,EJ/B-person/NNP/B-NP/O MANUEL/I-person/NNP/I-N...


In [4]:
# Validation split: same tab-separated layout and column names as the training file.
val_df = pd.read_csv(
    val_path,
    sep='\t',
    lineterminator='\n',
    header=None,
    names=col_names,
)
val_df.head(3)

Unnamed: 0,Topic_Id,Topic_Name,Sent_1,Sent_2,Label,Sent_1_tag,Sent_2_tag
0,17,A Walk To Remember,A Walk to Remember is the definition of true love,A Walk to Remember is on and Im in town and Im...,"(1, 4)",A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...
1,17,A Walk To Remember,A Walk to Remember is the definition of true love,A Walk to Remember is the cutest thing,"(3, 2)",A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...
2,17,A Walk To Remember,A Walk to Remember is the definition of true love,A walk to remember is on ABC family youre welcome,"(1, 4)",A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,A/O/DT/B-NP/O walk/O/NN/I-NP/O to/O/TO/B-VP/O ...


##### Converting the label tuples to integer scores (0–5)

In [5]:
def preproc(df):
    '''Replace the label strings in ``df['Label']`` with integer scores 0-5.

    Each of the six known strings ``'(k, 5 - k)'`` is replaced by the
    integer ``k``. Values that match none of them are left untouched.

    The frame is modified in place and the very same object is returned.
    '''
    label_to_score = {
        '(0, 5)': 0,
        '(1, 4)': 1,
        '(2, 3)': 2,
        '(3, 2)': 3,
        '(4, 1)': 4,
        '(5, 0)': 5,
    }
    for raw_label, score in label_to_score.items():
        # Boolean-mask assignment rewrites only the rows holding this exact string.
        df.loc[df['Label'] == raw_label, 'Label'] = score
    return df

# preproc() rewrites the Label column in place and hands the same frame back.
train_df, val_df = preproc(train_df), preproc(val_df)

In [6]:
# Inspect the training frame after preproc(): the Label column now holds the mapped scores.
train_df

Unnamed: 0,Topic_Id,Topic_Name,Sent_1,Sent_2,Label,Sent_1_tag,Sent_2_tag
0,4,1st QB,EJ Manuel the 1st QB to go in this draft,But my bro from the 757 EJ Manuel is the 1st Q...,5,EJ/B-person/NNP/B-NP/O Manuel/I-person/NNP/B-V...,But/O/CC/O/O my/O/PRP$/B-NP/O bro/O/NN/I-NP/O ...
1,4,1st QB,EJ Manuel the 1st QB to go in this draft,Can believe EJ Manuel went as the 1st QB in th...,5,EJ/B-person/NNP/B-NP/O Manuel/I-person/NNP/B-V...,Can/O/MD/B-VP/O believe/O/VB/I-VP/B-EVENT EJ/B...
2,4,1st QB,EJ Manuel the 1st QB to go in this draft,EJ MANUEL IS THE 1ST QB what,3,EJ/B-person/NNP/B-NP/O Manuel/I-person/NNP/B-V...,EJ/B-person/NNP/B-NP/O MANUEL/I-person/NNP/I-N...
3,4,1st QB,EJ Manuel the 1st QB to go in this draft,EJ da 1st QB off da board,2,EJ/B-person/NNP/B-NP/O Manuel/I-person/NNP/B-V...,EJ/O/NNP/B-NP/O da/O/DT/I-NP/O 1st/O/CD/I-NP/O...
4,4,1st QB,EJ Manuel the 1st QB to go in this draft,Manuel is the 1st QB to get drafted,4,EJ/B-person/NNP/B-NP/O Manuel/I-person/NNP/B-V...,Manuel/B-person/NNP/B-NP/O is/O/VBZ/B-VP/O the...
...,...,...,...,...,...,...,...
13058,1891,iPhone 5,It fits the larger iPhone 5,My brother thirsty for this iPhone 5 so ill gu...,0,It/O/PRP/B-NP/O fits/O/VBZ/B-VP/O the/O/DT/B-N...,My/O/PRP$/B-NP/O brother/O/NN/I-NP/O thirsty/O...
13059,1891,iPhone 5,It fits the larger iPhone 5,Should I get the iPhone 5 or an Android,0,It/O/PRP/B-NP/O fits/O/VBZ/B-VP/O the/O/DT/B-N...,Should/O/UH/O/O I/O/PRP/B-NP/O get/O/VBP/B-VP/...
13060,1891,iPhone 5,It fits the larger iPhone 5,Somebody bring me an iPhone 5 charger to my work,0,It/O/PRP/B-NP/O fits/O/VBZ/B-VP/O the/O/DT/B-N...,Somebody/O/NN/B-NP/O bring/O/VB/B-VP/B-EVENT m...
13061,1891,iPhone 5,It fits the larger iPhone 5,Unlocked iPhone 5 300 hit me,0,It/O/PRP/B-NP/O fits/O/VBZ/B-VP/O the/O/DT/B-N...,Unlocked/O/NNP/B-NP/O iPhone/O/NNP/I-NP/O 5/O/...


#### Preparing Data

In [7]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample, models

2023-10-16 12:01:31.524909: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-16 12:01:31.696090: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-16 12:01:32.297256: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-10-16 12:01:32.297344: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [9]:
def prepare_samples(df):
    """Build one ``InputExample`` per row of ``df``.

    Each example pairs the row's ``Sent_1`` and ``Sent_2`` texts and uses
    ``Label / 5.0`` as its target, which maps a 0-5 label onto 0-1.

    Returns the examples as a list, in row order.
    """
    return [
        InputExample(
            texts=[record['Sent_1'], record['Sent_2']],
            label=float(record['Label']) / 5.0,  # normalise the score to the range 0 ... 1
        )
        for _index, record in df.iterrows()
    ]

In [10]:
# Convert both splits into lists of sentence-pair examples.
train_samples, val_samples = prepare_samples(train_df), prepare_samples(val_df)

In [11]:
# Checkpoint identifier handed to SentenceTransformer; the resulting model is the one
# passed to the loss and fine-tuned by model.fit() further down.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [12]:
# Shuffled mini-batches of 16 training examples.
train_dataloader = torch.utils.data.DataLoader(
    train_samples,
    batch_size=16,
    shuffle=True,
)

# Training objective: cosine-similarity loss wrapped around the model.
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator built from the validation examples; it is passed to model.fit() below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

####  Training the network

In [16]:
import math

# Number of passes over the training data.
# NOTE(review): the progress output recorded under the training cell shows 5 epochs,
# so that output appears to come from a run with a different setting — confirm.
num_epochs = 10
# Warm-up length: 10% of the total number of training steps
# (len(train_dataloader) * num_epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)


In [17]:
# Train the model with the cosine-similarity objective defined above.
# NOTE(review): per the sentence-transformers fit() documentation, the evaluator is
# run every `evaluation_steps` training steps and the model is written to
# `output_path` — confirm against the installed library version.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path='./results_minilm_2')

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1633 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1633 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1633 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1633 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1633 [00:00<?, ?it/s]

#### Inference

In [8]:
# Directory of the saved model that is loaded for inference.
# NOTE(review): this is not the './results_minilm_2' output_path written by the
# training cell above — confirm which checkpoint is meant to be evaluated here.
model_save_path = './results_roberta_base'

In [9]:
# Load the saved model from disk; this rebinds `model`, replacing the object created earlier.
model = SentenceTransformer(model_save_path)

In [10]:
# Quick sanity check: embed one hand-picked sentence pair.
sentences = [
    "All the home alones watching 8 mile",
    "8 mile is on thats my movie",
]
encodings = model.encode(sentences)

In [11]:
from sentence_transformers import SentenceTransformer, util
# Cosine similarity between the two sentence embeddings; the result is a 1x1 tensor.
cosine_scores = util.cos_sim(encodings[0], encodings[1])
cosine_scores

tensor([[0.2080]])

In [12]:
# Unwrap the single-element tensor into a plain Python float.
cosine_scores.item()

0.20799854397773743

In [13]:
# Test split that is scored line by line in the next cell.
# NOTE(review): absolute, machine-specific path — adjust before re-running elsewhere.
test_path = '/workspace/grasp-data-hometask-semantic-similarity-master/data/test.data'

In [14]:
# Score every pair in the test file: embed both sentences, take their cosine
# similarity, and record one "<true|false>\t<score>\n" line per input row.
threshold = 0.5  # pairs scoring at or above this value are labelled "true"
results = []
# Explicit encoding so decoding does not depend on the platform's default locale.
with open(test_path, encoding='utf-8') as test_file:
    for line_no, tline in enumerate(test_file, start=1):
        lines = tline.rstrip('\n').split('\t')
        # Fields 2 and 3 (0-based) are the two sentences handed to the model.
        sentences = lines[2:4]
        if len(sentences) != 2:
            # Fail with a clear message rather than an obscure error further down;
            # skipping the row would leave `results` shorter than the test file.
            raise ValueError(
                "line {0} of {1} has {2} tab-separated fields; expected at least 4".format(
                    line_no, test_path, len(lines)
                )
            )
        encodings = model.encode(sentences)
        cosine_scores = util.cos_sim(encodings[0], encodings[1]).item()

        # Single formatting path for both outcomes; only the label differs.
        label = "true" if cosine_scores >= threshold else "false"
        results.append("{0}\t{1:.4f}\n".format(label, cosine_scores))
        

In [20]:
res_path = '/workspace/grasp-data-hometask-semantic-similarity-master/systemoutputs/PIT2015_BASELINE_01_sbert_roberta.output'
# Every entry in `results` already ends with a newline, so the entries are
# written back-to-back without adding separators.
with open(res_path, 'w+') as out_file:
    out_file.writelines(results)