In [31]:
#import libraries
import os
import torch
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
import tensorflow as tf
import numpy as np
import pandas as pd
import subprocess

In [37]:
#all-MiniLM -L6-v2
model_1 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')

#all-mpnet-base-v2
model_2 = SentenceTransformer("sentence-transformers/all-mpnet-base-v2",device = 'cpu')

#t5
model_3 = SentenceTransformer('sentence-transformers/sentence-t5-large',device = 'cpu')

#Roberta
model_4 = SentenceTransformer('sentence-transformers/msmarco-roberta-base-v2', device = 'cpu')

#all-distilroberta-v1
model_5 = SentenceTransformer('sentence-transformers/all-distilroberta-v1', device = 'cpu')




#Dataset preprocessing - removing metadata
def processing(input_file,output_file,model) :

    with open(input_file, "r") as in_file , open(output_file, "w") as out_file:
        for line in in_file :
            s = line.strip().split("\t")
            s_1 = s[0]
            s_2 = s[1]
            embedding_1 = embeddings(model,s_1)
            embedding_2 = embeddings(model,s_2)
            similarity_score = similarity(model,embedding_1,embedding_2)
            out_file.write(f"{similarity_score}\n")




#Scaled similarity
def similarity (m,s1,s2) :
  #Cosine similarity is in [-1,1], scaled similarity is between [0 ,5]
  if m == "model_1" :
       return (model_1.similarity(s1,s2).item() +1)*2.5
  elif m == "model_2" :
       return (model_2.similarity(s1,s2).item() +1)*2.5
  elif m == "model_3" :
       return (model_3.similarity(s1,s2).item() +1)*2.5
  elif m == "model_4" :
       return (model_4.similarity(s1,s2).item() +1)*2.5
  elif m == "model_5" :
       return (model_5.similarity(s1,s2).item() +1)*2.5


#Generate embedding based on model
def embeddings (m,s) :
    
    if m == "model_1" :
       return model_1.encode(s)
    elif m == "model_2" :
       return model_2.encode(s)
    elif m == "model_3" :
       return model_3.encode(s)
    
    elif m == "model_4" :
       return model_4.encode(s)
    elif m == "model_5" :
       return model_5.encode(s)


#Main program

folder_path = "sts2016-english-with-gs-v1.0/"
input_files = [
    folder_path + "STS2016.input.answer-answer.txt",
    folder_path + "STS2016.input.headlines.txt",
    folder_path + "STS2016.input.plagiarism.txt",
    folder_path + "STS2016.input.postediting.txt",
    folder_path + "STS2016.input.question-question.txt",
]
models = ["model_1",
          "model_2",
          "model_3",
          "model_4",
          "model_5"
          ]

gs_files = [
    folder_path + "STS2016.gs.answer-answer.txt",
    folder_path + "STS2016.gs.headlines.txt",
    folder_path + "STS2016.gs.plagiarism.txt",
    folder_path + "STS2016.gs.postediting.txt",
    folder_path + "STS2016.gs.question-question.txt",
]

perl_script = folder_path + "correlation-noconfidence.pl"

for model in models :
  for i in range(0,len (input_files)) :

    part_1 = input_files[i].strip().split("/")[0]
    part_2 = input_files[i].strip().split("/")[1]
    output_file = f"{part_1}/output_{model}_{part_2}"

    processing(input_files[i],output_file,model)

    pearson_score_file = f"{part_1}/score_{model}"

    with open(pearson_score_file,"a") as ps_file :
        try :
            result = subprocess.run (
                ["perl", perl_script, gs_files[i], output_file ],
                capture_output= True,
                text = True,
                check = True
            )
            res = result.stdout.strip()
            ps_file.write(f"Score {i + 1}: {res}\n")

        except subprocess.CalledProcessError as e:
            print("Error running Perl script:")
            print(e.stderr)











huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av