# Read data

In [1]:
import pandas as pd

In [2]:
# Load the training split from CSV and preview its first rows.
train_filename = "dataset/sts-b_train.csv"
data_train = pd.read_csv(filepath_or_buffer=train_filename)
data_train.head(n=5)

Unnamed: 0,sentence1,sentence2,score,id
0,A plane is taking off.,An air plane is taking off.,5.0,0
1,A man is playing a large flute.,A man is playing a flute.,3.8,1
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8,2
3,Three men are playing chess.,Two men are playing chess.,2.6,3
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4


In [3]:
# Load the test split from CSV and preview its first rows.
test_filename = "dataset/sts-b_test.csv"
data_test = pd.read_csv(filepath_or_buffer=test_filename)
data_test.head(n=5)

Unnamed: 0,id,sentence1,sentence2
0,100000,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.
1,100001,A young child is riding a horse.,A child is riding a horse.
2,100002,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.
3,100003,A woman is playing the guitar.,A man is playing guitar.
4,100004,A woman is playing the flute.,A man is playing a flute.


# Sentence transformers model

In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
# Name of the sentence-transformers model used to embed every sentence below.
MODEL_NAME = 'bert-large-nli-stsb-mean-tokens'
model = SentenceTransformer(MODEL_NAME)

### Train data

In [7]:
import time

In [8]:
# Pull both sentence columns of the training frame out as plain Python lists,
# which is the input format handed to model.encode() afterwards.
train_sentence1, train_sentence2 = (
    data_train[column].tolist() for column in ('sentence1', 'sentence2')
)

In [9]:
# Encode both sentence columns of the training set and report how long it took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted while the encoding runs.
start_time = time.perf_counter()
emdeb_train_1 = model.encode(train_sentence1, show_progress_bar=True)
emdeb_train_2 = model.encode(train_sentence2, show_progress_bar=True)
end_time = time.perf_counter()
print('Train sentences encoded in ' + str(end_time-start_time))

Batches: 100%|███████████████████████████████████████████████████████████████████████| 713/713 [00:51<00:00, 13.77it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████| 713/713 [00:52<00:00, 13.64it/s]


Train sentences encoded in 104.08112931251526


### Test data

In [10]:
# Pull both sentence columns of the test frame out as plain Python lists,
# which is the input format handed to model.encode() afterwards.
test_sentence1, test_sentence2 = (
    data_test[column].tolist() for column in ('sentence1', 'sentence2')
)

In [11]:
# Encode both sentence columns of the test set and report how long it took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted while the encoding runs.
start_time = time.perf_counter()
emdeb_test_1 = model.encode(test_sentence1)
emdeb_test_2 = model.encode(test_sentence2)
end_time = time.perf_counter()
print('Test sentences encoded in ' + str(end_time-start_time))

Test sentences encoded in 23.702237367630005


# Cosine similarity

In [12]:
from scipy.spatial.distance import cosine 

In [13]:
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    SciPy's ``cosine`` computes a cosine *distance* (1 - similarity), so the
    value is converted back into a similarity before being returned.

    Args:
        embedding1: first vector (1-D sequence of numbers).
        embedding2: second vector, same length as ``embedding1``.

    Returns:
        ``1 - cosine(embedding1, embedding2)``.
    """
    return 1 - cosine(embedding1, embedding2)

In [14]:
# Number of training sentence pairs (length of the first embedding collection).
NUMBER_SENTENCES_TRAIN = len(emdeb_train_1)

In [15]:
# Cosine similarity of every (sentence1, sentence2) training pair, in row order.
# The two embedding collections are walked in lockstep instead of by index, so
# the cell no longer leaves `i` and a per-pair `score` behind as globals
# (`score` is also the name used for the gold-label list in later cells).
assert len(emdeb_train_1) == len(emdeb_train_2), "train embeddings are not aligned"
cosine_train = [
    calculate_cosine_similarity(embedding1, embedding2)
    for embedding1, embedding2 in zip(emdeb_train_1, emdeb_train_2)
]

# Euclidean Distance

In [16]:
from sklearn.metrics.pairwise import euclidean_distances

In [17]:
def calculate_euclidean_similarity(embedding1, embedding2):
    """Return the Euclidean distance between two embeddings as a pairwise matrix.

    Despite the function name, the value is a *distance*: larger means less
    similar. Each embedding is wrapped in a one-element list so that
    ``euclidean_distances`` receives two single-row inputs; the result is the
    pairwise-distance matrix for that single pair, and callers read the scalar
    with ``[0][0]``.

    Args:
        embedding1: first embedding vector.
        embedding2: second embedding vector.

    Returns:
        The output of ``euclidean_distances([embedding1], [embedding2])``.
    """
    rows_a, rows_b = [embedding1], [embedding2]
    return euclidean_distances(rows_a, rows_b)

In [18]:
# Euclidean distance of every (sentence1, sentence2) training pair, in row order.
# The helper returns a pairwise matrix for the single pair, hence the [0][0].
# The two embedding collections are walked in lockstep instead of by index, so
# the cell no longer leaves `i` and a per-pair `score` behind as globals
# (`score` is also the name used for the gold-label list in later cells).
assert len(emdeb_train_1) == len(emdeb_train_2), "train embeddings are not aligned"
euclidean_train = [
    calculate_euclidean_similarity(embedding1, embedding2)[0][0]
    for embedding1, embedding2 in zip(emdeb_train_1, emdeb_train_2)
]

In [19]:
# Peek at the first five Euclidean distances.
euclidean_train[:5]

[3.9185338, 10.984579, 13.481616, 18.971045, 7.6697454]

# Manhattan Distance

In [20]:
from sklearn.metrics.pairwise import manhattan_distances

In [21]:
def calculate_manhattan_similarity(embedding1, embedding2):
    """Return the Manhattan (L1) distance between two embeddings as a pairwise matrix.

    Despite the function name, the value is a *distance*: larger means less
    similar. Each embedding is wrapped in a one-element list so that
    ``manhattan_distances`` receives two single-row inputs; the result is the
    pairwise-distance matrix for that single pair, and callers read the scalar
    with ``[0][0]``.

    Args:
        embedding1: first embedding vector.
        embedding2: second embedding vector.

    Returns:
        The output of ``manhattan_distances([embedding1], [embedding2])``.
    """
    rows_a, rows_b = [embedding1], [embedding2]
    return manhattan_distances(rows_a, rows_b)

In [22]:
# Manhattan distance of every (sentence1, sentence2) training pair, in row order.
# The helper returns a pairwise matrix for the single pair, hence the [0][0].
# The two embedding collections are walked in lockstep instead of by index, so
# the cell no longer leaves `i` and a per-pair `score` behind as globals
# (`score` is also the name used for the gold-label list in later cells).
assert len(emdeb_train_1) == len(emdeb_train_2), "train embeddings are not aligned"
manhattan_train = [
    calculate_manhattan_similarity(embedding1, embedding2)[0][0]
    for embedding1, embedding2 in zip(emdeb_train_1, emdeb_train_2)
]

In [23]:
# Peek at the first five Manhattan distances.
manhattan_train[:5]

[100.73560256196652,
 279.86062546956236,
 343.22615252342075,
 493.15096923476085,
 195.8692759034575]

# Spearman correlation (train)

In [24]:
from scipy.stats import spearmanr

In [25]:
# Gold similarity labels of the training pairs.
score = data_train['score'].tolist()

# Spearman rank correlation between the cosine similarities and the gold
# labels; index 0 of the spearmanr result is the correlation coefficient.
cosine_spearman = spearmanr(cosine_train, score)[0]
print("Score pour Sentence-Bert (cosine) : " + str(cosine_spearman))

Score pour Sentence-Bert (cosine) : 0.9901353542010408


In [26]:
# Gold similarity labels of the training pairs.
score = data_train['score'].tolist()

# Spearman rank correlation between the Euclidean distances and the gold
# labels; index 0 of the spearmanr result is the correlation coefficient.
# A distance shrinks as sentences get closer, so a negative sign is expected.
euclidean_spearman = spearmanr(euclidean_train, score)[0]
print("Score pour Sentence-Bert (euclidean) : " + str(euclidean_spearman))

Score pour Sentence-Bert (euclidean) : -0.9877641779636843


In [27]:
# Gold similarity labels of the training pairs.
score = data_train['score'].tolist()

# Spearman rank correlation between the Manhattan distances and the gold
# labels; index 0 of the spearmanr result is the correlation coefficient.
# A distance shrinks as sentences get closer, so a negative sign is expected.
manhattan_spearman = spearmanr(manhattan_train, score)[0]
print("Score pour Sentence-Bert (manhattan) : " + str(manhattan_spearman))

Score pour Sentence-Bert (manhattan) : -0.9875924132641959


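Les distances euclidienne et de Manhattan sont négativement corrélées aux scores (une distance faible correspond à une similarité élevée) ; en valeur absolue, c'est la similarité cosinus qui obtient la corrélation de Spearman la plus élevée sur le jeu d'entraînement (0,990 contre 0,988). On choisit donc d'utiliser la similarité cosinus comme métrique.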

# Test set predictions

In [28]:
# Number of test sentence pairs (length of the first embedding collection).
NUMBER_SENTENCES_TEST = len(emdeb_test_1)

In [29]:
# Cosine similarity of every (sentence1, sentence2) test pair, in row order.
# The two embedding collections are walked in lockstep instead of by index, so
# the cell no longer leaves `i` and a per-pair `score` behind as globals
# (`score` is also the name used for the gold-label list in earlier cells).
assert len(emdeb_test_1) == len(emdeb_test_2), "test embeddings are not aligned"
cosine_test = [
    calculate_cosine_similarity(embedding1, embedding2)
    for embedding1, embedding2 in zip(emdeb_test_1, emdeb_test_2)
]

In [30]:
# Euclidean distance of every (sentence1, sentence2) test pair, in row order.
# The helper returns a pairwise matrix for the single pair, hence the [0][0].
# The two embedding collections are walked in lockstep instead of by index, so
# the cell no longer leaves `i` and a per-pair `score` behind as globals
# (`score` is also the name used for the gold-label list in earlier cells).
assert len(emdeb_test_1) == len(emdeb_test_2), "test embeddings are not aligned"
euclidean_test = [
    calculate_euclidean_similarity(embedding1, embedding2)[0][0]
    for embedding1, embedding2 in zip(emdeb_test_1, emdeb_test_2)
]

In [31]:
# Manhattan distance of every (sentence1, sentence2) test pair, in row order.
# The helper returns a pairwise matrix for the single pair, hence the [0][0].
# The two embedding collections are walked in lockstep instead of by index, so
# the cell no longer leaves `i` and a per-pair `score` behind as globals
# (`score` is also the name used for the gold-label list in earlier cells).
assert len(emdeb_test_1) == len(emdeb_test_2), "test embeddings are not aligned"
manhattan_test = [
    calculate_manhattan_similarity(embedding1, embedding2)[0][0]
    for embedding1, embedding2 in zip(emdeb_test_1, emdeb_test_2)
]

In [34]:
# Empty output frame with the two columns that are filled in and exported below.
result_columns = ['id', 'score']
resultat = pd.DataFrame(columns=result_columns)

In [35]:
# Fill the output frame: test ids first, then the cosine similarity of each pair.
resultat = resultat.assign(id=data_test['id'], score=cosine_test)

In [36]:
# Preview the first rows of the output frame before writing it to disk.
resultat.head(n=5)

Unnamed: 0,id,score
0,100000,0.996951
1,100001,0.987699
2,100002,0.995232
3,100003,0.550654
4,100004,0.779314


In [37]:
# Write the (id, score) table to CSV without the DataFrame index.
# The output directory is created first so the export also works on a fresh
# checkout where ./results does not exist yet (to_csv does not create folders).
import os

os.makedirs('./results', exist_ok=True)
resultat.to_csv('./results/SentenceTransformer_cosine.csv', index=False)