In [169]:
import spacy
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

import os

# `utils` is imported from the sibling ../scripts directory by temporarily
# switching the working directory. Remember where we started and always switch
# back, even if the import fails, so later relative paths (e.g. '../dataset/stsb')
# keep resolving from the notebook's own directory.
_notebook_dir = os.getcwd()
os.chdir('../scripts')
try:
    from utils import _load_test_labels
finally:
    os.chdir(_notebook_dir)

# Load spaCy's 'en_core_web_md' pipeline; its document vectors (`nlp(text).vector`)
# are used as sentence embeddings in the cells below.
nlp = spacy.load('en_core_web_md')

In [170]:
# Seed NumPy's global random number generator so any np.random draws are repeatable.
np.random.seed(42)

In [171]:
# Load the 'stsb' configuration of the 'glue' dataset; the cells below read its
# 'train', 'validation' and 'test' splits.
from datasets import load_dataset
dataset = load_dataset('glue', 'stsb')

Reusing dataset glue (C:\Users\Ivan\.cache\huggingface\datasets\glue\stsb\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [172]:
# Read the test-split labels from '../dataset/stsb' (path is relative to the current
# working directory) with the project helper and convert them to pandas. They replace
# the 'label' column of the downloaded test split when test_dataset is built.
test_labels = _load_test_labels('../dataset/stsb').to_pandas()

In [173]:
# Convert the train and validation splits to pandas DataFrames as they are.
train_dataset, validation_dataset = (
    dataset[split_name].to_pandas() for split_name in ('train', 'validation')
)

# The test split's own 'label' column is dropped and the separately loaded
# test_labels are attached side by side (pd.concat aligns the rows on the index).
test_dataset = pd.concat(
    [dataset['test'].to_pandas().drop(columns=['label']), test_labels],
    axis=1,
)

In [174]:
# Display the assembled test split to check that the label column was attached.
test_dataset

Unnamed: 0,sentence1,sentence2,idx,label
0,A girl is styling her hair.,A girl is brushing her hair.,0,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,1,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,2,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,3,4.2
4,A man is playing a harp.,A man is playing a keyboard.,4,1.5
...,...,...,...,...
1374,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks,1374,0.0
1375,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe...",1375,1.0
1376,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...,1376,1.0
1377,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...,1377,0.0


In [183]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    Computed as ``1 - scipy.spatial.distance.cosine(vec1, vec2)``.

    A vector with no non-zero component has no direction, so the cosine is
    undefined (scipy yields NaN for it, and a single NaN turns a correlation
    computed over the similarities into NaN). Such pairs are reported as
    ``0.0`` instead.

    Parameters
    ----------
    vec1, vec2 : array-like
        1-D numeric vectors of equal length.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` if either vector is all zeros.
    """
    # Guard the zero-norm case before delegating to scipy.
    if not np.any(vec1) or not np.any(vec2):
        return 0.0
    return 1 - cosine(vec1, vec2)

def calculate_metrics(preds, labels):
    """Return ``(pearson, spearman)`` correlation coefficients of preds vs. labels.

    Only the coefficient (element 0) of each scipy result is kept; the
    p-values are discarded.
    """
    pearson_coefficient = pearsonr(preds, labels)[0]
    spearman_coefficient = spearmanr(preds, labels)[0]
    return pearson_coefficient, spearman_coefficient

In [176]:
def lmbd(x):
    """Return the spaCy document vector for the text ``x``."""
    return nlp(x).vector


def sentence_to_vector(df):
    """Add embedding columns ``s1`` and ``s2`` for ``sentence1`` and ``sentence2``.

    Each sentence is run through ``lmbd`` one at a time. The columns are
    written onto ``df`` itself (the caller's frame is modified in place) and
    the same frame is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with text columns ``sentence1`` and ``sentence2``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with ``s1`` and ``s2`` columns.
    """
    for text_column, vector_column in (('sentence1', 's1'), ('sentence2', 's2')):
        df[vector_column] = df[text_column].map(lmbd)
    return df
    
# Embed the sentences of every split, reporting progress after each one.
# sentence_to_vector writes the s1/s2 columns onto the frame it is given and
# returns that same frame, so the existing variables need no reassignment.
print('starting . . .')

for split_name, split_df in (
    ('train', train_dataset),
    ('validation', validation_dataset),
    ('test', test_dataset),
):
    sentence_to_vector(split_df)
    print(f'{split_name} done')

starting . . .
train done
validation done
test done


In [177]:
train_dataset.head()

Unnamed: 0,sentence1,sentence2,label,idx,s1,s2
0,A plane is taking off.,An air plane is taking off.,5.0,0,"[-0.64949167, 0.25910184, -0.2652065, 0.119720...","[-0.70603573, 0.31555298, -0.18804415, 0.14615..."
1,A man is playing a large flute.,A man is playing a flute.,3.8,1,"[-0.65641505, 0.21840176, -0.20840888, 0.11604...","[-0.6445843, 0.25401455, -0.27212444, 0.066589..."
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8,2,"[-0.639469, 0.0666718, -0.158637, 0.0064331084...","[-0.6415846, 0.21705894, -0.053920902, -0.0195..."
3,Three men are playing chess.,Two men are playing chess.,2.6,3,"[-0.6869133, 0.018628487, -0.25191548, -0.3019...","[-0.6869133, 0.018628487, -0.25191548, -0.3019..."
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4,"[-0.6520457, 0.2612074, -0.24688442, 0.0471011...","[-0.6473375, 0.25767523, -0.25158888, 0.083552..."


In [184]:
def fit_predict_and_metrics(train, validation, test, model_f):
    """Fit a regressor on sentence-pair embeddings and print correlation metrics.

    Each row's feature vector is its ``s1`` embedding followed by its ``s2``
    embedding; the target is the ``label`` column. The model is fitted on
    ``train`` only and then scored on the train, validation and test splits
    with ``calculate_metrics`` (Pearson and Spearman correlation).

    Parameters
    ----------
    train, validation, test : pandas.DataFrame
        Splits with ``s1``, ``s2`` and ``label`` columns. The vectors in
        ``s1``/``s2`` are assumed to be 1-D and of equal length within a column.
    model_f : callable
        Zero-argument factory returning an estimator with ``fit(X, y)`` and
        ``predict(X)`` (e.g. the ``LinearRegression`` or ``SVR`` class).

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    def get_y_and_X(df):
        # Stack each embedding column into an (n_rows, dim) matrix and join the
        # two matrices column-wise. This produces the same rows as concatenating
        # s1 and s2 row by row, without a Python-level iterrows loop.
        features = np.hstack([np.stack(df['s1'].tolist()), np.stack(df['s2'].tolist())])
        return df['label'].to_numpy(), features

    def report(split_name, y_true, X):
        # Predict, score against the gold labels, and print in the shared format.
        pearson, spearman = calculate_metrics(model.predict(X), y_true)
        print(f'{split_name} metrics:\n    Pearson: {pearson}\n   Spearman: {spearman}\n')

    y_train, X_train = get_y_and_X(train)
    model = model_f()
    model = model.fit(X_train, y_train)

    # Features for validation/test are built only when their turn comes, so
    # earlier splits are still reported if a later split is malformed.
    report('Train', y_train, X_train)
    report('Validation', *get_y_and_X(validation))
    report('Test', *get_y_and_X(test))
    
# Supervised baseline: LinearRegression (constructed with default arguments) on the
# concatenated s1/s2 embeddings, fitted on train and scored on all three splits.
fit_predict_and_metrics(train_dataset, validation_dataset, test_dataset, LinearRegression)

Train metrics:
    Pearson: 0.4408617136534989
   Spearman: 0.42489968238155024

Validation metrics:
    Pearson: 0.11863867399613848
   Spearman: 0.11751661501175863

Test metrics:
    Pearson: 0.19445646782280276
   Spearman: 0.1931984648666329



In [185]:
# Same pipeline with a support vector regressor (SVR constructed with default arguments).
fit_predict_and_metrics(train_dataset, validation_dataset, test_dataset, SVR)

Train metrics:
    Pearson: 0.5848223801553067
   Spearman: 0.5759629014285772

Validation metrics:
    Pearson: 0.2577315339976817
   Spearman: 0.2404171661281564

Test metrics:
    Pearson: 0.3303659360480432
   Spearman: 0.3007688387322799



In [187]:
def calculate_similarities(df):
    """Return the cosine similarity between ``s1`` and ``s2`` for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with vector-valued columns ``s1`` and ``s2``.

    Returns
    -------
    numpy.ndarray
        One similarity per row, in row order.
    """
    # Pair the two columns positionally. Looking rows up with df['s1'][i] for
    # i in range(n) is label-based and only works when the index is exactly
    # 0..n-1; it fails or picks the wrong rows on a filtered or shuffled frame.
    return np.array([
        cosine_similarity(vec1, vec2)
        for vec1, vec2 in zip(df['s1'], df['s2'])
    ])

# Unsupervised baseline: score each sentence pair by the cosine similarity of its
# two embeddings, multiplied by 5 (the labels shown above go up to 5.0).
train_similarities = 5 * calculate_similarities(train_dataset)
validation_similarities = 5 * calculate_similarities(validation_dataset)
test_similarities = 5 * calculate_similarities(test_dataset)

# Correlate the scaled similarities with the gold labels of each split.
train_m2 = calculate_metrics(train_similarities, train_dataset['label'].to_numpy())
validation_m2 = calculate_metrics(validation_similarities, validation_dataset['label'].to_numpy())
test_m2 = calculate_metrics(test_similarities, test_dataset['label'].to_numpy())

for split_name, (pearson, spearman) in (
    ('Train', train_m2),
    ('Validation', validation_m2),
    ('Test', test_m2),
):
    print(f'{split_name} metrics:\n    Pearson: {pearson}\n   Spearman: {spearman}\n')


Train metrics:
    Pearson: 0.4597313293136578
   Spearman: 0.4617373438229865

Validation metrics:
    Pearson: 0.4779523328801048
   Spearman: 0.5401938415875591

Test metrics:
    Pearson: 0.36717146827816377
   Spearman: 0.388169865695701

