In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr, spearmanr
import torch
from dm_functions_figqa import *
import dm_functions_figqa
import statistics
import pickle
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Datasets

In [6]:
# Read the four source tables into DataFrames (three CSV files and one Excel workbook).
# Paths are relative to the notebook's working directory.
# In the cells of this notebook only data_negsample_lem is selected for the run.
data_long = pd.read_csv('Datasets/Fig-QA Dataset Copy.csv')
data_lem = pd.read_csv('Datasets/figqa_lemmatised.csv')
data_negsample = pd.read_excel('Datasets/negation sentences.xlsx')
data_negsample_lem = pd.read_csv('Datasets/negsents_lem.csv')


In [None]:
# set dataset to use and dataset name
# Keep only the rows whose label is not -1. The index is reset afterwards so
# that row labels equal row positions again: the filter leaves gaps in the
# index, and with gaps a positional (.iloc) lookup driven by the labels that
# iterrows() yields would address a different row (or run past the end).
data = data_negsample_lem[data_negsample_lem.labels != -1].reset_index(drop=True)
# Suffix used in the result column names and in the output file name.
dataname = "negsents"

# Import embedding models

In [32]:
# Load the trained GloVe embeddings through import_embeddings (presumably provided
# by the dm_functions_figqa star-import — confirm).
# NOTE(review): matrix=False presumably selects plain vector embeddings, as opposed
# to the matrix=True used for the 'ms-word2dm-*' models — confirm in the helper.
glove = import_embeddings('Models/glove_trained.txt',matrix=False)

In [33]:
# Load the trained word2vec embeddings through import_embeddings (presumably
# provided by the dm_functions_figqa star-import — confirm).
# NOTE(review): matrix=False presumably selects plain vector embeddings — confirm.
w2v = import_embeddings('Models/word2vec_trained.txt',matrix=False)

In [3]:
# INFERSENT 

def import_infersent(version, data):
    """Load a pretrained InferSent encoder and build its vocabulary from ``data``.

    Parameters
    ----------
    version : int
        InferSent release to load. ``1`` is paired with the GloVe vector file,
        ``2`` with the fastText vector file.
    data : pandas.DataFrame
        Frame with ``startphrase``, ``ending1`` and ``ending2`` columns; the
        sentences in those columns are passed to ``build_vocab``.

    Returns
    -------
    InferSent
        The model with its weights loaded from ``Models/infersent<version>.pkl``
        and its vocabulary built.

    Raises
    ------
    ValueError
        If ``version`` is neither 1 nor 2. Previously such a value only failed
        later with an unrelated error because no vector path was ever assigned.
    """
    vector_paths = {
        1: 'Models/glove.840B.300d.txt',
        2: 'Models/fasttext-crawl-300d-2M.vec',
    }
    # Validate before touching the file system or importing the model code.
    if version not in vector_paths:
        raise ValueError(
            "Unsupported InferSent version {!r}; expected 1 or 2".format(version)
        )

    # Imported lazily: `models` is the InferSent project module and is only
    # needed when an InferSent encoder is actually requested.
    from models import InferSent

    model_version = version
    MODEL_PATH = "Models/infersent%s.pkl" % model_version
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
    Infersentmodel = InferSent(params_model)
    Infersentmodel.load_state_dict(torch.load(MODEL_PATH))

    Infersentmodel.set_w2v_path(vector_paths[version])

    # Load embeddings of K most frequent words
    # NOTE(review): the recorded cell output ("Vocab size : 100000" followed by
    # "Vocab size : 491") suggests the build_vocab call below replaces this
    # vocabulary rather than extending it — confirm whether update_vocab was
    # intended, or whether this call can be dropped.
    Infersentmodel.build_vocab_k_words(K=100000)

    # Vocabulary from every sentence that will be encoded later.
    sentences = []
    for column in ('startphrase', 'ending1', 'ending2'):
        sentences.extend(data.loc[:, column])
    Infersentmodel.build_vocab(sentences)
    return Infersentmodel
# Stand-alone load of the InferSent version 2 encoder with the fastText vectors.
# A stray "." that preceded this code was removed: on its own line it is a
# syntax error and prevents the whole cell from running.
# NOTE(review): this repeats the steps of import_infersent(2, data) except for
# the build_vocab_k_words call, and relies on `data` already being defined.

from models import InferSent
model_version = 2
MODEL_PATH = "Models/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
InfersentFasttext = InferSent(params_model)
InfersentFasttext.load_state_dict(torch.load(MODEL_PATH))

VEC_PATH = 'Models/fasttext-crawl-300d-2M.vec'
InfersentFasttext.set_w2v_path(VEC_PATH)

# Collect every sentence that will be encoded so the vocabulary covers them.
sentences1 = [sent for sent in data.loc[:,'startphrase']]
sentences2 = [sent for sent in data.loc[:,'ending1']]
sentences3 = [sent for sent in data.loc[:,'ending2']]

sentences = sentences1 + sentences2 + sentences3
InfersentFasttext.build_vocab(sentences)

In [7]:
# Sentence-transformer encoders, each loaded by its pretrained model name.
# Both are used by the SBERT scoring cells further down.
SBERTall = SentenceTransformer('all-MiniLM-L6-v2')
# Disabled third encoder, kept for reference:
#SBERTpara = SentenceTransformer('paraphrase-MiniLM-L6-v2')
SBERT2 = SentenceTransformer('all-mpnet-base-v2')

# Compose and score
Calculate the cosine similarity for each sentence pair.

In [4]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas and library
# warnings that could point at real problems — consider narrowing it to the
# specific categories that are known to be noise.
import warnings
warnings.filterwarnings(action='ignore')


In [5]:
def comptest(embeddings, sent_data, method, pos1="", pos2="", pos3="", op="", prep=""):
    """Score both candidate endings of every row and mark correct choices.

    For each row of ``sent_data`` the start phrase is compared with
    ``ending1`` and with ``ending2`` through ``compose_and_score``. The row is
    then classified by whether the ending named by ``labels`` got the higher
    score.

    The label is read from the row being iterated. The earlier version read it
    from the global ``sdata`` via ``.iloc`` with the row's index label, which
    ignored the ``sent_data`` argument and addressed the wrong row whenever the
    index was not a plain 0..n-1 range.

    Parameters
    ----------
    embeddings
        Passed through unchanged to ``compose_and_score``.
    sent_data : pandas.DataFrame
        Needs the columns ``startphrase``, ``ending1``, ``ending2`` and
        ``labels``.
    method
        Composition method, passed through to ``compose_and_score``.
    pos1, pos2, pos3, op, prep
        Unused; kept so existing calls keep working.

    Returns
    -------
    tuple of (list, list, list)
        ``cos_scores1``, ``cos_scores2`` and ``classifications``, one entry per
        row in iteration order. A score is ``None`` when ``compose_and_score``
        raised ``KeyError``. A classification is ``1`` (labelled ending scored
        strictly higher), ``0`` (it did not), ``""`` (label is neither 0 nor 1),
        ``None`` (a score is missing because of ``KeyError``) or ``"N/A"``
        (the two scores could not be compared).
    """
    cos_scores1 = []
    cos_scores2 = []
    classifications = []
    for _, row in sent_data.iterrows():
        score_missing = False
        pair_scores = []
        for ending_col in ('ending1', 'ending2'):
            try:
                # "[CLS]"+row['startphrase'/ending] for SBERT
                pair_score = compose_and_score(
                    embeddings, row['startphrase'], row[ending_col], method
                )
            except KeyError:
                pair_score = None
                score_missing = True
            pair_scores.append(pair_score)

        cos1, cos2 = pair_scores
        cos_scores1.append(cos1)
        cos_scores2.append(cos2)

        if score_missing:
            classifications.append(None)
            continue

        label = row['labels']
        try:
            if label == 0:
                classification = 1 if cos1 > cos2 else 0
            elif label == 1:
                classification = 1 if cos2 > cos1 else 0
            else:
                classification = ""
        except TypeError:
            # Scores that cannot be ordered (e.g. a None returned without error).
            classification = "N/A"
        classifications.append(classification)

    return cos_scores1, cos_scores2, classifications

#def score(sentence_data, model):
#    data_dropna = sentence_data[[sentence_data['labels', 'cos1 {}'.format(model), 'cos2 {}'.format(model)]].dropna()

#    return spearman

def scoreauc(sentence_data, model):
    """Return the ROC AUC of a model's cosine similarities against aptness.

    Rows without a value in the ``'cosine similarity <model>'`` column are
    left out. The ``'Aptness'`` column becomes the target by replacing
    ``'apt'`` with 1 and ``'inapt'`` with 0; the cosine similarities are the
    scores handed to ``roc_auc_score``.
    """
    score_col = 'cosine similarity {}'.format(model)
    has_score = sentence_data[score_col].notna()
    scored_rows = sentence_data[has_score]
    targets = scored_rows['Aptness'].replace('apt', 1).replace('inapt', 0)
    return roc_auc_score(targets, scored_rows[score_col])


# Run tests

In [None]:
# sdata is bound to the same DataFrame object as data (no copy is made), so
# every result column added to sdata is also visible through data.
sdata = data # results will append to sdata
# Empty result containers; no cell shown in this notebook fills them.
correlations = {}
auc_scores = {}

In [13]:
models = (("sbertall", SBERTall),("sbert2", SBERT2)) #("sbertpara", SBERTpara),
# ("name for output", model object)

# For every encoder, compare the start phrase with each ending (and the two
# endings with each other), then record whether the labelled ending scored
# higher. Results are stored as new columns on sdata.
for model, MODEL in models:
    cos_scores1 = []
    cos_scores2 = []
    cos_scores_posneg = []
    classifications = []

    for idx, row in sdata.iterrows():
        # On failure the score is set to None explicitly, so the classification
        # below can never pick up the previous row's value.
        try:
            cos1 = cosine_similarity([MODEL.encode(row['startphrase']), MODEL.encode(row['ending1'])])[0,1]
        except KeyError:
            cos1 = None
        cos_scores1.append(cos1)

        try:
            cos2 = cosine_similarity([MODEL.encode(row['startphrase']), MODEL.encode(row['ending2'])])[0,1]
        except KeyError:
            cos2 = None
        cos_scores2.append(cos2)

        try:
            cos_posneg = cosine_similarity([MODEL.encode(row['ending1']), MODEL.encode(row['ending2'])])[0,1]
        except KeyError:
            cos_posneg = None
        cos_scores_posneg.append(cos_posneg)

        # Read the label from the row itself. iterrows() yields index labels,
        # which are not valid positions for .iloc once rows have been filtered.
        label = row['labels']
        try:
            if label == 0:
                classification = 1 if cos1 > cos2 else 0
            elif label == 1:
                classification = 1 if cos2 > cos1 else 0
            else:
                classification = ""
        except TypeError:
            # A missing (None) score cannot be compared.
            classification = "N/A"
        classifications.append(classification)

    column_tag = model+"_"+dataname
    sdata['cos1 {}'.format(column_tag)], sdata['cos2 {}'.format(column_tag)] = cos_scores1, cos_scores2
    sdata['cos_posneg {}'.format(column_tag)] = cos_scores_posneg
    sdata['bin {}'.format(column_tag)] = classifications

    # Accuracy over the rows that got a 0/1 classification; "" and "N/A"
    # entries are skipped because they cannot be averaged.
    scored = [c for c in classifications if c in (0, 1)]
    print(np.mean(scored) if scored else float('nan'))
    

KeyboardInterrupt: 

In [None]:
# Score the start phrase against both endings with the SBERT2 encoder and
# record whether the labelled ending scored higher.
model = "sbert2"

cos_scores1 = []
cos_scores2 = []
classifications = []
for idx, row in sdata.iterrows():
    # On failure the score is set to None explicitly, so the classification
    # below can never pick up the previous row's value.
    try:
        cos1 = cosine_similarity([SBERT2.encode(row['startphrase']), SBERT2.encode(row['ending1'])])[0,1]
    except KeyError:
        cos1 = None
    cos_scores1.append(cos1)

    try:
        cos2 = cosine_similarity([SBERT2.encode(row['startphrase']), SBERT2.encode(row['ending2'])])[0,1]
    except KeyError:
        cos2 = None
    cos_scores2.append(cos2)

    # Read the label from the row itself. iterrows() yields index labels,
    # which are not valid positions for .iloc once rows have been filtered.
    label = row['labels']
    try:
        if label == 0:
            classification = 1 if cos1 > cos2 else 0
        elif label == 1:
            classification = 1 if cos2 > cos1 else 0
        else:
            classification = ""
    except TypeError:
        # A missing (None) score cannot be compared.
        classification = "N/A"
    classifications.append(classification)

sdata['cos1 {}'.format(model+"_"+dataname)], sdata['cos2 {}'.format(model+"_"+dataname)] = cos_scores1, cos_scores2
sdata['bin {}'.format(model+"_"+dataname)] = classifications

In [12]:
# Score the start phrase against both endings with InferSent version 1
# (GloVe vectors) and record whether the labelled ending scored higher.
model = "infersent1"

InfersentGloVe = import_infersent(1, data)

cos_scores1 = []
cos_scores2 = []
classifications = []
for idx, row in sdata.iterrows():
    # On failure the score is set to None explicitly, so the classification
    # below can never pick up the previous row's value.
    try:
        cos1 = cosine_similarity(InfersentGloVe.encode([row['startphrase']], tokenize=False), InfersentGloVe.encode([row['ending1']], tokenize=False))[0,0]
    except KeyError:
        cos1 = None
    cos_scores1.append(cos1)

    try:
        cos2 = cosine_similarity(InfersentGloVe.encode([row['startphrase']], tokenize=False), InfersentGloVe.encode([row['ending2']], tokenize=False))[0,0]
    except KeyError:
        cos2 = None
    cos_scores2.append(cos2)

    # Read the label from the row itself. iterrows() yields index labels,
    # which are not valid positions for .iloc once rows have been filtered.
    label = row['labels']
    try:
        if label == 0:
            classification = 1 if cos1 > cos2 else 0
        elif label == 1:
            classification = 1 if cos2 > cos1 else 0
        else:
            classification = ""
    except TypeError:
        # A missing (None) score cannot be compared.
        classification = "N/A"
    classifications.append(classification)

sdata['cos1 {}'.format(model+"_"+dataname)], sdata['cos2 {}'.format(model+"_"+dataname)] = cos_scores1, cos_scores2
sdata['bin {}'.format(model+"_"+dataname)] = classifications

Vocab size : 100000
Found 491(/492) words with w2v vectors
Vocab size : 491


In [13]:
# Score the start phrase against both endings with InferSent version 2
# (fastText vectors) and record whether the labelled ending scored higher.
model = "infersent2"

InfersentFasttext = import_infersent(2, data)

cos_scores1 = []
cos_scores2 = []
classifications = []
for idx, row in sdata.iterrows():
    # On failure the score is set to None explicitly, so the classification
    # below can never pick up the previous row's value.
    try:
        cos1 = cosine_similarity(InfersentFasttext.encode([row['startphrase']], tokenize=False), InfersentFasttext.encode([row['ending1']], tokenize=False))[0,0]
    except KeyError:
        cos1 = None
    cos_scores1.append(cos1)

    try:
        cos2 = cosine_similarity(InfersentFasttext.encode([row['startphrase']], tokenize=False), InfersentFasttext.encode([row['ending2']], tokenize=False))[0,0]
    except KeyError:
        cos2 = None
    cos_scores2.append(cos2)

    # Read the label from the row itself. iterrows() yields index labels,
    # which are not valid positions for .iloc once rows have been filtered.
    label = row['labels']
    try:
        if label == 0:
            classification = 1 if cos1 > cos2 else 0
        elif label == 1:
            classification = 1 if cos2 > cos1 else 0
        else:
            classification = ""
    except TypeError:
        # A missing (None) score cannot be compared.
        classification = "N/A"
    classifications.append(classification)

sdata['cos1 {}'.format(model+"_"+dataname)], sdata['cos2 {}'.format(model+"_"+dataname)] = cos_scores1, cos_scores2
sdata['bin {}'.format(model+"_"+dataname)] = classifications

Vocab size : 100000
Found 491(/492) words with w2v vectors
Vocab size : 491


In [80]:
# Run every word-vector and dm model with both composition methods and store
# the two score columns and the classification column for each combination.

models = ["glove", "w2v", 'ms-word2dm-c5', 'ms-word2dm-c10', 'ms-word2dm-d5', 'ms-word2dm-d10']
methods = ["add", "mult"]
sdata = data
for model in models:
    # glove and w2v are already in memory; the remaining models are read from disk.
    if model == 'glove':
        embeddings = glove
    elif model == "w2v":
        embeddings = w2v
    else:
        embeddings = import_embeddings('trained_dms_txt/{}.txt'.format(model),matrix=True)

    for method in methods:
        column_tag = model+" "+method+"_"+dataname
        scores1, scores2, bins = comptest(embeddings, sdata, method)
        sdata['cos1 {}'.format(column_tag)] = scores1
        sdata['cos2 {}'.format(column_tag)] = scores2
        sdata['bin {}'.format(column_tag)] = bins
        

In [14]:
# Write sdata, including every result column added above, to a CSV file named
# after the dataset; the DataFrame index is not written.
sdata.to_csv("FIG-QA/test_embeddings_{}.csv".format(dataname), index=False)