Prepare and analyze the cosine similarity results

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import save_npz, load_npz

In [55]:
# Sparse matrix that stores only the extreme pairwise cosine similarity values.
similarity_matrix_path = 'data/cos_sim_output_08_mpnet.npz'
cs_sparse = load_npz(similarity_matrix_path)

# Full annotated corpus: loaded as well so that a DataFrame can be built that shows,
# for every stored similarity score, both quasi-sentences and their information.
corpus_csv_path = "data/english_annotated_full_df.csv"
corpus_df_full = pd.read_csv(corpus_csv_path)

In [56]:
# Inspect the loaded matrix: its repr reports shape, dtype, number of stored entries and storage format.
cs_sparse

<202187x202187 sparse matrix of type '<class 'numpy.float32'>'
	with 379546 stored elements in Compressed Sparse Row format>

In [57]:
# Re-express the matrix in COO (coordinate) format, which exposes the parallel
# `row`, `col` and `data` arrays, and build a DataFrame with one row per stored entry.
cs_sparse = coo_matrix(cs_sparse)
cosine_sim_df = pd.DataFrame({
    'index_x': cs_sparse.row,             # row position of the stored entry
    'index_y': cs_sparse.col,             # column position of the stored entry
    'cosine_similarity': cs_sparse.data,  # the stored similarity value
})

In [58]:
# Preview the first two (row index, column index, similarity) records.
cosine_sim_df.head(2)

Unnamed: 0,index_x,index_y,cosine_similarity
0,7,9,0.851987
1,7,119,0.903366


In [59]:
# Restrict the corpus to the sentence text, its coding and the manifesto/party
# columns that are attached to every similarity pair later on.
corpus_columns = [
    'q_sentence', 'main_codes', 'coderid', 'manifesto_id', 'party',
    'date', 'title', 'countryname', 'partyname', 'RILE',
]
corpus_df = corpus_df_full[corpus_columns]

In [60]:
# Attach the corpus information for both quasi-sentences of every pair:
# `*_x` columns describe the sentence at `index_x`, `*_y` the one at `index_y`.
# Both indices are matched against the row index of `corpus_df`.
# NOTE(review): this assumes the corpus row index equals the matrix row/column
# position of each sentence — confirm against how the matrix was built.

# Always start from the three pair columns, so re-running this cell does not try
# to merge the corpus columns onto an already-merged frame.
pair_scores = cosine_sim_df[['index_x', 'index_y', 'cosine_similarity']]

# Suffix the corpus columns explicitly instead of relying on the automatic
# name-collision suffixes pandas would only add during the second merge.
# `validate` makes pandas raise if a corpus index value is duplicated, which
# would otherwise silently multiply pair rows.
cosine_sim_df = pd.merge(pair_scores, corpus_df.add_suffix('_x'),
                         left_on='index_x', right_index=True, how='left',
                         validate='many_to_one')
cosine_sim_df = pd.merge(cosine_sim_df, corpus_df.add_suffix('_y'),
                         left_on='index_y', right_index=True, how='left',
                         validate='many_to_one')

In [51]:
# Preview the first three pairs together with the corpus information of both sentences.
# NOTE(review): this cell's execution count is lower than the merge cell's — re-run it to refresh the saved output.
cosine_sim_df.head(3)

Unnamed: 0,index_x,index_y,cosine_similarity,q_sentence_x,manifesto_id_x,party_x,date_x,title_x,coderid_x,countryname_x,...,q_sentence_y,manifesto_id_y,party_y,date_y,title_y,coderid_y,countryname_y,partyname_y,main_codes_y,RILE_y
0,7,9,0.851987,Our first priority is to:,51421_199705,51421,199705,Make the Difference,102,United Kingdom,...,Key priorities are to:,51421_199705,51421,199705,Make the Difference,102,United Kingdom,Liberal Democrats,-1,0
1,7,119,0.903366,Our first priority is to:,51421_199705,51421,199705,Make the Difference,102,United Kingdom,...,Our priorities are to:,51421_199705,51421,199705,Make the Difference,102,United Kingdom,Liberal Democrats,-1,0
2,7,215,0.903366,Our first priority is to:,51421_199705,51421,199705,Make the Difference,102,United Kingdom,...,Our priorities are to:,51421_199705,51421,199705,Make the Difference,102,United Kingdom,Liberal Democrats,-1,0


In [64]:
# How many stored pairs have a negative cosine similarity?
# The cut-off used here is 0, so the check also covers every value below -0.8.
negative_pairs = cosine_sim_df["cosine_similarity"] < 0
cosine_sim_df[negative_pairs].shape
# --> none of them (the result has zero rows)!

(0, 23)

In [65]:
# Share of pairs whose two quasi-sentences were assigned different main codes.
codes_differ = cosine_sim_df['main_codes_x'].ne(cosine_sim_df['main_codes_y'])
int(codes_differ.sum()) / len(cosine_sim_df)
# --> about 24% of the highly similar pairs (>0.8) do not share a code; this needs a closer look.

0.23964684122609645