In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import save_npz, load_npz
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Render frames without truncating cell contents or hiding columns.
# Undo with pd.reset_option('display.max_colwidth') and
# pd.reset_option('display.max_columns').
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
## Setting up the similarity df

# Inputs: the sparse matrix with the extreme similarity values, and the full
# corpus df that supplies the q_sentence text and metadata for each pair member.
cs_sparse = load_npz('data/cos_sim_output_08_mpnet.npz')
corpus_df_full = pd.read_csv("data/english_annotated_full_df.csv")

# COO format exposes the stored entries as parallel row / col / data arrays,
# which become one (index_x, index_y, cosine_similarity) record per entry.
cs_sparse = coo_matrix(cs_sparse)
data = dict(
    index_x=cs_sparse.row,
    index_y=cs_sparse.col,
    cosine_similarity=cs_sparse.data,
)
cosine_sim_df = pd.DataFrame(data)

# Corpus columns to attach to each side of a pair.
corpus_df = corpus_df_full[[
    'q_sentence', "q_sentence_nr", 'main_codes', 'coderid', 'manifesto_id',
    'party', 'date', 'title', 'countryname', 'partyname', 'RILE',
]]

# Attach the corpus columns once per pair member by matching the matrix index
# against the corpus row index. The second merge collides on every corpus column
# name, so pandas' default suffixes produce the *_x / *_y column pairs.
# NOTE(review): assumes the matrix row/col indices are positions in the CSV's
# default row index -- confirm against the notebook that built the matrix.
cosine_sim_df = cosine_sim_df.merge(corpus_df, left_on='index_x', right_index=True, how='left')
cosine_sim_df = cosine_sim_df.merge(corpus_df, left_on='index_y', right_index=True, how='left')

In [None]:
# Show the first merged row to inspect the resulting columns.
cosine_sim_df.head(1)

In [None]:
# (number of sentence pairs, number of columns) after both merges.
cosine_sim_df.shape

In [None]:
# Numeric recoding of the party labels.
# LEFT = 1, CENTER = 0, RIGHT = 2, Unknown = -1 --> similar to RILE coding in the corpus_dfs
# NOTE(review): labels missing from this mapping are turned into NaN by
# Series.map; there is no "Far-right" entry -- confirm the data never uses it.
left_right_dict = {
    "Far-left": 1,
    "Left": 1,
    "Center-left": 1,
    "Center": 0,
    "Center-right": 2,
    "Right": 2,
    "Unknown": -1,
}

# Green = 1
green_dict = {"Yes": 1, "No": 0}

# Load the party split info, discard the unused migration_positive column and
# recode both label columns. Missing left_right labels count as "Unknown".
split_df = (
    pd.read_csv("data/party_split.csv")
    .drop(columns="migration_positive")
    .assign(
        left_right=lambda frame: frame["left_right"].fillna("Unknown").map(left_right_dict),
        green=lambda frame: frame["green"].map(green_dict),
    )
)

# Narrow lookup tables keyed by manifesto_id, one per attribute.
split_lr = split_df.loc[:, ["manifesto_id", "left_right"]]
split_green = split_df.loc[:, ["manifesto_id", "green"]]

In [None]:
# Join the party association onto the similarity df, once per attribute and
# once per side of the pair. Column order: left_right x/y first, then green x/y.
# NOTE(review): merge defaults to an inner join, so pairs whose manifesto_id has
# no row in the split table are dropped here -- confirm that is intended.
df = cosine_sim_df
for attribute, lookup in (("left_right", split_lr), ("green", split_green)):
    for side in ("x", "y"):
        df = (
            df.merge(lookup, left_on=f"manifesto_id_{side}", right_on="manifesto_id")
            .drop(columns="manifesto_id")
            .rename(columns={attribute: f"{attribute}_party_{side}"})
        )

# Get rid of columns that are not needed for browsing examples.
df = df.drop(columns=["index_x", "index_y", "party_x", "party_y", "date_x", "date_y"])

In [None]:
# Show the first row of the joined frame, including the new *_party_x / *_party_y columns.
df.head(1)

### Finding examples: Green party splits

1: Green party gets assigned 501, other party gets assigned 703

In [None]:
# Green-party sentence coded 501, the paired non-Green sentence coded 703.
# The Green party can sit on either side of a pair, so match both orientations.
green_on_x = df["green_party_x"].eq(1) & df["green_party_y"].eq(0)
green_on_y = df["green_party_x"].eq(0) & df["green_party_y"].eq(1)
codes_501_703 = df["main_codes_x"].eq(501) & df["main_codes_y"].eq(703)
codes_703_501 = df["main_codes_x"].eq(703) & df["main_codes_y"].eq(501)
tmp = df[(green_on_x & codes_501_703) | (green_on_y & codes_703_501)]

In [None]:
# (number of matching pairs, number of columns)
tmp.shape

In [None]:
# How many of these pairs had both sentences annotated by the same coder?
tmp.loc[tmp["coderid_x"].eq(tmp["coderid_y"])].shape

In [None]:
# Show a single matching pair; raise the argument to head() to see more.
tmp.head(1)

In [None]:
# First few pairs in which both sentences were annotated by the same coder.
tmp.loc[tmp["coderid_x"].eq(tmp["coderid_y"])].head()

opposite examples:

In [None]:
# Reverse assignment: Green-party sentence coded 703, the paired non-Green
# sentence coded 501. Both orientations of the pair are matched.
green_on_x = df["green_party_x"].eq(1) & df["green_party_y"].eq(0)
green_on_y = df["green_party_x"].eq(0) & df["green_party_y"].eq(1)
codes_703_501 = df["main_codes_x"].eq(703) & df["main_codes_y"].eq(501)
codes_501_703 = df["main_codes_x"].eq(501) & df["main_codes_y"].eq(703)
tmp = df[(green_on_x & codes_703_501) | (green_on_y & codes_501_703)]
tmp.shape

2: Green party gets 416, non-Green party gets 501

In [None]:
# Green-party sentence coded 416, the paired non-Green sentence coded 501.
# The Green party can sit on either side of a pair, so match both orientations.
green_on_x = df["green_party_x"].eq(1) & df["green_party_y"].eq(0)
green_on_y = df["green_party_x"].eq(0) & df["green_party_y"].eq(1)
codes_416_501 = df["main_codes_x"].eq(416) & df["main_codes_y"].eq(501)
codes_501_416 = df["main_codes_x"].eq(501) & df["main_codes_y"].eq(416)
tmp = df[(green_on_x & codes_416_501) | (green_on_y & codes_501_416)]

In [None]:
# (number of matching pairs, number of columns)
tmp.shape

In [None]:
# How many of these pairs had both sentences annotated by the same coder?
tmp.loc[tmp["coderid_x"].eq(tmp["coderid_y"])].shape

In [None]:
# Go through the instances: rows 11-40 when ordered from most to least similar.
(
    tmp
    .sort_values(by="cosine_similarity", ascending=False)
    .iloc[10:40]
)

opposite examples:

In [None]:
# Reverse assignment: Green-party sentence coded 501, the paired non-Green
# sentence coded 416. Both orientations of the pair are matched, and the result
# is ordered by ascending cosine similarity.
green_on_x = df["green_party_x"].eq(1) & df["green_party_y"].eq(0)
green_on_y = df["green_party_x"].eq(0) & df["green_party_y"].eq(1)
codes_501_416 = df["main_codes_x"].eq(501) & df["main_codes_y"].eq(416)
codes_416_501 = df["main_codes_x"].eq(416) & df["main_codes_y"].eq(501)
tmp = (
    df[(green_on_x & codes_501_416) | (green_on_y & codes_416_501)]
    .sort_values(by="cosine_similarity")
)

In [None]:
# First 30 rows of the current selection.
tmp.head(30)