Prepare and analyze the cosine similarity results

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import save_npz, load_npz
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# What to do with this? Generate examples for all!

# Look at examples, what is our cos sim threshold?
# Remove where both are -1? and both are 0?
# Which codes appear most often?
# Which codes appear most often when agreement?
# Which codes appear most often in disagreement?
# Look at sentences that are exactly the same: how often is there disagreement?
# What changes if we limit to combinations by different coders/from different documents?
# in 0/-1 codes: how often do coders agree, that there is no meaning in this?

In [None]:
# Display options: show every column and the full, untruncated contents of each cell
# (otherwise pandas shortens the q_sentence text when a frame is displayed).
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
# Reverse with:
# pd.reset_option('display.max_colwidth')
# pd.reset_option('display.max_columns')

In [None]:
# Load the sparse matrix that holds the extreme cosine similarity values:
cs_sparse = load_npz('data/cos_sim_output_08_mpnet.npz')

# Also load the full corpus df, so that we can build a df with the q_sentences, their metadata and the similarity score of each stored pair
corpus_df_full = pd.read_csv("data/english_annotated_full_df.csv")

In [None]:
cs_sparse

In [None]:
# Flatten the sparse matrix into a long-format df with one row per stored entry:
cs_sparse = coo_matrix(cs_sparse)  # COO format exposes parallel .row / .col / .data arrays
data = dict(
    index_x=cs_sparse.row,
    index_y=cs_sparse.col,
    cosine_similarity=cs_sparse.data,
)
cosine_sim_df = pd.DataFrame(data)

In [None]:
cosine_sim_df.head(2)

In [None]:
cosine_sim_df.shape

In [None]:
# Restrict the corpus to the columns that get attached to each side of a sentence pair.
corpus_columns = [
    'q_sentence', "q_sentence_nr", 'main_codes', 'coderid', 'manifesto_id',
    'party', 'date', 'title', 'countryname', 'partyname', 'RILE',
]
corpus_df = corpus_df_full[corpus_columns]

In [None]:
# Attach the corpus columns for both quasi-sentences of every pair.
# index_x / index_y are looked up in the index of corpus_df; the columns for the
# first sentence get the suffix _x, those for the second sentence the suffix _y.
# The suffixes are added explicitly (instead of relying on pandas' automatic
# collision suffixes), and the merge always starts from the three pair columns,
# so re-running this cell does not pile up duplicated / doubly-suffixed columns.
pair_columns = ['index_x', 'index_y', 'cosine_similarity']
cosine_sim_df = (
    cosine_sim_df[pair_columns]
    .merge(corpus_df.add_suffix('_x'), left_on='index_x', right_index=True, how='left')
    .merge(corpus_df.add_suffix('_y'), left_on='index_y', right_index=True, how='left')
)

In [None]:
cosine_sim_df.head(3)

In [None]:
# How many pairs have a negative cosine similarity?
# (The matrix only stores extreme values, so a negative entry would presumably be one below -0.8 -- TODO confirm the threshold used in the calculation script.)
cosine_sim_df[cosine_sim_df["cosine_similarity"] < 0].shape
# --> none of them!

In [None]:
cosine_sim_df.shape

Looking at the lowest cosine similarities. Are they good enough?

In [None]:
# Inspect the least similar pairs that are still stored.
# Swap in the commented column list to see the assigned codes instead of the manifesto ids / sentence numbers.
# preview_columns = ["q_sentence_x", "main_codes_x", "q_sentence_y", "main_codes_y", "cosine_similarity"]
preview_columns = ["manifesto_id_x", "q_sentence_nr_x", "q_sentence_x", "q_sentence_y",
                   "manifesto_id_y", "q_sentence_nr_y", "cosine_similarity"]

lowest_first = cosine_sim_df.sort_values(by="cosine_similarity", ascending=True)
lowest_first[preview_columns].head(5)

Yes, they look very good! (We could even think about lowering the threshold in the calculation script)

General overview: how often do codes appear? How often in agreement / disagreement?
This is done here before removing certain combinations:

In [None]:
# Share of all pairs in which both quasi-sentences received the same code:
same_code = cosine_sim_df["main_codes_x"] == cosine_sim_df["main_codes_y"]
int(same_code.sum()) / len(cosine_sim_df)

In [None]:
# Per-code overview over ALL pairs (before any combinations are removed).
# For every code c:
#   total_counts        - pairs in which c appears on at least one side
#   agreement_counts    - pairs in which both sides are c
#   disagreement_counts - pairs in which exactly one side is c
#   perc_disagree       - disagreement_counts / total_counts
codes = np.union1d(cosine_sim_df["main_codes_x"].unique(), cosine_sim_df["main_codes_y"].unique())
codes_x = cosine_sim_df["main_codes_x"]
codes_y = cosine_sim_df["main_codes_y"]
n_pairs = cosine_sim_df.shape[0]

# Collect one record per code and build the frame once, instead of filling
# several dicts and merging them back together afterwards.
records = []
for c in codes:
    is_x = codes_x == c
    is_y = codes_y == c
    total = (is_x | is_y).sum()
    agree = (is_x & is_y).sum()
    disagree = (is_x ^ is_y).sum()  # exactly one side carries the code
    records.append({
        'code': c,
        'total_counts': total,
        # a code that never matches (e.g. a missing value) has total == 0
        'perc_disagree': disagree / total if total else np.nan,
        'agreement_counts': agree,
        'disagreement_counts': disagree,
    })

code_combinations_df_full = pd.DataFrame(
    records,
    columns=['code', 'total_counts', 'perc_disagree', 'agreement_counts', 'disagreement_counts'],
)

# add the percentage how often a code appears in the similar sentence combinations:
code_combinations_df_full["perc_total_occurance"] = code_combinations_df_full["total_counts"]/n_pairs

code_combinations_df_full = code_combinations_df_full.sort_values(by="perc_total_occurance", ascending=False)
code_combinations_df_full.head(3)

In [None]:
# Bar chart of the ten codes that appear in the largest share of similar pairs
# (uses the current row order of code_combinations_df_full).
ten_most_frequent = code_combinations_df_full.head(10)
top_ten_codes = ten_most_frequent["code"].astype(str)
top_ten_percentages = ten_most_frequent["perc_total_occurance"]

plt.bar(x=top_ten_codes, height=top_ten_percentages)

plt.xlabel('Code')
plt.ylabel('Frequency')
plt.title('Most occuring codes in similar quasi-sentences')

In [None]:
# Codes -1 and 0 mark a quasi-sentence without content/useful meaning.
# How often do both coders assign one of them, and how often does only one coder do so?
uncoded_x = cosine_sim_df['main_codes_x'].isin([-1, 0])
uncoded_y = cosine_sim_df['main_codes_y'].isin([-1, 0])

# -1 or 0 on at least one side of the pair:
count_a = int((uncoded_x | uncoded_y).sum())

# -1 or 0 on both sides:
count_b = int((uncoded_x & uncoded_y).sum())

# -1 or 0 on exactly one side:
count_c = int((uncoded_x ^ uncoded_y).sum())

# Share of these pairs in which the coders disagree about "no meaning":
print(count_c/count_a)

In [None]:
print(count_b/cosine_sim_df.shape[0])

In [None]:
# What is the percentage of all combinations?
print(count_c/cosine_sim_df.shape[0])

In [None]:
# Among pairs with at least one -1/0 code: share with only one such code vs. both.
only_one_share = count_c/count_a
x = ["Only one -1/0", "Both -1/0"]
y = [only_one_share, 1-only_one_share]

plt.bar(x, y)

plt.ylabel('Frequency')
plt.title('How often do both codes contain -1 or 0?')

# Write each bar's value (as a percentage) on top of it
for position, share in enumerate(y):
    plt.text(position, share, f'{share*100:.2f}%', ha='center', va='bottom')

plt.show()

--> In about 91% of the pairs where at least one coder assigned -1 or 0, both coders agree that the q_sentence has no meaning. In the remaining 9%, one of them sees a meaning!

Removing rows where coders agree on -1 or 0 (so that this q_sentence has no "meaning" per se):

This removes 251.679 combinations, or 66.3% of all combinations

In [None]:
# Remove every pair in which BOTH codes are -1 or 0 (any mix of the two):
# here the coders agree that there is nothing to code, likely just formatting etc.
both_uncoded = (
    cosine_sim_df['main_codes_x'].isin([-1, 0])
    & cosine_sim_df['main_codes_y'].isin([-1, 0])
)

# .copy() makes the result an independent frame, so that columns can be added
# to it later without pandas raising SettingWithCopyWarning.
cosine_sim_df = cosine_sim_df[~both_uncoded].copy()

cosine_sim_df.shape

How often do coders disagree on these similar sentences?

In [None]:
# Add a column that indicates if the two coders assigned the same code.
# .assign returns a new frame instead of writing into the filtered cosine_sim_df,
# which avoids pandas' SettingWithCopyWarning.
cosine_sim_df = cosine_sim_df.assign(
    in_agreement=cosine_sim_df['main_codes_x'] == cosine_sim_df['main_codes_y']
)

In [None]:
cosine_sim_df.head(1)

In [None]:
# How often are codes not equal?
# Series.sum() is vectorised; the builtin sum() would loop over every row in Python.
1 - int(cosine_sim_df["in_agreement"].sum())/cosine_sim_df.shape[0]

In [None]:
# First ten pairs with a cosine similarity above 0.975, with coder / manifesto info on both sides.
pair_view_columns = ["coderid_x", "manifesto_id_x", "q_sentence_nr_x", "q_sentence_x", "main_codes_x",
                     "q_sentence_y", "main_codes_y", "manifesto_id_y", "q_sentence_nr_y", "coderid_y"]
nearly_identical = cosine_sim_df["cosine_similarity"] > 0.975
cosine_sim_df.loc[nearly_identical, pair_view_columns].head(10)

How often do coders agree on the same codes? Does this change as the cosine similarity increases?

In [None]:
# Share of pairs with identical codes, per cosine-similarity bin.
plot_df = cosine_sim_df.copy()

# Bin edges for the cosine similarity (intervals are right-inclusive)
bins = [0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95, 0.975, 1.0]

# Assign each pair to its similarity bin.
# Floating-point rounding can yield similarities marginally above 1.0; pd.cut would
# silently drop those (NaN bin), so they are capped at 1.0 for the binning only.
plot_df['Bin'] = pd.cut(plot_df['cosine_similarity'].clip(upper=1.0), bins)

# Percentage of agreeing pairs in each bin.
# observed=False keeps empty bins on the axis and makes the groupby behaviour for
# categorical keys explicit instead of relying on the pandas default.
bin_percentages = plot_df.groupby('Bin', observed=False)['in_agreement'].mean() * 100

# Create a bar plot
bars = plt.bar(bin_percentages.index.astype(str), bin_percentages, color='skyblue', edgecolor='black')

# Add percentage labels above the bars
for bar, percentage in zip(bars, bin_percentages):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{percentage:.1f}%',
             ha='center', va='bottom', fontsize=10)

# Add labels and title
plt.xlabel('Cosine Similarity')
plt.ylabel('Percentage of combinations with the same code')
plt.title('How often do coders agree on the same codes?')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better visibility
# Add '%' sign to the y-axis labels via a formatter: unlike fixed tick label strings,
# it stays in sync with the tick positions and does not trigger a matplotlib warning.
plt.gca().yaxis.set_major_formatter('{x:.0f}%')

plt.show()

In [None]:
# Keep only the pairs whose two quasi-sentences come from different manifestos.
from_different_manifestos = cosine_sim_df["manifesto_id_x"] != cosine_sim_df["manifesto_id_y"]
plot_df = cosine_sim_df.copy()[from_different_manifestos]

In [None]:
# How does this change if we only take combinations of different documents?
# (So coders dont just pick the same code for the sentence they see again and again?)

# Remove combinations from the same document. .copy() makes plot_df an independent
# frame, so adding the 'Bin' column below does not raise SettingWithCopyWarning.
from_different_manifestos = cosine_sim_df["manifesto_id_x"] != cosine_sim_df["manifesto_id_y"]
plot_df = cosine_sim_df[from_different_manifestos].copy()
print(plot_df.shape)

# Bin edges for the cosine similarity (intervals are right-inclusive)
bins = [0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95, 0.975, 1.0]

# Assign each pair to its similarity bin.
# Floating-point rounding can yield similarities marginally above 1.0; pd.cut would
# silently drop those (NaN bin), so they are capped at 1.0 for the binning only.
plot_df['Bin'] = pd.cut(plot_df['cosine_similarity'].clip(upper=1.0), bins)

# Percentage of agreeing pairs in each bin.
# observed=False keeps empty bins on the axis and makes the groupby behaviour for
# categorical keys explicit instead of relying on the pandas default.
bin_percentages = plot_df.groupby('Bin', observed=False)['in_agreement'].mean() * 100

# Create a bar plot
bars = plt.bar(bin_percentages.index.astype(str), bin_percentages, color='skyblue', edgecolor='black')

# Add percentage labels above the bars
for bar, percentage in zip(bars, bin_percentages):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{percentage:.1f}%',
             ha='center', va='bottom', fontsize=10)

# Add labels and title
plt.xlabel('Cosine Similarity')
plt.ylabel('Percentage of codes in agreement')
plt.title('How often do coders agree on the same codes?\nOnly looking at combinations from different manifestos')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better visibility
# Add '%' sign to the y-axis labels via a formatter: unlike fixed tick label strings,
# it stays in sync with the tick positions and does not trigger a matplotlib warning.
plt.gca().yaxis.set_major_formatter('{x:.0f}%')

plt.show()

In [None]:
# Having a look at certain cosine similarity ranges (here 0.95 to 0.975, both bounds included):
range_view_columns = ["coderid_x", "manifesto_id_x", "q_sentence_nr_x", "q_sentence_x", "main_codes_x",
                      "q_sentence_y", "main_codes_y", "manifesto_id_y", "q_sentence_nr_y", "coderid_y"]
in_range = cosine_sim_df['cosine_similarity'].between(0.95, 0.975)
cosine_sim_df[in_range].head(10)[range_view_columns]

In [None]:
# Same look at the top range, 0.975 to 1.1 (both bounds included), including the similarity itself:
top_range_columns = ["cosine_similarity", "coderid_x", "manifesto_id_x", "q_sentence_nr_x", "q_sentence_x", "main_codes_x",
                     "q_sentence_y", "main_codes_y", "manifesto_id_y", "q_sentence_nr_y", "coderid_y"]
in_top_range = cosine_sim_df['cosine_similarity'].between(0.975, 1.1)
cosine_sim_df[in_top_range].head(10)[top_range_columns]

Which codes appear most often? Which most often in agreement? Which most often when not in agreement?

In [None]:
# Sorted array of every distinct code that occurs on either side of a pair
# (np.union1d returns the unique, sorted union of its two inputs).
codes = np.union1d(cosine_sim_df["main_codes_x"].unique(), cosine_sim_df["main_codes_y"].unique())

In [None]:
# Per-code counts over the current cosine_sim_df. For every code c:
#   total_counts               - pairs with c on at least one side
#   agreement_counts           - pairs with c on both sides
#   disagreement_counts        - pairs with c on exactly one side
#   domain_agreement_counts    - pairs containing c where both codes share c's domain
#   domain_disagreement_counts - pairs containing c where at least one code is from another domain
# "Domain" is the first character of the code's string representation.
total_counts = dict()
agreement_counts = dict()
disagreement_counts = dict()
domain_agreement_counts = dict()
domain_disagreement_counts = dict()

codes_x = cosine_sim_df['main_codes_x']
codes_y = cosine_sim_df['main_codes_y']

# The string conversion does not depend on c, so it is done once here
# instead of being repeated for every code inside the loop.
domain_str_x = codes_x.astype(str).str[0]
domain_str_y = codes_y.astype(str).str[0]

for c in codes:
    is_x = codes_x == c
    is_y = codes_y == c
    contains_c = is_x | is_y

    total_counts[c] = contains_c.sum()
    agreement_counts[c] = (is_x & is_y).sum()
    disagreement_counts[c] = (is_x ^ is_y).sum()  # exactly one side is c

    domain = str(c)[0]
    both_in_domain = (domain_str_x == domain) & (domain_str_y == domain)

    # one of the codes is c and both start with the same character as c:
    domain_agreement_counts[c] = int((contains_c & both_in_domain).sum())

    # one of the codes is c and one is from a different domain:
    domain_disagreement_counts[c] = int((contains_c & ~both_in_domain).sum())


In [None]:
# Per code: share of its pairs that are in disagreement (disagreement_count/total_count),
# and share of its pairs that cross a domain boundary (domain_disagreement_count/total_count).
perc_dict = {
    code: disagreement_counts[code]/total
    for code, total in total_counts.items()
}
domain_perc_dict = {
    code: domain_disagreement_counts[code]/total
    for code, total in total_counts.items()
}

In [None]:
# Getting the dicts into a df: one row per code, one column per dict.
# Every dict is keyed by code, so the columns are built by looking each code up
# directly; this replaces the chain of pairwise merges on 'code'.
code_list = list(total_counts)
column_sources = {
    'total_counts': total_counts,
    'perc_disagree': perc_dict,
    'agreement_counts': agreement_counts,
    'disagreement_counts': disagreement_counts,
    'domain_agreement_counts': domain_agreement_counts,
    'domain_disagreement_counts': domain_disagreement_counts,
    'domain_perc_disagree': domain_perc_dict,
}

code_combinations_df = pd.DataFrame({
    'code': code_list,
    **{column: [source[c] for c in code_list] for column, source in column_sources.items()},
})

# add the percentage how often a code appears in the similar sentence combinations:
code_combinations_df["perc_total_occurance"] = code_combinations_df["total_counts"]/cosine_sim_df.shape[0]

# most frequent codes first (re-assigned rather than sorted in place)
code_combinations_df = code_combinations_df.sort_values(by="perc_total_occurance", ascending=False)

code_combinations_df.head(3)

In [None]:
code_combinations_df[code_combinations_df["code"] == 702]

In [None]:
code_combinations_df[code_combinations_df["perc_disagree"] < 0.5]

In [None]:
#example for 408
#cosine_sim_df[(cosine_sim_df["main_codes_x"] == 408) & (cosine_sim_df["main_codes_y"] != 408)].head(20)

In [None]:
# Sanity check: do the numbers make sense?
#cosine_sim_df[(cosine_sim_df["main_codes_x"] == 702) & (cosine_sim_df["main_codes_y"] == 702)].shape
# Every combination contributes two codes to total_counts, except combinations in agreement, which are
# counted only once for their single code. So the sum below should equal
# (number of combinations * 2) - (sum of agreement_counts), which is computed in the following cell.
sum(code_combinations_df["total_counts"])

In [None]:
cosine_sim_df.shape[0]*2 - sum(code_combinations_df["agreement_counts"])

In [None]:
# Bar chart of the 20 codes that occur in the largest share of combinations.
code_combinations_df = code_combinations_df.sort_values(by="perc_total_occurance", ascending=False)

most_frequent = code_combinations_df.head(20)
top_ten_codes = most_frequent["code"].astype(str)
top_ten_percentages = most_frequent["perc_total_occurance"]

plt.bar(x=top_ten_codes, height=top_ten_percentages)

plt.xlabel('Codes')
plt.ylabel('Frequency')
plt.title('Frequency of combinations that contain this code (top 20)')
plt.xticks(rotation=45)
plt.show()

In [None]:
# The 20 most contentious codes: highest share of disagreement among the combinations containing the code.
code_combinations_df = code_combinations_df.sort_values(by="perc_disagree", ascending=False)

most_contentious = code_combinations_df.head(20)
top_ten_codes = most_contentious["code"].astype(str)
top_ten_percentages = most_contentious["perc_disagree"]
top_ten_total_counts = most_contentious["total_counts"]

bars = plt.bar(x=top_ten_codes, height=top_ten_percentages)

# Optional: write the total count of each code on top of its bar
# for bar, count in zip(bars, top_ten_total_counts):
#     plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(count),
#              ha='center', va='bottom', rotation = 45)

plt.xlabel('Code')
plt.ylabel('Frequency of code occurence where\ncombination is in disagreement')
plt.title('Most unreliable codes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# The 20 least contentious codes: lowest share of disagreement among the combinations containing the code.
code_combinations_df = code_combinations_df.sort_values(by="perc_disagree", ascending=True)

least_contentious = code_combinations_df.head(20)
top_ten_codes = least_contentious["code"].astype(str)
top_ten_percentages = least_contentious["perc_disagree"]
top_ten_total_counts = least_contentious["total_counts"]

bars = plt.bar(x=top_ten_codes, height=top_ten_percentages)

# Optional: write the total count of each code on top of its bar
# for bar, count in zip(bars, top_ten_total_counts):
#     plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(count),
#              ha='center', va='bottom', rotation = 45)

plt.xlabel('Code')
plt.ylabel('Frequency of code occurence where\ncombination is in disagreement')
plt.title('Most reliable codes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
code_combinations_df.head(10)

In [None]:
len(code_combinations_df["code"].unique())

What about getting at least the domain right? What codes actually switch domain often?

In [None]:
# The codes that most often appear together with a code from a different domain.
code_combinations_df = code_combinations_df.sort_values(by="domain_perc_disagree", ascending=False)

# number of codes to show
x = 20
most_switching = code_combinations_df.head(x)
top_ten_codes = most_switching["code"].astype(str)
top_ten_percentages = most_switching["domain_perc_disagree"]
top_ten_total_counts = most_switching["total_counts"]

bars = plt.bar(x=top_ten_codes, height=top_ten_percentages)

# Write the total count of each code on top of its bar
for bar, count in zip(bars, top_ten_total_counts):
    bar_centre = bar.get_x() + bar.get_width() / 2
    plt.text(bar_centre, bar.get_height(), str(count),
             ha='center', va='bottom', rotation = 45)

plt.xlabel('Codes')
plt.ylabel('Occurrence')
plt.title('Percentage of combinations that contain this code and a\ncode from a different domain (top 20)\nWith total occurence count on top of the bars')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# The codes that least often appear together with a code from a different domain.
code_combinations_df = code_combinations_df.sort_values(by="domain_perc_disagree", ascending=True)

least_switching = code_combinations_df.head(20)
top_ten_codes = least_switching["code"].astype(str)
top_ten_percentages = least_switching["domain_perc_disagree"]
top_ten_total_counts = least_switching["total_counts"]

bars = plt.bar(x=top_ten_codes, height=top_ten_percentages)

# Write the total count of each code on top of its bar
for bar, count in zip(bars, top_ten_total_counts):
    bar_centre = bar.get_x() + bar.get_width() / 2
    plt.text(bar_centre, bar.get_height(), str(count),
             ha='center', va='bottom', rotation = 45)

plt.xlabel('Codes')
plt.ylabel('Occurrence')
plt.title('Percentage of combinations that contain this code and a\ncode from a different domain (bottom 20)\nWith total occurence count on top of the bars')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
code_combinations_df.head(10)

Deeper look into sentences that are exactly the same:

In [None]:
# Pairs whose two quasi-sentences are exactly the same string:
identical_text = cosine_sim_df["q_sentence_x"] == cosine_sim_df["q_sentence_y"]
same_df = cosine_sim_df.loc[identical_text]
same_df.shape

In [None]:
# Among the textually identical pairs: share that also received the same code.
codes_match = same_df["main_codes_x"] == same_df["main_codes_y"]
int(codes_match.sum())/len(same_df)

In [None]:
same_df[["q_sentence_x", "main_codes_x", "q_sentence_y", "main_codes_y",
         "cosine_similarity"]].sort_values(by="cosine_similarity", ascending=True).head(5)

In [None]:
# Identical sentences that were nevertheless given different codes (showing the last 20):
codes_differ = same_df["main_codes_x"] != same_df["main_codes_y"]
same_tmp_df = same_df[codes_differ]
same_tmp_df[["q_sentence_x", "main_codes_x", "q_sentence_y", "main_codes_y",
             "cosine_similarity"]].tail(20)

In [None]:
test = cosine_sim_df[(cosine_sim_df["main_codes_x"] == 416) & (cosine_sim_df["main_codes_y"] == 501)]

In [None]:
test.head(50)

In [None]:
# testing domain confusion matrix:
# of all pairs with at least one code >= 700, which share has both codes >= 700?
in_domain_x = cosine_sim_df["main_codes_x"] >= 700
in_domain_y = cosine_sim_df["main_codes_y"] >= 700

both = int((in_domain_x & in_domain_y).sum())
# deliberately not called `all`: that name would shadow the Python builtin
# for every cell executed afterwards in this kernel
either = int((in_domain_x | in_domain_y).sum())
both/either

In [None]:
cosine_sim_df[(cosine_sim_df["main_codes_x"]>=700) | (cosine_sim_df["main_codes_y"]>=700)].shape[0]

In [None]:
cosine_sim_df.head(5)

In [None]:
(cosine_sim_df["main_codes_x"]/100).astype(int)

In [None]:
# Domain of each code = its hundreds digit. astype(int) truncates toward zero,
# so the codes -1 and 0 both end up in domain 0.
# .assign returns a new frame instead of writing into the filtered cosine_sim_df,
# which avoids pandas' SettingWithCopyWarning.
cosine_sim_df = cosine_sim_df.assign(
    domain_x=(cosine_sim_df["main_codes_x"]/100).astype(int),
    domain_y=(cosine_sim_df["main_codes_y"]/100).astype(int),
)

In [None]:
cosine_sim_df["domain_x"].value_counts()

In [None]:
# Cross-tabulate the domains of the two codes in each pair:
# rows = domain_x, columns = domain_y, cell = number of pairs with that domain combination.
confusion_matrix = pd.crosstab(cosine_sim_df['domain_x'], cosine_sim_df['domain_y'])

In [None]:
confusion_matrix