In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide pandas display settings: no truncation of rows or columns,
# and cell text cut off after 80 characters.
display_settings = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.max_colwidth': 80,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the annotated dataset; the path is relative to the working directory.
annotated_csv_path = "data/english_annotated_full_df.csv"
df = pd.read_csv(annotated_csv_path)

In [None]:
# Show the first row to see which columns the dataset provides.
df.head(1)

Question 1: How many manifestos and quasi-sentences are there in total?

In [None]:
# Every row of df is one quasi-sentence, so the row count is the sentence total.
print("Total number q_sentences:", len(df))
# nunique() matches the 'nunique' aggregation used in the per-country table and
# does not count a missing manifesto_id as a manifesto of its own.
print("Total number of manifestos:", df["manifesto_id"].nunique())


In [None]:
# Per country: number of distinct manifestos plus the earliest and latest date.
country_aggregations = {'manifesto_id': 'nunique', 'date': ['min', 'max']}
temp = df.groupby('countryname').agg(country_aggregations)
temp

In [None]:
# Number of distinct manifestos for every (country, party) combination.
manifesto_ids_by_party = df.groupby(['countryname', 'partyname'])['manifesto_id']
temp = manifesto_ids_by_party.nunique()

In [None]:
# Select a single country from the first index level to see its per-party counts.
temp.loc["Ireland"]

In [None]:
# How many coders in total, and how many manifestos per coder on average?
# nunique() ignores missing IDs, so rows without a coderid or manifesto_id do
# not inflate either count; each count is computed once and reused.
n_coders = df["coderid"].nunique()
n_manifestos = df["manifesto_id"].nunique()
print("Total coders:", n_coders)
print("Average number of manifestos per coder:", n_manifestos / n_coders)

Quasi-sentences that are far too long (100 words or more): how many are there? These will be removed.
Also: look at some examples.

In [None]:
# Split the data at the 100-word threshold: shorter quasi-sentences are kept in
# df_final, the rest go to df_temp ordered from longest to shortest.
word_counts = df["q_sentence_words"]
df_final = df.loc[word_counts < 100]
df_temp = df.loc[word_counts >= 100].sort_values(by="q_sentence_words", ascending=False)

In [None]:
# Filter rows where strings contain a "." that is not at the end.
# na=False treats missing q_sentence values as non-matching; without it the
# mask would contain NaN and boolean indexing would raise.
has_inner_period = df['q_sentence'].str.contains(r'\.(?!$)', na=False)
filtered_df = df[has_inner_period]

In [None]:
# This shows we can't just remove q-sentences that contain a "." in the middle.
# option_context widens the column only for this print and restores the
# previous setting afterwards, even if printing fails.
with pd.option_context('display.max_colwidth', 200):
    print(filtered_df["q_sentence"])

In [None]:
# Details of the longest quasi-sentence as an example
# (df_temp is sorted by word count in descending order, so it is the first row).
df_temp.head(1)

In [None]:
# Shortest of the over-long quasi-sentences (last row of the descending sort).
df_temp.tail(1)

In [None]:
# Which main codes were assigned to the over-long quasi-sentences, and how often?
df_temp["main_codes"].value_counts()

In [None]:
# Print the quasi-sentences with exactly 100 words in full. The column-width
# limit is lifted only inside this block and restored afterwards.
with pd.option_context('display.max_colwidth', None):
    print(df_temp.loc[df_temp["q_sentence_words"] == 100, "q_sentence"])

In [None]:
# Print the quasi-sentence(s) with exactly 1025 words in full. The column-width
# limit is lifted only inside this block and restored afterwards.
# NOTE(review): 1025 is presumably the maximum length in the data — confirm, or
# derive it from df_temp["q_sentence_words"].max() so it cannot go stale.
with pd.option_context('display.max_colwidth', None):
    print(df_temp.loc[df_temp["q_sentence_words"] == 1025, "q_sentence"])

In [None]:
# Summary of the removed quasi-sentences (df_temp holds those with >= 100 words).
print("Number of q_sentences with at least 100 words:", len(df_temp))
# Series.max() is vectorised and, unlike the builtin max(), does not raise when
# the selection is empty.
print("Length of the longest q_sentence:", df_temp["q_sentence_words"].max(), "words")


In [None]:
# Word count of the longest quasi-sentence (same value as printed in the cell above).
max(df_temp["q_sentence_words"])

Final dataset size and label shares:

In [None]:
# (rows, columns) of the dataset after removing quasi-sentences with 100+ words.
df_final.shape

In [None]:
# How large are the green and RILE (left / right / neutral) shares?
# The labels say "Percentage", so the shares are formatted as percentages
# instead of being printed as raw fractions between 0 and 1.
share_masks = {
    "green": df_final["main_codes"] == 501,
    "left": df_final["RILE"] == 1,
    "right": df_final["RILE"] == 2,
    "neutral": df_final["RILE"] == 0,
}
for label, mask in share_masks.items():
    # The mean of a boolean mask is the fraction of rows that match.
    print(f"Percentage of {label} quasi sentences: {mask.mean():.2%}")

In [None]:
# Absolute number of quasi-sentences per RILE value.
df_final["RILE"].value_counts()

From now on, `df_final` is used, which contains only the quasi-sentences with fewer than 100 words.

Which codes are the most and the least frequent?

In [None]:
# Show the first row of the filtered dataset as a reminder of the available columns.
df_final.head(1)

In [None]:
# Relative frequency of each main code, measured against all rows of df_final.
# TODO: create a top-5 / bottom-5 table from this.
code_counts = df_final["main_codes"].value_counts()
code_distribution = code_counts.div(len(df_final))
code_distribution

In [None]:
# Bar chart of the relative code frequencies, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 4))
code_distribution.plot(kind='bar', ax=ax)

# Axis labels and title
ax.set_xlabel('Codes')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Manifesto Project codes - 2023a Corpus')

# Render the figure
plt.show()

In [None]:
# Variance of the code shares across manifestos (is this really helpful?)
# Count the quasi-sentences per (manifesto_id, main_code) pair.
code_counts = df_final.groupby(['manifesto_id', 'main_codes']).size()

# A code that never occurs in a manifesto has a share of 0 there. Without these
# explicit zeros the variance would be computed only over the manifestos in
# which the code happens to appear, which biases it.
all_pairs = pd.MultiIndex.from_product(code_counts.index.levels, names=code_counts.index.names)
code_counts = code_counts.reindex(all_pairs, fill_value=0)

# Share of each main_code within each manifesto_id
# (the denominator is the number of rows of that manifesto in df_final).
proportions = code_counts / df_final.groupby('manifesto_id').size()

# Variance of these shares across manifesto_ids for each main_code
variance_per_main_code = proportions.groupby(level='main_codes').var()

variance_per_main_code

Is the criticism true that longer documents have longer quasi-sentences?

In [None]:
# Show the first row again to check the column names used in the cells below.
df_final.head(1)

In [None]:
# Per manifesto: total word count (sum) and average quasi-sentence length (mean).
# NOTE(review): this aggregates the unfiltered `df`, not `df_final` — confirm that
# the quasi-sentences with 100+ words are meant to be included here.
words_by_manifesto = df.groupby('manifesto_id')[['q_sentence_words']]
temp = words_by_manifesto.agg(['sum', 'mean'])

In [None]:
# Print a truncated view (at most 10 rows) of the per-manifesto sums and means.
# option_context restores the previous max_rows setting afterwards, even if
# printing fails.
with pd.option_context('display.max_rows', 10):
    print(temp["q_sentence_words"])

In [None]:
# Scatter plot of manifesto length against average quasi-sentence length.
total_words = temp[('q_sentence_words', 'sum')]
mean_sentence_words = temp[('q_sentence_words', 'mean')]

# Compute the correlation here so the title always reflects the plotted data
# instead of relying on hard-coded values that go stale when the data changes.
pearson_r = total_words.corr(mean_sentence_words)

plt.figure(figsize=(8, 5))  # Adjust the figure size as needed
plt.scatter(total_words, mean_sentence_words)

# Add labels and title
plt.xlabel('Total length of manifesto (in words)')
plt.ylabel('Average quasi-sentence length (in words)')
plt.title(
    'Does manifesto length impact average quasi-sentence length?\n'
    f'$r = {pearson_r:.2f}$\n$R^2 = {pearson_r ** 2:.2f}$'
)
plt.ylim(0, None)

# Show the plot
plt.grid(True)
plt.show()

In [None]:
# Correlation between total manifesto length and mean quasi-sentence length,
# together with its square.
manifesto_words = temp[('q_sentence_words', 'sum')]
mean_sentence_words = temp[('q_sentence_words', 'mean')]
corr_coef = manifesto_words.corr(mean_sentence_words)
print("r = ", corr_coef)
print("r^2 = ", corr_coef*corr_coef)

In [None]:
# Same correlation after dropping manifestos with more than 50,000 words in total.
within_limit = temp[('q_sentence_words', 'sum')] <= 50000
filtered_temp = temp.loc[within_limit]
filtered_temp[('q_sentence_words', 'sum')].corr(filtered_temp[('q_sentence_words', 'mean')])

In [None]:
# Trim the shortest and longest 5% of manifestos (by total word count) and
# recompute the correlation on what remains.
manifesto_words = temp[('q_sentence_words', 'sum')]
bottom_cutoff = manifesto_words.quantile(0.05)
top_cutoff = manifesto_words.quantile(0.95)

# between() is inclusive on both ends, i.e. bottom_cutoff <= value <= top_cutoff.
filtered_temp = temp.loc[manifesto_words.between(bottom_cutoff, top_cutoff)]

# Correlation of length and mean quasi-sentence length on the trimmed data
trimmed_words = filtered_temp[('q_sentence_words', 'sum')]
trimmed_means = filtered_temp[('q_sentence_words', 'mean')]
correlation_coefficient = trimmed_words.corr(trimmed_means)
correlation_coefficient

In [None]:
# What about a logarithmic relationship?
# Scatter plot of the log of the manifesto length against the average
# quasi-sentence length.
log_total_words = np.log(temp[('q_sentence_words', 'sum')])
mean_sentence_words = temp[('q_sentence_words', 'mean')]

# Compute the correlation here so the title always reflects the plotted data
# instead of relying on hard-coded values that go stale when the data changes.
pearson_r_log = log_total_words.corr(mean_sentence_words)

plt.figure(figsize=(8, 5))  # Adjust the figure size as needed
plt.scatter(log_total_words, mean_sentence_words)

# Add labels and title
plt.xlabel('Log of total length of manifesto (in words)')
plt.ylabel('Average quasi-sentence length (in words)')
plt.title(
    'Does manifesto length impact average quasi-sentence length?\n'
    f'$r = {pearson_r_log:.2f}$\n$R^2 = {pearson_r_log ** 2:.2f}$'
)
plt.ylim(0, None)

# Show the plot
plt.grid(True)
plt.show()

In [None]:
# Correlation between the log of the manifesto length and the mean
# quasi-sentence length, together with its square.
log_manifesto_words = np.log(temp[('q_sentence_words', 'sum')])
mean_sentence_words = temp[('q_sentence_words', 'mean')]
corr_coef_log = log_manifesto_words.corr(mean_sentence_words)
print("r = ", corr_coef_log)
print("r^2 = ", corr_coef_log*corr_coef_log)