This notebook combines the corpus/text data with the more general MPDS data and writes two CSV files: one containing all quasi-sentences, and one from which quasi-sentences with an `H` code or without any code have been removed.

In [1]:
import pandas as pd
import numpy as np
import math
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [25]:
# MPDS holds coderid etc. for all documents (party + date is the key).
mpds = pd.read_excel(io="data/MPDS2023a.xlsx")
# One of the prepared corpus datasets.
corpus_df_start = pd.read_excel(io="data/english_annotated_corpus.xlsx")

In [26]:
# Attach document-level info from MPDS to every corpus row. (date, party) is the
# join key; the left join keeps corpus rows that have no MPDS match.
mpds_subset = mpds[["party", "date", "coderid", "countryname", "partyname", "pervote"]]
corpus_df = corpus_df_start.merge(mpds_subset, on=["date", "party"], how="left")

# Length of each quasi-sentence in characters and in whitespace-separated words.
# Missing sentences are treated as empty text (length 0) and non-string cells are
# stringified first, so a NaN or numeric cell cannot raise a TypeError in len().
q_sentence_text = corpus_df["q_sentence"].fillna("").astype(str)
corpus_df["q_sentence_chars"] = q_sentence_text.str.len()
corpus_df["q_sentence_words"] = q_sentence_text.str.split().str.len()

# lets also create a column "main_codes" where the subcodes are changed back to their main code
# to do this, we basically just keep the first three characters of each code
def extract_main_code(x):
    """Return the main code of a (sub)code.

    For a string, the main code is its first three characters (shorter strings
    are returned whole). Any non-string value -- expected to be the NaN of a row
    without a code -- is handed back unchanged.
    """
    if not isinstance(x, str):
        return x
    return x[:3]
# Map every code to its main code; non-string values pass through unchanged.
corpus_df["main_codes"] = corpus_df["codes"].map(extract_main_code)

In [27]:
# Replace the non-numeric markers with numbers: 'H' becomes 0 and missing codes
# become -1. The result is assigned back to the column instead of calling
# replace/fillna with inplace=True on `corpus_df['main_codes']`: that selection
# can be a temporary object, in which case (e.g. with pandas Copy-on-Write) the
# in-place call never reaches corpus_df.
corpus_df['main_codes'] = corpus_df['main_codes'].replace('H', 0).fillna(-1)

# RILE categorization of the main codes: 1 = left, 2 = right, 0 = anything else.
left = [103, 105, 106, 107, 202, 403, 404, 406, 412, 413, 504, 506, 701]
right = [104, 201, 203, 305, 401, 402, 407, 414, 505, 601, 603, 605, 606]

# Lookup table code -> category; left entries are written last so that they win,
# mirroring a "left first, then right" membership check.
rile_by_code = {code: 2 for code in right}
rile_by_code.update({code: 1 for code in left})
corpus_df['RILE'] = corpus_df['main_codes'].apply(lambda code: rile_by_code.get(int(code), 0))

# corpus_df_full still contains everything, including the H (0) and NA (-1) rows.
corpus_df_full = corpus_df.copy()

# corpus_df drops the H and NA rows. The explicit .copy() turns the filtered
# selection into an independent frame, so adding a column to it below does not
# trigger pandas' SettingWithCopyWarning.
has_real_code = (corpus_df['main_codes'] != 0) & (corpus_df['main_codes'] != -1)
corpus_df = corpus_df[has_real_code].copy()

# Keep each row's index label from the full frame in 'full_index', then renumber
# from 0, so rows can still be matched with corpus_df_full.
corpus_df['full_index'] = corpus_df.index
corpus_df = corpus_df.reset_index(drop=True)

In [30]:
# Write both dataframes to CSV without the index column: the filtered frame
# first, then the full one.
for csv_path, frame in (
    ("data/english_annotated_df.csv", corpus_df),
    ("data/english_annotated_full_df.csv", corpus_df_full),
):
    frame.to_csv(csv_path, index=False)