In [203]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [204]:
# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option("display.max_columns", None)

In [205]:
# Load the two input tables: the MP q_sentence data and the party split info.
# TODO: open question from the original author - use the full df or the reduced df here?
corpus_df, split_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/english_annotated_full_df.csv", "data/party_split.csv")
)

In [206]:
# Store the current index in an explicit "original_index" column (and give the frame a
# fresh index), so that q_sentences can always be found and compared later on.
corpus_df = corpus_df.rename_axis("original_index").reset_index()

In [207]:
# Some q_sentences are very long (they do not follow the rule of max. 1 sentence).
# That is a problem for the 512-token input limit of roBERTa (esp. for the context model),
# so we keep only q_sentences with fewer than 100 words.
# Per the original note this removes 96 q_sentences in total, so not very many.
corpus_df = corpus_df.loc[corpus_df["q_sentence_words"].lt(100)]

In [208]:
# Peek at the first two rows of the corpus.
corpus_df.iloc[:2]

Unnamed: 0,original_index,q_sentence,q_sentence_nr,codes,manifesto_id,party,date,language,handbook,title,coderid,countryname,partyname,pervote,q_sentence_chars,q_sentence_words,main_codes,RILE
0,0,Which party will make a real difference to my ...,1,,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,64,11,-1,0
1,1,Our aim: To make Britain the world's foremost ...,2,411.0,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,71,12,411,0


In [209]:
# Peek at the first two rows of the party split table.
split_df.iloc[:2]

Unnamed: 0,manifesto_id,year,countryname,partyname,migration_positive,left_right,green
0,51421_199705,1997,United Kingdom,Liberal Democrats,,Center,No
1,51902_199705,1997,United Kingdom,Scottish National Party,,Left,No


In [210]:
# Parties without a left_right label get the explicit category "Unknown" instead of NA.
split_df = split_df.assign(left_right=split_df["left_right"].fillna("Unknown"))

In [211]:
def _warn_on_unmapped_labels(labels: pd.Series, mapping: dict, column: str) -> None:
    """Warn when `labels` holds non-missing values that `mapping` has no key for.

    `Series.map` converts such values to NaN without any message, so they would
    silently disappear from every later selection on the recoded column.
    Missing values in `labels` are ignored; they stay NaN after the mapping anyway.
    """
    present = labels.dropna()
    unmapped = present[~present.isin(list(mapping))].unique().tolist()
    if unmapped:
        warnings.warn(
            f"Column '{column}' contains labels without a recoding entry "
            f"(they become NaN): {unmapped}",
            stacklevel=2,
        )


# Recode the columns
# left_right: left-leaning labels = 0, center and right-leaning labels = 1, Unknown = -1
left_right_dict = {"Far-left": 0,
                   "Left": 0,
                   "Center-left": 0,
                   "Center": 1,
                   "Center-right": 1,
                   "Right": 1,
                   "Unknown": -1}
# NOTE(review): there is no entry for e.g. "Far-right"; such a label would be reported
# by the check below and recoded to NaN - confirm whether it occurs in the data.
_warn_on_unmapped_labels(split_df["left_right"], left_right_dict, "left_right")
split_df = split_df.assign(left_right=split_df["left_right"].map(left_right_dict))

# green: Yes = 1, No = 0
green_dict = {"Yes": 1,
              "No": 0}
_warn_on_unmapped_labels(split_df["green"], green_dict, "green")
split_df = split_df.assign(green=split_df["green"].map(green_dict))


In [212]:
# Check the recoded left_right / green columns on the first two rows.
split_df.iloc[:2]

Unnamed: 0,manifesto_id,year,countryname,partyname,migration_positive,left_right,green
0,51421_199705,1997,United Kingdom,Liberal Democrats,,1,0
1,51902_199705,1997,United Kingdom,Scottish National Party,,0,0


In [213]:
# Attach the party split information (left_right, green, ...) to every q_sentence.
# Columns present in both frames (countryname, partyname) get the _x / _y suffixes.
# validate="many_to_one" makes pandas raise a MergeError if a manifesto_id occurs more
# than once in split_df, which would otherwise silently duplicate q_sentences.
corpus_df = corpus_df.merge(split_df, on="manifesto_id", how="left", validate="many_to_one")

In [214]:
# Peek at the first two rows of the merged corpus.
corpus_df.iloc[:2]

Unnamed: 0,original_index,q_sentence,q_sentence_nr,codes,manifesto_id,party,date,language,handbook,title,coderid,countryname_x,partyname_x,pervote,q_sentence_chars,q_sentence_words,main_codes,RILE,year,countryname_y,partyname_y,migration_positive,left_right,green
0,0,Which party will make a real difference to my ...,1,,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,64,11,-1,0,1997,United Kingdom,Liberal Democrats,,1,0
1,1,Our aim: To make Britain the world's foremost ...,2,411.0,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,71,12,411,0,1997,United Kingdom,Liberal Democrats,,1,0


## Left-Right Split:

### Creating training data based on left-right Parties

Training data uses only the center/right parties (as they provide more documents/q_sentences). It is split on a manifesto level into train and test sets, with 10 manifestos (ca. 10% of the q_sentences) held out for testing.

Inference dataset: all q_sentences of the left parties (manifestos with an unknown left-right position are excluded).

In [215]:
# Number of q_sentences per left_right code (missing values are not counted).
corpus_df.loc[:, "left_right"].value_counts()

left_right
 1    118197
 0     82879
-1      1015
Name: count, dtype: int64

In [216]:
# Separate the corpus by left_right code: 0 (left) and 1 (center/right).
# Rows coded -1 (unknown; 4 documents according to the original note) land in neither frame.
left_df = corpus_df.loc[corpus_df["left_right"].eq(0)]
other_df = corpus_df.loc[corpus_df["left_right"].eq(1)]

In [217]:
# Columns that are kept for the model data; all other columns are dropped from both frames.
relevant_cols = ["q_sentence", "q_sentence_nr", "manifesto_id", "main_codes", "RILE", "original_index"]
left_df, other_df = (frame.loc[:, relevant_cols] for frame in (left_df, other_df))

In [218]:
# Split the center/right data on a manifesto level: shuffle the manifesto ids with a
# fixed seed and hold out the first 10 of them as the test set.
manifesto_ids = other_df["manifesto_id"].unique()
np.random.seed(2)  # keeps the split reproducible (and ca. 10% of the q_sentences land in the test set)
np.random.shuffle(manifesto_ids)  # shuffles the array in place

# per the original note: 10 test manifestos, 98 train manifestos
test_manifesto_ids, train_manifesto_ids = manifesto_ids[:10], manifesto_ids[10:]

train_df, test_df = (
    other_df.loc[other_df["manifesto_id"].isin(ids)]
    for ids in (train_manifesto_ids, test_manifesto_ids)
)

In [219]:
# Report the size of both splits and the share of q_sentences held out for testing.
n_train, n_test = len(train_df), len(test_df)
print("Number of q_sentences in the training set:", n_train)
print("Number of q_sentences in the test set:", n_test)
print("Percentage of the test set:", n_test / (n_test + n_train))

Number of q_sentences in the training set: 105626
Number of q_sentences in the test set: 12571
Percentage of the test set: 0.10635633730128514


In [220]:
# Order every frame by manifesto and by q_sentence number within the manifesto, then
# renumber the rows from 0. The data should already be in this order; this is a safeguard.
sort_keys = ["manifesto_id", "q_sentence_nr"]
left_df = left_df.sort_values(sort_keys).reset_index(drop=True)
train_df = train_df.sort_values(sort_keys).reset_index(drop=True)
test_df = test_df.sort_values(sort_keys).reset_index(drop=True)

In [221]:
# Save the three splits as csv so that they can be loaded into huggingface.
# DataFrame.to_csv does not create missing folders, so make sure the (nested) output
# directory exists before writing; an already existing directory is left untouched.
output_dir = Path("data/model_splits/left_right_split/l_r_split_full_df")
output_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(output_dir / "train_other.csv", index=False)
test_df.to_csv(output_dir / "test_other.csv", index=False)
left_df.to_csv(output_dir / "validation_inference_left.csv", index=False)