In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Show every column when displaying wide DataFrames (no column truncation).
pd.set_option("display.max_columns", None)

In [3]:
# Inputs: the MP q_sentence corpus and the per-manifesto party split info.
# NOTE(review): it is still open whether the full or the reduced df should be used here.
corpus_path = "data/english_annotated_full_df.csv"
split_path = "data/party_split.csv"

corpus_df = pd.read_csv(corpus_path)
split_df = pd.read_csv(split_path)

In [4]:
# Keep the original row position as a regular column ("original_index") so that
# q_sentences can still be located and compared after filtering and re-indexing.
corpus_df = corpus_df.rename_axis("original_index").reset_index()

In [5]:
# Some q_sentences are very long (they break the "at most one sentence" rule), which is a
# problem for RoBERTa's maximum input of 512 tokens (especially for the context model).
# Keep only q_sentences with fewer than 100 words; this drops 96 q_sentences in total.
is_short_enough = corpus_df["q_sentence_words"] < 100
corpus_df = corpus_df.loc[is_short_enough]

In [6]:
# Peek at the first two rows of the filtered corpus.
corpus_df.head(2)

Unnamed: 0,original_index,q_sentence,q_sentence_nr,codes,manifesto_id,party,date,language,handbook,title,coderid,countryname,partyname,pervote,q_sentence_chars,q_sentence_words,main_codes,RILE
0,0,Which party will make a real difference to my ...,1,,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,64,11,-1,0
1,1,Our aim: To make Britain the world's foremost ...,2,411.0,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,71,12,411,0


In [7]:
# Peek at the first two rows of the party split table.
split_df.head(2)

Unnamed: 0,manifesto_id,year,countryname,partyname,migration_positive,left_right,green
0,51421_199705,1997,United Kingdom,Liberal Democrats,,Center,No
1,51902_199705,1997,United Kingdom,Scottish National Party,,Left,No


In [8]:
# (rows, columns) of the party split table.
split_df.shape

(209, 7)

In [9]:
# Distribution of the left_right labels; dropna=False keeps missing labels visible in the counts.
split_df["left_right"].value_counts(dropna=False)

left_right
Left            80
Right           64
Center          22
Center-right    22
Center-left     15
NaN              4
Far-left         2
Name: count, dtype: int64

In [10]:
# Distribution of the green flag; dropna=False would also show missing values.
split_df["green"].value_counts(dropna=False)

green
No     189
Yes     20
Name: count, dtype: int64

In [11]:
# Manifestos without a left/right classification get the explicit label "Unknown".
split_df = split_df.assign(left_right=split_df["left_right"].fillna("Unknown"))

In [12]:
# Recode the party labels into integer classes.
# TODO: FIX THIS FOR Left vs. center+right and right vs. center+left
# Current coding: Far-left / Left / Center-left = 0,
#                 Center / Center-right / Right = 1 (note: Center is grouped with the right),
#                 Unknown = -1
left_right_dict = {"Far-left": 0,
                   "Left": 0,
                   "Center-left": 0,
                   "Center": 1,
                   "Center-right": 1,
                   "Right": 1,
                   "Unknown": -1}

# Green = 1
green_dict = {"Yes": 1,
              "No": 0}

# Series.map silently turns every label that is missing from the dict into NaN
# (e.g. a new category, or re-running this cell on the already recoded integers).
# Fail loudly instead of carrying NaN labels into the splits.
for column_name, code_map in (("left_right", left_right_dict), ("green", green_dict)):
    unmapped_labels = set(split_df[column_name].unique()) - set(code_map)
    if unmapped_labels:
        raise ValueError(
            f"Unexpected values in split_df['{column_name}']: {unmapped_labels}"
        )

split_df = split_df.assign(
    left_right=split_df["left_right"].map(left_right_dict),
    green=split_df["green"].map(green_dict),
)


In [13]:
# Check the recoded left_right / green columns.
split_df.head(2)

Unnamed: 0,manifesto_id,year,countryname,partyname,migration_positive,left_right,green
0,51421_199705,1997,United Kingdom,Liberal Democrats,,1,0
1,51902_199705,1997,United Kingdom,Scottish National Party,,0,0


In [14]:
# Attach the party-level columns of split_df (left_right, green, ...) to every q_sentence.
# Columns present in both frames (countryname, partyname) receive the _x / _y suffixes.
# validate="many_to_one" makes pandas raise a MergeError if a manifesto_id occurs more
# than once in split_df, which would otherwise silently duplicate q_sentences.
corpus_df = corpus_df.merge(
    split_df,
    on="manifesto_id",
    how="left",
    validate="many_to_one",
)

In [15]:
# Binary label for the green code: 1 where main_codes equals 501, otherwise 0.
corpus_df["green_code"] = corpus_df["main_codes"].eq(501).astype("int64")

In [16]:
# Inspect the merged frame, including the new green_code column.
corpus_df.head(2)

Unnamed: 0,original_index,q_sentence,q_sentence_nr,codes,manifesto_id,party,date,language,handbook,title,coderid,countryname_x,partyname_x,pervote,q_sentence_chars,q_sentence_words,main_codes,RILE,year,countryname_y,partyname_y,migration_positive,left_right,green,green_code
0,0,Which party will make a real difference to my ...,1,,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,64,11,-1,0,1997,United Kingdom,Liberal Democrats,,1,0,0
1,1,Our aim: To make Britain the world's foremost ...,2,411.0,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,71,12,411,0,1997,United Kingdom,Liberal Democrats,,1,0,0


In [17]:
# Names of all parties flagged as green.
# Careful: two different parties are both named "Green Party".
is_green_row = corpus_df["green"] == 1
corpus_df.loc[is_green_row, "partyname_x"].unique()

array(['Green Party of England and Wales', 'Green Party',
       'Australian Greens', 'Green Party of Aotearoa New Zealand'],
      dtype=object)

## Left-Right Split:

### Creating training data based on left-right Parties

The training data uses only the center/right parties (as they account for more documents/q_sentences) and is split at the manifesto level into train and test sets (about 10% test).

The inference dataset contains the left parties; the manifestos with unknown left/right orientation are dropped.

In [14]:
# Number of q_sentences per left_right code.
corpus_df["left_right"].value_counts()

left_right
 1    118197
 0     82879
-1      1015
Name: count, dtype: int64

In [15]:
# Left parties are coded 0, center/right parties are coded 1.
# Rows coded -1 (the 4 unknown documents) match neither filter and are therefore removed.
left_df = corpus_df.loc[corpus_df["left_right"].eq(0)]
other_df = corpus_df.loc[corpus_df["left_right"].eq(1)]

In [16]:
# Columns kept for the model: text, position within the manifesto, manifesto id,
# the two label columns (main_codes, RILE) and the original row id.
relevant_cols = ["q_sentence", "q_sentence_nr", "manifesto_id", "main_codes", "RILE", "original_index"]
left_df, other_df = (frame[relevant_cols] for frame in (left_df, other_df))

In [17]:
# Split on manifesto level, so that all q_sentences of a manifesto land in the same set.
manifesto_ids = other_df["manifesto_id"].unique()
# fixed seed: keeps the shuffle reproducible (and ca. 10% of the q_sentences land in the test set)
np.random.seed(2)
np.random.shuffle(manifesto_ids)

n_test_manifestos = 10
test_manifesto_ids = manifesto_ids[:n_test_manifestos]  # this is 10 manifestos
train_manifesto_ids = manifesto_ids[n_test_manifestos:]  # this is 98 manifestos

in_train = other_df["manifesto_id"].isin(train_manifesto_ids)
in_test = other_df["manifesto_id"].isin(test_manifesto_ids)
train_df = other_df[in_train]
test_df = other_df[in_test]

In [18]:
# Report the split sizes; the last value is the test share as a fraction of train + test.
n_train = len(train_df)
n_test = len(test_df)
print("Number of q_sentences in the training set:", n_train)
print("Number of q_sentences in the test set:", n_test)
print("Percentage of the test set:", n_test/(n_test+n_train))

Number of q_sentences in the training set: 105626
Number of q_sentences in the test set: 12571
Percentage of the test set: 0.10635633730128514


In [19]:
# Order every frame by manifesto and by q_sentence position within the manifesto, then
# renumber the rows from 0. The frames should already be ordered; this only makes sure.
sort_keys = ["manifesto_id", "q_sentence_nr"]
left_df = left_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
train_df = train_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)

In [221]:
# Save the splits as csv so that they can be loaded into huggingface.
# to_csv does not create missing folders, so make sure the target directory exists
# (otherwise a run on a fresh checkout fails at this cell).
left_right_out_dir = Path("data/model_splits/left_right_split/l_r_split_full_df")
left_right_out_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(left_right_out_dir / "train_other.csv", index=False)
test_df.to_csv(left_right_out_dir / "test_other.csv", index=False)
left_df.to_csv(left_right_out_dir / "validation_inference_left.csv", index=False)

## Green Split
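Creating training data from one party group and an inference set from the other, in both directions: green parties as training data (non-green parties as inference) and non-green parties as training data (green parties as inference).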

In [18]:
# Number of q_sentences per green flag value.
corpus_df["green"].value_counts()

green
0    178827
1     23264
Name: count, dtype: int64

In [19]:
# Split the corpus by the party-level green flag (1 = green party, 0 = any other party).
# NOTE: other_df is re-bound here; in the left-right section it held the center/right parties.
is_green_party = corpus_df["green"] == 1
is_other_party = corpus_df["green"] == 0
green_df = corpus_df.loc[is_green_party]
other_df = corpus_df.loc[is_other_party]

In [20]:
# Columns kept for the model: text, position within the manifesto, manifesto id,
# the two label columns (main_codes, green_code) and the original row id.
relevant_cols = ["q_sentence", "q_sentence_nr", "manifesto_id", "main_codes", "green_code", "original_index"]
green_df, other_df = (frame[relevant_cols] for frame in (green_df, other_df))

In [46]:
# green as train
# train: 70% Green
# validation: 15% Green
# test: 15% Green
# inference: 100% non-Green

# split on manifestos, so that all q_sentences of a manifesto land in the same set:
manifesto_ids = green_df["manifesto_id"].unique()
# fixed seed: keeps the shuffle reproducible (and gives about 15% of the q_sentences
# each to the validation and test sets, see the printed shares below)
np.random.seed(13)
np.random.shuffle(manifesto_ids)

# number of shuffled manifestos that go into the validation and test sets
n_val_manifestos = 3
n_test_manifestos = 7
n_held_out = n_val_manifestos + n_test_manifestos

val_manifesto_ids = manifesto_ids[:n_val_manifestos]  # this is 3 manifestos
test_manifesto_ids = manifesto_ids[n_val_manifestos:n_held_out]  # 7 manifestos
train_manifesto_ids = manifesto_ids[n_held_out:]  # all remaining manifestos

train_df = green_df[green_df["manifesto_id"].isin(train_manifesto_ids)]
val_df = green_df[green_df["manifesto_id"].isin(val_manifesto_ids)]
test_df = green_df[green_df["manifesto_id"].isin(test_manifesto_ids)]
inference_df = other_df.copy()

# the printed shares are fractions of all green-party q_sentences
print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/green_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/green_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/green_df.shape[0])

# make sure they are sorted correctly (important for adding the context later on)
# and renumber the rows from 0
sort_keys = ["manifesto_id", "q_sentence_nr"]
train_df = train_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
val_df = val_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
inference_df = inference_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)

# and now save them as csv so that they can be loaded into huggingface;
# to_csv does not create missing folders, so make sure the target directory exists
green_train_out_dir = Path("data/model_splits/green_split/green_as_train")
green_train_out_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(green_train_out_dir / "train-00000-of-00001.csv", index=False)
val_df.to_csv(green_train_out_dir / "validation-00000-of-00001.csv", index=False)
test_df.to_csv(green_train_out_dir / "test-00000-of-00001.csv", index=False)
inference_df.to_csv(green_train_out_dir / "inference-00000-of-00001.csv", index=False)


Number of q_sentences in the training set: 16189
Number of q_sentences in the validation set: 3578
Number of q_sentences in the test set: 3497
Percentage of the train set: 0.6958820495185695
Percentage of the validation set: 0.15379986244841815
Percentage of the test set: 0.1503180880330124


In [52]:
# Share of q_sentences labelled with the green code (green_code == 1) in each set.
# The mean of the 0/1 column is that share; it is computed vectorised instead of with
# Python's built-in sum over the Series and does not raise on an empty set.
# NOTE: the printed values are fractions between 0 and 1, not percentages.
print("Percentage of green codes in train: ", train_df["green_code"].mean())
print("Percentage of green codes in validation: ", val_df["green_code"].mean())
print("Percentage of green codes in test: ", test_df["green_code"].mean())
print("Percentage of green codes in inference: ", inference_df["green_code"].mean())

Percentage of green codes in train:  0.14855766261041448
Percentage of green codes in validation:  0.1811067635550587
Percentage of green codes in test:  0.12896768658850444
Percentage of green codes in inference:  0.03909924116604316


In [64]:
# non-green as train
# train: 70% non-Green
# validation: 15% non-Green
# test: 15% non-Green
# inference: 100% Green

# split on manifestos, so that all q_sentences of a manifesto land in the same set:
manifesto_ids = other_df["manifesto_id"].unique()
# fixed seed: keeps the shuffle reproducible (and gives about 15% of the q_sentences
# each to the validation and test sets, see the printed shares below)
np.random.seed(0)
np.random.shuffle(manifesto_ids)

# number of shuffled manifestos that go into the validation and test sets
n_val_manifestos = 28
n_test_manifestos = 28
n_held_out = n_val_manifestos + n_test_manifestos

val_manifesto_ids = manifesto_ids[:n_val_manifestos]  # first 28 manifestos
test_manifesto_ids = manifesto_ids[n_val_manifestos:n_held_out]  # next 28 manifestos
train_manifesto_ids = manifesto_ids[n_held_out:]  # all remaining manifestos

train_df = other_df[other_df["manifesto_id"].isin(train_manifesto_ids)]
val_df = other_df[other_df["manifesto_id"].isin(val_manifesto_ids)]
test_df = other_df[other_df["manifesto_id"].isin(test_manifesto_ids)]
inference_df = green_df.copy()

# the printed shares are fractions of all non-green q_sentences
print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/other_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/other_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/other_df.shape[0])

# make sure they are sorted correctly (important for adding the context later on)
# and renumber the rows from 0
sort_keys = ["manifesto_id", "q_sentence_nr"]
train_df = train_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
val_df = val_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
inference_df = inference_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)

# and now save them as csv so that they can be loaded into huggingface;
# to_csv does not create missing folders, so make sure the target directory exists
non_green_train_out_dir = Path("data/model_splits/green_split/non_green_as_train")
non_green_train_out_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(non_green_train_out_dir / "train-00000-of-00001.csv", index=False)
val_df.to_csv(non_green_train_out_dir / "validation-00000-of-00001.csv", index=False)
test_df.to_csv(non_green_train_out_dir / "test-00000-of-00001.csv", index=False)
inference_df.to_csv(non_green_train_out_dir / "inference-00000-of-00001.csv", index=False)

Number of q_sentences in the training set: 123816
Number of q_sentences in the validation set: 26220
Number of q_sentences in the test set: 28791
Percentage of the train set: 0.6923786676508581
Percentage of the validation set: 0.14662215437266185
Percentage of the test set: 0.16099917797648006


In [65]:
# Share of q_sentences labelled with the green code (green_code == 1) in each set.
# The mean of the 0/1 column is that share; it is computed vectorised instead of with
# Python's built-in sum over the Series and does not raise on an empty set.
# NOTE: the printed values are fractions between 0 and 1, not percentages.
print("Percentage of green codes in train: ", train_df["green_code"].mean())
print("Percentage of green codes in validation: ", val_df["green_code"].mean())
print("Percentage of green codes in test: ", test_df["green_code"].mean())
print("Percentage of green codes in inference: ", inference_df["green_code"].mean())

Percentage of green codes in train:  0.03909026297085998
Percentage of green codes in validation:  0.037795575896262396
Percentage of green codes in test:  0.0403251015942482
Percentage of green codes in inference:  0.15061898211829436
