In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Show every column when a DataFrame is displayed (None = no column limit)
pd.set_option("display.max_columns", None)

In [None]:
# Read the two input tables: the annotated MP q_sentences and the party split info
corpus_path = "data/english_annotated_full_df.csv" # full df or reduced df?
split_path = "data/party_split.csv"
corpus_df = pd.read_csv(corpus_path)
split_df = pd.read_csv(split_path)

In [None]:
# Keep the current row index as a regular column called "original_index" and renumber the rows.
# With this column we can always find and compare q_sentences later on.
corpus_df = corpus_df.rename_axis("original_index").reset_index()

In [None]:
# Some q_sentences are very long (they do not follow the rule of max. 1 sentence).
# That is a problem for the max token input of roBERTa of 512 (esp. for the context model),
# so we keep only the q_sentences with fewer than 100 words
# (this removes 96 q_sentences in total, so not very many).
is_short_enough = corpus_df["q_sentence_words"].lt(100)
corpus_df = corpus_df.loc[is_short_enough]

In [None]:
# Peek at the first two rows of the filtered corpus
corpus_df.head(2)

In [None]:
# Peek at the first two rows of the party split info
split_df.head(2)

In [None]:
# (number of rows, number of columns) of the party split info
split_df.shape

In [None]:
# Frequency of every left_right label; dropna=False also counts missing values
split_df["left_right"].value_counts(dropna=False)

In [None]:
# Frequency of every green label; dropna=False also counts missing values
split_df["green"].value_counts(dropna=False)

In [None]:
# Missing left_right values get the explicit label "Unknown" (all other columns stay untouched)
split_df = split_df.fillna({"left_right": "Unknown"})

In [None]:
# Recode both party-level columns to integers.
# left_right: LEFT = 1, CENTER = 0, RIGHT = 2, Unknown = -1 --> similar to RILE coding in the corpus_dfs
left_right_dict = {
    "Far-left": 1,
    "Left": 1,
    "Center-left": 1,
    "Center": 0,
    "Center-right": 2,
    "Right": 2,
    "Unknown": -1,
}

# green: Green = 1, not green = 0
green_dict = {"Yes": 1, "No": 0}

# NOTE(review): .map() turns every value that is not a key of the dict into NaN
# (there is e.g. no "Far-right" key) -- confirm that all labels in the data are covered.
split_df = split_df.assign(
    left_right=split_df["left_right"].map(left_right_dict),
    green=split_df["green"].map(green_dict),
)


In [None]:
# Peek at the first two rows after recoding
split_df.head(2)

In [None]:
# Check the recoding. dropna=False also shows a NaN row for labels that were not
# a key of left_right_dict (those are turned into NaN by .map() and would otherwise be hidden here).
split_df["left_right"].value_counts(dropna=False)

In [None]:
# Merge the party info (left_right, green, ...) to the corpus_df.
# how="left" keeps every q_sentence, also those without a match in split_df (their party columns are NaN).
# validate="many_to_one" makes pandas raise a MergeError if a manifesto_id occurs more than once in split_df,
# which would otherwise silently duplicate q_sentences.
corpus_df = corpus_df.merge(split_df, on="manifesto_id", how="left", validate="many_to_one")

In [None]:
# Add a binary label column for green codes (code 501 is 1, everything else -- including missing codes -- is 0).
# Vectorised comparison instead of a Python-level loop over the column.
corpus_df["green_code"] = corpus_df["main_codes"].eq(501).astype("int64")

In [None]:
# Peek at the first two rows after the merge and the new green_code column
corpus_df.head(2)

In [None]:
# Names of all parties flagged as green (green == 1).
# Careful, two different parties are named "Green Party".
is_green_party = corpus_df["green"].eq(1)
corpus_df.loc[is_green_party, "partyname_x"].unique()

## Left-Right Split:

In [None]:
# One subset per party position (1 = left, 2 = right, 0 = center).
# Rows with any other value (-1 = Unknown, or missing) land in none of the subsets;
# this removes the 4 "unknown" parties.
party_position = corpus_df["left_right"]
left_df = corpus_df.loc[party_position.eq(1)]
right_df = corpus_df.loc[party_position.eq(2)]
center_df = corpus_df.loc[party_position.eq(0)]

What are the most frequent codes in the different sets? How many R/L/N are there in RILE?

In [None]:
# Share of each main code among all left q_sentences, ten most frequent codes.
# .head(10) always selects by position, whereas [0:10] can be read as a label slice on a numeric index.
left_df["main_codes"].value_counts().div(left_df.shape[0]).head(10)

In [None]:
# Share of each RILE category among all left q_sentences
left_df["RILE"].value_counts().div(len(left_df))

In [None]:
# Share of each main code among all right q_sentences, ten most frequent codes.
# .head(10) always selects by position, whereas [0:10] can be read as a label slice on a numeric index.
right_df["main_codes"].value_counts().div(right_df.shape[0]).head(10)

In [None]:
# Share of each RILE category among all right q_sentences
right_df["RILE"].value_counts().div(len(right_df))

In [None]:
# Share of each main code among all center q_sentences, ten most frequent codes.
# .head(10) always selects by position, whereas [0:10] can be read as a label slice on a numeric index.
center_df["main_codes"].value_counts().div(center_df.shape[0]).head(10)

In [None]:
# Share of each RILE category among all center q_sentences
center_df["RILE"].value_counts().div(len(center_df))

In [None]:
# Columns that are kept in the data for the model (RILE is the label of the left-right split)
relevant_cols = ["q_sentence", "q_sentence_nr", "manifesto_id", "main_codes", "RILE", "original_index"]
left_df, right_df, center_df = (
    subset[relevant_cols] for subset in (left_df, right_df, center_df)
)

In [None]:
# left as train
# train: 70% Left
# validation: 15% Left
# test: 15% Left
# inference_right: 100% Right
# inference_center: 100% Center

# Split on whole manifestos, so no manifesto is shared between train, validation and test.
manifesto_ids = left_df["manifesto_id"].unique()
# The fixed seed keeps the shuffle reproducible. Seed and cut-offs below were presumably tuned together so that
# validation and test each get about 15% of the q_sentences -- check the percentages printed below.
np.random.seed(6)
np.random.shuffle(manifesto_ids)

# Number of manifestos (not q_sentences) that go to validation and test; all remaining manifestos go to train.
n_val_manifestos = 13
n_test_manifestos = 15
val_manifesto_ids = manifesto_ids[:n_val_manifestos]
test_manifesto_ids = manifesto_ids[n_val_manifestos:n_val_manifestos + n_test_manifestos]
train_manifesto_ids = manifesto_ids[n_val_manifestos + n_test_manifestos:]

# Build every set sorted by manifesto and sentence position (important for adding the context later on)
# and with a fresh 0..n-1 index.
sort_keys = ["manifesto_id", "q_sentence_nr"]
train_df = left_df[left_df["manifesto_id"].isin(train_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
val_df = left_df[left_df["manifesto_id"].isin(val_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
test_df = left_df[left_df["manifesto_id"].isin(test_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
inference_right_df = right_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
inference_center_df = center_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)

# every left q_sentence has to be in exactly one of the three sets
assert len(train_df) + len(val_df) + len(test_df) == len(left_df)

print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/left_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/left_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/left_df.shape[0])

# and now save them as csv so that they can be loaded into huggingface
# train_df.to_csv("data/model_splits/left_right_split/left_as_train/train-00000-of-00001.csv", index=False)
# val_df.to_csv("data/model_splits/left_right_split/left_as_train/validation-00000-of-00001.csv", index=False)
# test_df.to_csv("data/model_splits/left_right_split/left_as_train/test-00000-of-00001.csv", index=False)
# inference_right_df.to_csv("data/model_splits/left_right_split/left_as_train/inference_right-00000-of-00001.csv", index=False)
# inference_center_df.to_csv("data/model_splits/left_right_split/left_as_train/inference_center-00000-of-00001.csv", index=False)


In [None]:
# Number of distinct manifesto ids that were distributed over train/validation/test
# (compare with the cut-offs used for the split)
len(manifesto_ids)

In [None]:
# Share of each RILE category per set
train_rile_share = train_df["RILE"].value_counts().div(len(train_df))
val_rile_share = val_df["RILE"].value_counts().div(len(val_df))
test_rile_share = test_df["RILE"].value_counts().div(len(test_df))

print("RILE proportions in train:\n", train_rile_share)
print("\nRILE proportions in validation:\n", val_rile_share)
print("\nRILE proportions in test:\n", test_rile_share)

In [None]:
# Share of each RILE category in the two inference sets
right_rile_share = inference_right_df["RILE"].value_counts().div(len(inference_right_df))
center_rile_share = inference_center_df["RILE"].value_counts().div(len(inference_center_df))

print("\nRILE proportions in inference right:\n", right_rile_share)
print("\nRILE proportions in inference center:\n", center_rile_share)

In [None]:
# right as train
# train: 70% right
# validation: 15% right
# test: 15% right
# inference_left: 100% Left
# inference_center: 100% Center

# Split on whole manifestos, so no manifesto is shared between train, validation and test.
manifesto_ids = right_df["manifesto_id"].unique()
# The fixed seed keeps the shuffle reproducible. Seed and cut-offs below were presumably tuned together so that
# validation and test each get about 15% of the q_sentences -- check the percentages printed below.
np.random.seed(19)
np.random.shuffle(manifesto_ids)

# Number of manifestos (not q_sentences) that go to validation and test; all remaining manifestos go to train.
n_val_manifestos = 13
n_test_manifestos = 13
val_manifesto_ids = manifesto_ids[:n_val_manifestos]
test_manifesto_ids = manifesto_ids[n_val_manifestos:n_val_manifestos + n_test_manifestos]
train_manifesto_ids = manifesto_ids[n_val_manifestos + n_test_manifestos:]

# Build every set sorted by manifesto and sentence position (important for adding the context later on)
# and with a fresh 0..n-1 index.
sort_keys = ["manifesto_id", "q_sentence_nr"]
train_df = right_df[right_df["manifesto_id"].isin(train_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
val_df = right_df[right_df["manifesto_id"].isin(val_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
test_df = right_df[right_df["manifesto_id"].isin(test_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
inference_left_df = left_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)
inference_center_df = center_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)

# every right q_sentence has to be in exactly one of the three sets
assert len(train_df) + len(val_df) + len(test_df) == len(right_df)

print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/right_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/right_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/right_df.shape[0])

# and now save them as csv so that they can be loaded into huggingface
# train_df.to_csv("data/model_splits/left_right_split/right_as_train/train-00000-of-00001.csv", index=False)
# val_df.to_csv("data/model_splits/left_right_split/right_as_train/validation-00000-of-00001.csv", index=False)
# test_df.to_csv("data/model_splits/left_right_split/right_as_train/test-00000-of-00001.csv", index=False)
# inference_left_df.to_csv("data/model_splits/left_right_split/right_as_train/inference_left-00000-of-00001.csv", index=False)
# inference_center_df.to_csv("data/model_splits/left_right_split/right_as_train/inference_center-00000-of-00001.csv", index=False)


In [None]:
# Number of distinct manifesto ids that were distributed over train/validation/test
# (compare with the cut-offs used for the split)
len(manifesto_ids)

In [None]:
# Share of each RILE category per set
train_rile_share = train_df["RILE"].value_counts().div(len(train_df))
val_rile_share = val_df["RILE"].value_counts().div(len(val_df))
test_rile_share = test_df["RILE"].value_counts().div(len(test_df))

print("RILE proportions in train:\n", train_rile_share)
print("\nRILE proportions in validation:\n", val_rile_share)
print("\nRILE proportions in test:\n", test_rile_share)

In [None]:
# Share of each RILE category in the two inference sets
left_rile_share = inference_left_df["RILE"].value_counts().div(len(inference_left_df))
center_rile_share = inference_center_df["RILE"].value_counts().div(len(inference_center_df))

print("\nRILE proportions in inference left:\n", left_rile_share)
print("\nRILE proportions in inference center:\n", center_rile_share)

## Green Split
Creating training data based on the green parties (with the non-green parties as inference set) and based on the non-green parties (with the green parties as inference set)

In [None]:
# Number of q_sentences per green flag. dropna=False also counts q_sentences without a flag
# (no match in split_df during the left merge, or a green label that was not recoded).
corpus_df["green"].value_counts(dropna=False)

In [None]:
# q_sentences of green parties (green == 1) and of all other parties (green == 0);
# rows without a green flag land in neither subset
green_flag = corpus_df["green"]
green_df = corpus_df.loc[green_flag.eq(1)]
other_df = corpus_df.loc[green_flag.eq(0)]

In [None]:
# Columns that are kept in the data for the model (green_code is the label of the green split)
relevant_cols = ["q_sentence", "q_sentence_nr", "manifesto_id", "main_codes", "green_code", "original_index"]
green_df, other_df = (subset[relevant_cols] for subset in (green_df, other_df))

In [None]:
# green as train
# train: 70% Green
# validation: 15% Green
# test: 15% Green
# inference: 100% non-Green

# Split on whole manifestos, so no manifesto is shared between train, validation and test.
manifesto_ids = green_df["manifesto_id"].unique()
# The fixed seed keeps the shuffle reproducible. Seed and cut-offs below were presumably tuned together so that
# validation and test each get about 15% of the q_sentences -- check the percentages printed below.
np.random.seed(13)
np.random.shuffle(manifesto_ids)

# Number of manifestos (not q_sentences) that go to validation and test; all remaining manifestos go to train.
n_val_manifestos = 3
n_test_manifestos = 7
val_manifesto_ids = manifesto_ids[:n_val_manifestos]
test_manifesto_ids = manifesto_ids[n_val_manifestos:n_val_manifestos + n_test_manifestos]
train_manifesto_ids = manifesto_ids[n_val_manifestos + n_test_manifestos:]

# Build every set sorted by manifesto and sentence position (important for adding the context later on)
# and with a fresh 0..n-1 index.
sort_keys = ["manifesto_id", "q_sentence_nr"]
train_df = green_df[green_df["manifesto_id"].isin(train_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
val_df = green_df[green_df["manifesto_id"].isin(val_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
test_df = green_df[green_df["manifesto_id"].isin(test_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
inference_df = other_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)

# every green q_sentence has to be in exactly one of the three sets
assert len(train_df) + len(val_df) + len(test_df) == len(green_df)

print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/green_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/green_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/green_df.shape[0])

# and now save them as csv so that they can be loaded into huggingface
# train_df.to_csv("data/model_splits/green_split/green_as_train/train-00000-of-00001.csv", index=False)
# val_df.to_csv("data/model_splits/green_split/green_as_train/validation-00000-of-00001.csv", index=False)
# test_df.to_csv("data/model_splits/green_split/green_as_train/test-00000-of-00001.csv", index=False)
# inference_df.to_csv("data/model_splits/green_split/green_as_train/inference-00000-of-00001.csv", index=False)


In [None]:
# Number of distinct manifesto ids that were distributed over train/validation/test
# (compare with the cut-offs used for the split)
len(manifesto_ids)

In [None]:
# Share of q_sentences with the green code per set (a proportion between 0 and 1).
# green_code is 0/1, so its mean is the share; .mean() is vectorised and gives NaN instead of
# a ZeroDivisionError when a set is empty.
print("Percentage of green codes in train: ", train_df["green_code"].mean())
print("Percentage of green codes in validation: ", val_df["green_code"].mean())
print("Percentage of green codes in test: ", test_df["green_code"].mean())
print("Percentage of green codes in inference: ", inference_df["green_code"].mean())

In [None]:
# non-green as train
# train: 70% non-Green
# validation: 15% non-Green
# test: 15% non-Green
# inference: 100% Green

# Split on whole manifestos, so no manifesto is shared between train, validation and test.
manifesto_ids = other_df["manifesto_id"].unique()
# The fixed seed keeps the shuffle reproducible. Seed and cut-offs below were presumably tuned together so that
# validation and test each get about 15% of the q_sentences -- check the percentages printed below.
np.random.seed(0)
np.random.shuffle(manifesto_ids)

# Number of manifestos (not q_sentences) that go to validation and test; all remaining manifestos go to train.
n_val_manifestos = 28
n_test_manifestos = 28
val_manifesto_ids = manifesto_ids[:n_val_manifestos]
test_manifesto_ids = manifesto_ids[n_val_manifestos:n_val_manifestos + n_test_manifestos]
train_manifesto_ids = manifesto_ids[n_val_manifestos + n_test_manifestos:]

# Build every set sorted by manifesto and sentence position (important for adding the context later on)
# and with a fresh 0..n-1 index.
sort_keys = ["manifesto_id", "q_sentence_nr"]
train_df = other_df[other_df["manifesto_id"].isin(train_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
val_df = other_df[other_df["manifesto_id"].isin(val_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
test_df = other_df[other_df["manifesto_id"].isin(test_manifesto_ids)].sort_values(sort_keys, ascending=True).reset_index(drop=True)
inference_df = green_df.sort_values(sort_keys, ascending=True).reset_index(drop=True)

# every non-green q_sentence has to be in exactly one of the three sets
assert len(train_df) + len(val_df) + len(test_df) == len(other_df)

print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/other_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/other_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/other_df.shape[0])

# and now save them as csv so that they can be loaded into huggingface
# train_df.to_csv("data/model_splits/green_split/non_green_as_train/train-00000-of-00001.csv", index=False)
# val_df.to_csv("data/model_splits/green_split/non_green_as_train/validation-00000-of-00001.csv", index=False)
# test_df.to_csv("data/model_splits/green_split/non_green_as_train/test-00000-of-00001.csv", index=False)
# inference_df.to_csv("data/model_splits/green_split/non_green_as_train/inference-00000-of-00001.csv", index=False)

In [None]:
# Number of distinct manifesto ids that were distributed over train/validation/test
# (compare with the cut-offs used for the split)
len(manifesto_ids)

In [None]:
# Share of q_sentences with the green code per set (a proportion between 0 and 1).
# green_code is 0/1, so its mean is the share; .mean() is vectorised and gives NaN instead of
# a ZeroDivisionError when a set is empty.
print("Percentage of green codes in train: ", train_df["green_code"].mean())
print("Percentage of green codes in validation: ", val_df["green_code"].mean())
print("Percentage of green codes in test: ", test_df["green_code"].mean())
print("Percentage of green codes in inference: ", inference_df["green_code"].mean())