In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option("display.max_columns", None)

In [3]:
# Read the two inputs: the MP q_sentence corpus and the party split info.
# Open question kept from the original notebook: use the full df or the reduced df?
corpus_df, split_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/english_annotated_full_df.csv", "data/party_split.csv")
)

In [4]:
# Store the current row index in a column called "original_index" (inserted as the first column)
# and give the frame a fresh index, so that q_sentences can always be found and compared later on.
corpus_df = corpus_df.rename_axis("original_index").reset_index()

In [5]:
# Some q_sentences are very long (they do not follow the rule of max. 1 sentence).
# That is a problem for roBERTa's max token input of 512 (esp. for the context model),
# so only q_sentences with fewer than 100 words are kept.
# This removes 96 q_sentences in total, so not very many.
corpus_df = corpus_df.loc[corpus_df["q_sentence_words"].lt(100)]

In [6]:
# Show the first two rows of the corpus after adding original_index and dropping the long q_sentences.
corpus_df.head(2)

Unnamed: 0,original_index,q_sentence,q_sentence_nr,codes,manifesto_id,party,date,language,handbook,title,coderid,countryname,partyname,pervote,q_sentence_chars,q_sentence_words,main_codes,RILE
0,0,Which party will make a real difference to my ...,1,,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,64,11,-1,0
1,1,Our aim: To make Britain the world's foremost ...,2,411.0,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,71,12,411,0


In [7]:
# Show the first two rows of the party split table (one row per manifesto_id).
split_df.head(2)

Unnamed: 0,manifesto_id,year,countryname,partyname,migration_positive,left_right,green
0,51421_199705,1997,United Kingdom,Liberal Democrats,,Center,No
1,51902_199705,1997,United Kingdom,Scottish National Party,,Left,No


In [8]:
# (rows, columns) of the party split table.
split_df.shape

(209, 7)

In [9]:
# Distribution of the raw left_right labels; dropna=False also counts rows without a label.
split_df["left_right"].value_counts(dropna=False)

left_right
Left            80
Right           64
Center          22
Center-right    22
Center-left     15
NaN              4
Far-left         2
Name: count, dtype: int64

In [10]:
# Distribution of the raw green labels; dropna=False also counts rows without a label.
split_df["green"].value_counts(dropna=False)

green
No     189
Yes     20
Name: count, dtype: int64

In [11]:
# Parties without a left_right label get the explicit label "Unknown" instead of NaN.
split_df = split_df.assign(left_right=split_df["left_right"].fillna("Unknown"))

In [12]:
# Recode the party-level label columns to integers
# TODO: FIX THIS FOR Left vs. center+right and right vs. center+left
# LEFT = 1, CENTER = 0, RIGHT = 2, Unknown = -1 --> similar to RILE coding in the corpus_dfs
left_right_dict = {"Far-left": 1,
                   "Left": 1,
                   "Center-left": 1,
                   "Center": 0,
                   "Center-right": 2,
                   "Right": 2,
                   "Unknown": -1}

# Green = 1
green_dict = {"Yes": 1,
              "No": 0}

# Map first and assign afterwards: Series.map turns every value that is not a key of the dict
# into NaN. An unexpected label, or re-running this cell on the already recoded integers, would
# otherwise silently replace the labels with NaN (and those rows would vanish from all splits).
left_right_codes = split_df["left_right"].map(left_right_dict)
green_codes = split_df["green"].map(green_dict)
assert left_right_codes.notna().all(), \
    "left_right contains labels that are not in left_right_dict (or this cell was run twice)"
assert green_codes.notna().all(), \
    "green contains labels that are not in green_dict (or this cell was run twice)"

split_df = split_df.assign(left_right=left_right_codes, green=green_codes)


In [13]:
# Check the recoding: left_right and green should now hold integer codes.
split_df.head(2)

Unnamed: 0,manifesto_id,year,countryname,partyname,migration_positive,left_right,green
0,51421_199705,1997,United Kingdom,Liberal Democrats,,0,0
1,51902_199705,1997,United Kingdom,Scottish National Party,,1,0


In [14]:
# Number of manifestos per left_right code (1 = left, 2 = right, 0 = center, -1 = unknown).
# NOTE: value_counts() drops NaN by default, so unmapped labels would not be visible here.
split_df["left_right"].value_counts()

left_right
 1    97
 2    86
 0    22
-1     4
Name: count, dtype: int64

In [15]:
# Merge the party-level columns of split_df (left_right, green, ...) onto the q_sentences.
# how="left" keeps every q_sentence. validate="many_to_one" makes pandas raise a MergeError when a
# manifesto_id occurs more than once in split_df, which would otherwise silently duplicate q_sentences.
# Columns that exist in both frames (countryname, partyname) get the suffixes _x (corpus) and _y (split).
corpus_df = corpus_df.merge(split_df, on="manifesto_id", how="left", validate="many_to_one")

In [16]:
# Binary label column for the green code: 1 where main_codes equals 501, 0 for every other code.
corpus_df["green_code"] = corpus_df["main_codes"].eq(501).astype("int64")

In [17]:
# Check the merged frame: it now carries the split columns and green_code;
# countryname and partyname appear twice, with the merge suffixes _x and _y.
corpus_df.head(2)

Unnamed: 0,original_index,q_sentence,q_sentence_nr,codes,manifesto_id,party,date,language,handbook,title,coderid,countryname_x,partyname_x,pervote,q_sentence_chars,q_sentence_words,main_codes,RILE,year,countryname_y,partyname_y,migration_positive,left_right,green,green_code
0,0,Which party will make a real difference to my ...,1,,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,64,11,-1,0,1997,United Kingdom,Liberal Democrats,,0,0,0
1,1,Our aim: To make Britain the world's foremost ...,2,411.0,51421_199705,51421,199705,english,1,Make the Difference,102,United Kingdom,Liberal Democrats,16.758,71,12,411,0,1997,United Kingdom,Liberal Democrats,,0,0,0


In [18]:
# Names of all parties whose manifestos are flagged as green (green == 1).
corpus_df.loc[corpus_df["green"] == 1, "partyname_x"].unique() # careful, two different parties are named "Green Party"

array(['Green Party of England and Wales', 'Green Party',
       'Australian Greens', 'Green Party of Aotearoa New Zealand'],
      dtype=object)

## Left-Right Split:

In [19]:
# One frame per party position code (1 = left, 2 = right, 0 = center).
# q_sentences of the 4 "unknown" parties (code -1) end up in none of the three frames.
left_df = corpus_df.loc[corpus_df["left_right"].eq(1)]
right_df = corpus_df.loc[corpus_df["left_right"].eq(2)]
center_df = corpus_df.loc[corpus_df["left_right"].eq(0)]

Which codes are most frequent in each set? How are the right/left/neutral (R/L/N) labels in RILE distributed?

In [20]:
# Share of the left q_sentences per main code, for the ten most frequent codes.
left_df["main_codes"].value_counts().div(left_df.shape[0]).head(10)

main_codes
 504    0.111608
 501    0.072527
-1      0.068304
 503    0.065650
 411    0.064311
 0      0.061849
 506    0.053367
 701    0.038924
 416    0.031673
 403    0.031250
Name: count, dtype: float64

In [21]:
# Share of the left q_sentences per RILE class.
left_df["RILE"].value_counts().div(len(left_df))

RILE
0    0.556486
1    0.314989
2    0.128525
Name: count, dtype: float64

In [22]:
# Share of the right q_sentences per main code, for the ten most frequent codes.
right_df["main_codes"].value_counts().div(right_df.shape[0]).head(10)

main_codes
 504    0.084268
 411    0.071210
-1      0.063574
 0      0.059953
 605    0.057040
 703    0.044648
 506    0.043296
 503    0.040863
 410    0.038060
 402    0.035343
Name: count, dtype: float64

In [23]:
# Share of the right q_sentences per RILE class.
right_df["RILE"].value_counts().div(len(right_df))

RILE
0    0.494655
2    0.275035
1    0.230310
Name: count, dtype: float64

In [24]:
# Share of the center q_sentences per main code, for the ten most frequent codes.
center_df["main_codes"].value_counts().div(center_df.shape[0]).head(10)

main_codes
 504    0.116494
-1      0.095947
 411    0.060057
 506    0.051687
 0      0.050028
 501    0.049840
 605    0.043129
 503    0.040226
 303    0.037210
 301    0.033893
Name: count, dtype: float64

In [25]:
# Share of the center q_sentences per RILE class.
center_df["RILE"].value_counts().div(len(center_df))

RILE
0    0.551593
1    0.275137
2    0.173270
Name: count, dtype: float64

In [26]:
# Keep only the columns that the model data needs (RILE is the label of the left-right split).
relevant_cols = ["q_sentence", "q_sentence_nr", "manifesto_id", "main_codes", "RILE", "original_index"]
left_df, right_df, center_df = (
    frame[relevant_cols] for frame in (left_df, right_df, center_df)
)

In [27]:
# left as train
# train: 70% Left
# validation: 15% Left
# test: 15% Left
# inference_right: 100% Right
# inference_center: 100% Center
#
# NOTE: manifesto_ids, train_df, val_df and test_df are reassigned by every split cell of this
# notebook, so the statistics cells further down describe whichever split cell was run last.

# split on manifestos (a manifesto goes into exactly one set as a whole):
manifesto_ids = left_df["manifesto_id"].unique()
np.random.seed(6) # keep it reproducible (and so that ca. 15% of the q_sentences land in the validation and in the test set)
np.random.shuffle(manifesto_ids)

# select manifestos into the different sets (so that about 15% of q_sentences are in the validation and test sets, see below)
val_manifesto_ids = manifesto_ids[:13] # 13 manifestos
test_manifesto_ids = manifesto_ids[13:28] # 15 manifestos
train_manifesto_ids = manifesto_ids[28:] # all remaining manifestos (69 when there are 97 left manifestos)

train_df = left_df[left_df["manifesto_id"].isin(train_manifesto_ids)]
val_df = left_df[left_df["manifesto_id"].isin(val_manifesto_ids)]
test_df = left_df[left_df["manifesto_id"].isin(test_manifesto_ids)]
inference_right_df = right_df.copy()
inference_center_df = center_df.copy()

# every left q_sentence has to be in exactly one of the three sets
assert train_df.shape[0] + val_df.shape[0] + test_df.shape[0] == left_df.shape[0], \
    "train/validation/test do not add up to all left q_sentences"

print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/left_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/left_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/left_df.shape[0])

# make sure they are sorted correctly (important for adding the context later on)
# and reset the indices so that each set is numbered 0..n-1 in that order
train_df = train_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
val_df = val_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
inference_right_df = inference_right_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
inference_center_df = inference_center_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)

# and now save them as csv so that they can be loaded into huggingface
# (writing is disabled; uncomment the lines below to write the files)
# train_df.to_csv("data/model_splits/left_right_split/left_as_train/train-00000-of-00001.csv", index=False)
# val_df.to_csv("data/model_splits/left_right_split/left_as_train/validation-00000-of-00001.csv", index=False)
# test_df.to_csv("data/model_splits/left_right_split/left_as_train/test-00000-of-00001.csv", index=False)
# inference_right_df.to_csv("data/model_splits/left_right_split/left_as_train/inference_right-00000-of-00001.csv", index=False)
# inference_center_df.to_csv("data/model_splits/left_right_split/left_as_train/inference_center-00000-of-00001.csv", index=False)


Number of q_sentences in the training set: 58067
Number of q_sentences in the validation set: 12284
Number of q_sentences in the test set: 12528
Percentage of the train set: 0.7006238009628495
Percentage of the validation set: 0.14821607403564233
Percentage of the test set: 0.15116012500150822


In [28]:
# Number of manifestos in manifesto_ids (the left manifestos when the cells are run in order).
len(manifesto_ids)

97

In [29]:
# Share of each RILE class within the current train / validation / test sets.
print("RILE proportions in train:\n", train_df["RILE"].value_counts().div(len(train_df)))
print("\nRILE proportions in validation:\n", val_df["RILE"].value_counts().div(len(val_df)))
print("\nRILE proportions in test:\n", test_df["RILE"].value_counts().div(len(test_df)))

RILE proportions in train:
 RILE
0    0.545404
1    0.322524
2    0.132072
Name: count, dtype: float64

RILE proportions in validation:
 RILE
0    0.581895
1    0.291192
2    0.126913
Name: count, dtype: float64

RILE proportions in test:
 RILE
0    0.582934
1    0.303400
2    0.113665
Name: count, dtype: float64


In [30]:
# Share of each RILE class within the two inference sets (right and center).
print("\nRILE proportions in inference right:\n", inference_right_df["RILE"].value_counts().div(len(inference_right_df)))
print("\nRILE proportions in inference center:\n", inference_center_df["RILE"].value_counts().div(len(inference_center_df)))


RILE proportions in inference right:
 RILE
0    0.494655
2    0.275035
1    0.230310
Name: count, dtype: float64

RILE proportions in inference center:
 RILE
0    0.551593
1    0.275137
2    0.173270
Name: count, dtype: float64


In [31]:
# right as train
# train: 70% right
# validation: 15% right
# test: 15% right
# inference_left: 100% Left
# inference_center: 100% Center
#
# NOTE: manifesto_ids, train_df, val_df and test_df are reassigned by every split cell of this
# notebook, so the statistics cells further down describe whichever split cell was run last.

# split on manifestos (a manifesto goes into exactly one set as a whole):
manifesto_ids = right_df["manifesto_id"].unique()
np.random.seed(19) # keep it reproducible (and so that ca. 15% of the q_sentences land in the validation and in the test set)
np.random.shuffle(manifesto_ids)

# select manifestos into the different sets (so that about 15% of q_sentences are in the validation and test sets, see below)
val_manifesto_ids = manifesto_ids[:13] # 13 manifestos
test_manifesto_ids = manifesto_ids[13:26] # 13 manifestos
train_manifesto_ids = manifesto_ids[26:] # all remaining manifestos (60 when there are 86 right manifestos)

train_df = right_df[right_df["manifesto_id"].isin(train_manifesto_ids)]
val_df = right_df[right_df["manifesto_id"].isin(val_manifesto_ids)]
test_df = right_df[right_df["manifesto_id"].isin(test_manifesto_ids)]
inference_left_df = left_df.copy()
inference_center_df = center_df.copy()

# every right q_sentence has to be in exactly one of the three sets
assert train_df.shape[0] + val_df.shape[0] + test_df.shape[0] == right_df.shape[0], \
    "train/validation/test do not add up to all right q_sentences"

print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/right_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/right_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/right_df.shape[0])

# make sure they are sorted correctly (important for adding the context later on)
# and reset the indices so that each set is numbered 0..n-1 in that order
train_df = train_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
val_df = val_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
inference_left_df = inference_left_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
inference_center_df = inference_center_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)

# and now save them as csv so that they can be loaded into huggingface
# (writing is disabled; uncomment the lines below to write the files)
# train_df.to_csv("data/model_splits/left_right_split/right_as_train/train-00000-of-00001.csv", index=False)
# val_df.to_csv("data/model_splits/left_right_split/right_as_train/validation-00000-of-00001.csv", index=False)
# test_df.to_csv("data/model_splits/left_right_split/right_as_train/test-00000-of-00001.csv", index=False)
# inference_left_df.to_csv("data/model_splits/left_right_split/right_as_train/inference_left-00000-of-00001.csv", index=False)
# inference_center_df.to_csv("data/model_splits/left_right_split/right_as_train/inference_center-00000-of-00001.csv", index=False)


Number of q_sentences in the training set: 64260
Number of q_sentences in the validation set: 13335
Number of q_sentences in the test set: 14077
Percentage of the train set: 0.7009773976786805
Percentage of the validation set: 0.14546426389737324
Percentage of the test set: 0.15355833842394626


In [32]:
# Number of manifestos in manifesto_ids (the right manifestos when the cells are run in order).
len(manifesto_ids)

86

In [33]:
# Share of each RILE class within the current train / validation / test sets.
print("RILE proportions in train:\n", train_df["RILE"].value_counts().div(len(train_df)))
print("\nRILE proportions in validation:\n", val_df["RILE"].value_counts().div(len(val_df)))
print("\nRILE proportions in test:\n", test_df["RILE"].value_counts().div(len(test_df)))

RILE proportions in train:
 RILE
0    0.497915
2    0.279925
1    0.222160
Name: count, dtype: float64

RILE proportions in validation:
 RILE
0    0.455418
2    0.283165
1    0.261417
Name: count, dtype: float64

RILE proportions in test:
 RILE
0    0.516943
2    0.245010
1    0.238048
Name: count, dtype: float64


In [34]:
# Share of each RILE class within the two inference sets (left and center).
print("\nRILE proportions in inference left:\n", inference_left_df["RILE"].value_counts().div(len(inference_left_df)))
print("\nRILE proportions in inference center:\n", inference_center_df["RILE"].value_counts().div(len(inference_center_df)))


RILE proportions in inference left:
 RILE
0    0.556486
1    0.314989
2    0.128525
Name: count, dtype: float64

RILE proportions in inference center:
 RILE
0    0.551593
1    0.275137
2    0.173270
Name: count, dtype: float64


## Green Split
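Creating training data in both directions: once from the green parties (with the non-green parties as the inference set) and once from the non-green parties (with the green parties as the inference set).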

In [35]:
# Number of q_sentences from green (1) and non-green (0) manifestos.
# NOTE: value_counts() drops NaN by default, so q_sentences without a green label are not shown.
corpus_df["green"].value_counts()

green
0    178827
1     23264
Name: count, dtype: int64

In [36]:
# green == 1: q_sentences from manifestos flagged as green; green == 0: all other manifestos.
green_df = corpus_df.loc[corpus_df["green"].eq(1)]
other_df = corpus_df.loc[corpus_df["green"].eq(0)]

In [37]:
# Keep only the columns that the model data needs (green_code is the label of the green split).
relevant_cols = ["q_sentence", "q_sentence_nr", "manifesto_id", "main_codes", "green_code", "original_index"]
green_df = green_df.loc[:, relevant_cols]
other_df = other_df.loc[:, relevant_cols]

In [38]:
# green as train
# train: 70% Green
# validation: 15% Green
# test: 15% Green
# inference: 100% non-Green
#
# NOTE: manifesto_ids, train_df, val_df, test_df and inference_df are reassigned by other split
# cells of this notebook, so the statistics cells further down describe whichever split cell was run last.

# split on manifestos (a manifesto goes into exactly one set as a whole):
manifesto_ids = green_df["manifesto_id"].unique()
np.random.seed(13) # keep it reproducible (and so that ca. 15% of the q_sentences land in the validation and in the test set)
np.random.shuffle(manifesto_ids)

# select manifestos into the different sets (so that about 15% of q_sentences are in the validation and test sets, see below)
val_manifesto_ids = manifesto_ids[:3] # this is 3 manifestos
test_manifesto_ids = manifesto_ids[3:10] # 7 manifestos
train_manifesto_ids = manifesto_ids[10:] # all remaining manifestos (10 when there are 20 green manifestos)

train_df = green_df[green_df["manifesto_id"].isin(train_manifesto_ids)]
val_df = green_df[green_df["manifesto_id"].isin(val_manifesto_ids)]
test_df = green_df[green_df["manifesto_id"].isin(test_manifesto_ids)]
inference_df = other_df.copy()

# every green q_sentence has to be in exactly one of the three sets
assert train_df.shape[0] + val_df.shape[0] + test_df.shape[0] == green_df.shape[0], \
    "train/validation/test do not add up to all green q_sentences"

print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/green_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/green_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/green_df.shape[0])

# make sure they are sorted correctly (important for adding the context later on)
# and reset the indices so that each set is numbered 0..n-1 in that order
train_df = train_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
val_df = val_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
inference_df = inference_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)

# and now save them as csv so that they can be loaded into huggingface
# (writing is disabled; uncomment the lines below to write the files)
# train_df.to_csv("data/model_splits/green_split/green_as_train/train-00000-of-00001.csv", index=False)
# val_df.to_csv("data/model_splits/green_split/green_as_train/validation-00000-of-00001.csv", index=False)
# test_df.to_csv("data/model_splits/green_split/green_as_train/test-00000-of-00001.csv", index=False)
# inference_df.to_csv("data/model_splits/green_split/green_as_train/inference-00000-of-00001.csv", index=False)


Number of q_sentences in the training set: 16189
Number of q_sentences in the validation set: 3578
Number of q_sentences in the test set: 3497
Percentage of the train set: 0.6958820495185695
Percentage of the validation set: 0.15379986244841815
Percentage of the test set: 0.1503180880330124


In [39]:
# Number of manifestos in manifesto_ids (the green manifestos when the cells are run in order).
len(manifesto_ids)

20

In [40]:
# Share of q_sentences carrying the green code in each set (printed as a fraction between 0 and 1).
print("Percentage of green codes in train: ", train_df["green_code"].sum() / len(train_df))
print("Percentage of green codes in validation: ", val_df["green_code"].sum() / len(val_df))
print("Percentage of green codes in test: ", test_df["green_code"].sum() / len(test_df))
print("Percentage of green codes in inference: ", inference_df["green_code"].sum() / len(inference_df))

Percentage of green codes in train:  0.14855766261041448
Percentage of green codes in validation:  0.1811067635550587
Percentage of green codes in test:  0.12896768658850444
Percentage of green codes in inference:  0.03909924116604316


In [41]:
# non-green as train
# train: 70% non-Green
# validation: 15% non-Green
# test: 15% non-Green
# inference: 100% Green
#
# NOTE: manifesto_ids, train_df, val_df, test_df and inference_df are reassigned by other split
# cells of this notebook, so the statistics cells further down describe whichever split cell was run last.

# split on manifestos (a manifesto goes into exactly one set as a whole):
manifesto_ids = other_df["manifesto_id"].unique()
np.random.seed(0) # keep it reproducible (and so that ca. 15% of the q_sentences land in the validation and in the test set)
np.random.shuffle(manifesto_ids)

# select manifestos into the different sets (so that about 15% of q_sentences are in the validation and test sets, see below)
val_manifesto_ids = manifesto_ids[:28] # 28 manifestos
test_manifesto_ids = manifesto_ids[28:56] # 28 manifestos
train_manifesto_ids = manifesto_ids[56:] # all remaining manifestos (133 when there are 189 non-green manifestos)

train_df = other_df[other_df["manifesto_id"].isin(train_manifesto_ids)]
val_df = other_df[other_df["manifesto_id"].isin(val_manifesto_ids)]
test_df = other_df[other_df["manifesto_id"].isin(test_manifesto_ids)]
inference_df = green_df.copy()

# every non-green q_sentence has to be in exactly one of the three sets
assert train_df.shape[0] + val_df.shape[0] + test_df.shape[0] == other_df.shape[0], \
    "train/validation/test do not add up to all non-green q_sentences"

print("Number of q_sentences in the training set:", train_df.shape[0])
print("Number of q_sentences in the validation set:", val_df.shape[0])
print("Number of q_sentences in the test set:", test_df.shape[0])
print("Percentage of the train set:", train_df.shape[0]/other_df.shape[0])
print("Percentage of the validation set:", val_df.shape[0]/other_df.shape[0])
print("Percentage of the test set:", test_df.shape[0]/other_df.shape[0])

# make sure they are sorted correctly (important for adding the context later on)
# and reset the indices so that each set is numbered 0..n-1 in that order
train_df = train_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
val_df = val_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)
inference_df = inference_df.sort_values(["manifesto_id", "q_sentence_nr"], ascending=True).reset_index(drop=True)

# and now save them as csv so that they can be loaded into huggingface
# (writing is disabled; uncomment the lines below to write the files)
# train_df.to_csv("data/model_splits/green_split/non_green_as_train/train-00000-of-00001.csv", index=False)
# val_df.to_csv("data/model_splits/green_split/non_green_as_train/validation-00000-of-00001.csv", index=False)
# test_df.to_csv("data/model_splits/green_split/non_green_as_train/test-00000-of-00001.csv", index=False)
# inference_df.to_csv("data/model_splits/green_split/non_green_as_train/inference-00000-of-00001.csv", index=False)

Number of q_sentences in the training set: 123816
Number of q_sentences in the validation set: 26220
Number of q_sentences in the test set: 28791
Percentage of the train set: 0.6923786676508581
Percentage of the validation set: 0.14662215437266185
Percentage of the test set: 0.16099917797648006


In [42]:
# Number of manifestos in manifesto_ids (the non-green manifestos when the cells are run in order).
len(manifesto_ids)

189

In [43]:
# Share of q_sentences carrying the green code in each set (printed as a fraction between 0 and 1).
print("Percentage of green codes in train: ", train_df["green_code"].sum() / len(train_df))
print("Percentage of green codes in validation: ", val_df["green_code"].sum() / len(val_df))
print("Percentage of green codes in test: ", test_df["green_code"].sum() / len(test_df))
print("Percentage of green codes in inference: ", inference_df["green_code"].sum() / len(inference_df))

Percentage of green codes in train:  0.03909026297085998
Percentage of green codes in validation:  0.037795575896262396
Percentage of green codes in test:  0.0403251015942482
Percentage of green codes in inference:  0.15061898211829436
