In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [81]:
# Read the two inputs: the party-level split metadata (one row per manifesto
# in the preview) and the sentence-level manifesto corpus.
split_df = pd.read_csv("data/party_split.csv")
corpus_df = pd.read_csv("data/english_annotated_full_df.csv")

In [82]:
# Give parties without a left/right label the explicit category "Unknown"
# instead of NaN, so the missing case is visible as its own label.
split_df = split_df.assign(left_right=split_df["left_right"].fillna("Unknown"))

In [83]:
# Preview the first row to see the raw text labels in left_right and green.
split_df.head(n=1)

Unnamed: 0,manifesto_id,year,countryname,partyname,migration_positive,left_right,green
0,51421_199705,1997,United Kingdom,Liberal Democrats,,Center,No


In [84]:
# Recode the text labels into integer codes.
#
# left_right: left-leaning labels -> 0, centre and right-leaning labels -> 1,
#             "Unknown" -> -1
# green:      "Yes" -> 1, "No" -> 0
#
# NOTE(review): Series.map turns every label that is not a key of the dict
# into NaN (this would include e.g. a "Far-right" label, which has no entry).
# For the same reason, running this cell a second time on the already-recoded
# columns would turn every value into NaN.
left_right_dict = {
    **dict.fromkeys(["Far-left", "Left", "Center-left"], 0),
    **dict.fromkeys(["Centre", "Center", "Center-right", "Right"], 1),
    "Unknown": -1,
}
green_dict = {"Yes": 1, "No": 0}

split_df = split_df.assign(
    left_right=split_df["left_right"].map(left_right_dict),
    green=split_df["green"].map(green_dict),
)


In [85]:
# Preview the first row again to check the integer codes in left_right and green.
split_df.head(n=1)

Unnamed: 0,manifesto_id,year,countryname,partyname,migration_positive,left_right,green
0,51421_199705,1997,United Kingdom,Liberal Democrats,,1,0


### Creating training data based on left-right Parties

The training data uses only left-party manifestos, split at the manifesto level into train and test sets (10% test).

The inference dataset consists of all remaining manifestos (every party not coded as left).

In [86]:
# Attach the party-level columns of split_df (left_right, green, ...) to every
# sentence of the corpus, matching on manifesto_id.
#
# - how="left" keeps every corpus sentence; sentences whose manifesto_id has no
#   row in split_df get NaN in the party columns.
# - validate="many_to_one" makes pandas raise a MergeError if a manifesto_id
#   occurs more than once in split_df, which would otherwise silently
#   duplicate corpus sentences.
# - Columns present in both frames are suffixed (_x from corpus_df, _y from
#   split_df). Re-running this cell on the already-merged corpus_df would
#   therefore suffix left_right as well; reload corpus_df before re-running.
corpus_df = corpus_df.merge(
    split_df,
    on="manifesto_id",
    how="left",
    validate="many_to_one",
)

In [87]:
# Partition the corpus by the recoded left_right column: rows coded 0 go to
# left_df, everything else (code 1, code -1 and NaN) goes to other_df.
is_left = corpus_df["left_right"].eq(0)
left_df = corpus_df.loc[is_left]
other_df = corpus_df.loc[~is_left]

In [104]:
# Manifesto-level train/test split of the left-party data: each manifesto goes
# entirely to train or entirely to test, so sentences of one document never
# end up in both sets.
SPLIT_SEED = 42       # fixed seed -> the same split on every Restart & Run All
TEST_FRACTION = 0.10  # share of manifestos held out for testing

# permutation() returns a shuffled copy, so the array from unique() is not
# modified in place.
split_rng = np.random.default_rng(SPLIT_SEED)
manifesto_ids = split_rng.permutation(left_df["manifesto_id"].unique())

# Round up so that any non-empty set of manifestos yields at least one test
# manifesto.
n_test_manifestos = int(np.ceil(TEST_FRACTION * len(manifesto_ids)))

test_manifesto_ids = manifesto_ids[:n_test_manifestos]
train_manifesto_ids = manifesto_ids[n_test_manifestos:]

left_df_train = left_df[left_df["manifesto_id"].isin(train_manifesto_ids)]
left_df_test = left_df[left_df["manifesto_id"].isin(test_manifesto_ids)]

# Sanity checks: no manifesto is shared between the two sets, and every
# left-party sentence landed in exactly one of them.
assert set(train_manifesto_ids).isdisjoint(test_manifesto_ids)
assert len(left_df_train) + len(left_df_test) == len(left_df)

In [105]:
# Peek at the first five rows of the left-party training split.
left_df_train.head(n=5)

Unnamed: 0,q_sentence,q_sentence_nr,codes,manifesto_id,party,date,language,handbook,title,coderid,...,q_sentence_chars,q_sentence_words,main_codes,RILE,year,countryname_y,partyname_y,migration_positive,left_right,green
972,What Scotland needs now!,1,,51902_199705,51902,199705,english,3,Yes we can win the best for Scotland,275,...,24,4,-1,0,1997,United Kingdom,Scottish National Party,,0,0
973,“The SNP are proposing a fully-costed manifest...,2,,51902_199705,51902,199705,english,3,Yes we can win the best for Scotland,275,...,137,21,-1,0,1997,United Kingdom,Scottish National Party,,0,0
974,There is a better future for Scotland with Ind...,3,,51902_199705,51902,199705,english,3,Yes we can win the best for Scotland,275,...,153,25,-1,0,1997,United Kingdom,Scottish National Party,,0,0
975,”,4,,51902_199705,51902,199705,english,3,Yes we can win the best for Scotland,275,...,1,1,-1,0,1997,United Kingdom,Scottish National Party,,0,0
976,Scotland Needs BETTER HEALTH - new investment ...,5,504.0,51902_199705,51902,199705,english,3,Yes we can win the best for Scotland,275,...,167,27,504,1,1997,United Kingdom,Scottish National Party,,0,0
