In [38]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [39]:
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [40]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [41]:
from tqdm import tqdm

In [42]:
import re

In [43]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [44]:
vote_file = pd.read_csv('brexit_full_updated4.csv')
vote_file

Unnamed: 0.1,Unnamed: 0,Constituency ID,full name,PANO,Constituency Name,Party abbreviation,conShare,conShare2,conShare3,bill161,leave,party_old,ref,ref_dummy,party,old_party
0,0,E14000530,Gerald Howarth,7,Aldershot,Con,50.59,73.403947,73.403947,1.0,1.0,1.0,0.578978,1,1.0,1.0
1,1,E14000531,Wendy Morton,8,Aldridge-Brownhills,Con,52.05,69.940876,69.940876,1.0,0.0,1.0,0.677963,1,,0.0
2,2,E14000532,Graham Brady,9,Altrincham and Sale West,Con,52.99,66.503514,66.503514,1.0,1.0,1.0,0.385878,0,1.0,1.0
3,3,E14000533,Nigel Mills,11,Amber Valley,Con,43.98,55.840528,55.840528,1.0,1.0,1.0,0.652991,1,1.0,1.0
4,4,E14000534,Nick Herbert,18,Arundel and South Downs,Con,60.79,84.442284,84.442284,,0.0,1.0,0.497011,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,655,W07000076,Wayne David,114,Caerphilly,Lab,16.59,27.223499,27.223499,1.0,0.0,0.0,0.551360,1,0.0,
656,656,W07000077,Chris Evans,336,Islwyn,Lab,15.16,23.639482,23.639482,1.0,0.0,0.0,0.589399,1,0.0,
657,657,W07000078,Alun Cairns,589,Vale Of Glamorgan,Con,46.02,58.527280,58.527280,1.0,0.0,1.0,0.525507,1,0.0,0.0
658,658,W07000079,Kevin Brennan,129,Cardiff West,Lab,25.15,38.221884,38.221884,0.0,0.0,0.0,0.438226,0,0.0,


In [45]:
vote_file['party'].value_counts()

0.0    449
1.0    145
Name: party, dtype: int64

In [5]:
def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return t
        
    return topic

In [6]:
politicians_party = {}
politicians_ref = {}
politicians = vote_file['full name'].unique()
for index, row in vote_file.iterrows():
    politicians_party[row['full name']] = row['party']
    politicians_ref[row['full name']] = row['ref_dummy']

In [7]:
df1 = pd.read_csv('2015_commons.csv')
df2 = pd.read_csv('2016_commons.csv')
df = pd.concat([df1, df2])
df = df.drop(['Government'], axis=1)
df = df.loc[(df['Party'] == 'Conservative') | (df['Party'] == 'Labour')]
df = df.loc[df['Name'].isin(politicians)]
df['Date'] = pd.to_datetime(df['Date'])
mask = (df['Date'] > '2016-6-1') & (df['Date'] <= '2017-6-30')
df = df.loc[mask]
df['party2'] = df['Name'].apply(lambda x: politicians_party[x])
df['ref_dummy'] = df['Name'].apply(lambda x: politicians_ref[x])
df['Topic'] = df['Topic'].apply(lambda x: preprocess(x))
# df = df.loc[df['Topic'] == 'Parliament, government and politics']
# df = df.loc[df['Topic'] == 'European Union']
df.head()

Unnamed: 0,Date,Speaker,Name,Party,Topic,Speech,party2,ref_dummy
31481,2016-06-06,Sarah Champion (Rotherham) (Lab),Sarah Champion,Labour,Communities and families,1. What assessment his Department has made of...,0.0,1
31482,2016-06-06,The Secretary of State for Communities and Loc...,Greg Clark,Conservative,Communities and families,"Domestic abuse is a devastating crime, and we ...",0.0,0
31483,2016-06-06,Sarah Champion,Sarah Champion,Labour,Communities and families,The Secretary of State knows how devastating d...,0.0,1
31484,2016-06-06,Greg Clark,Greg Clark,Conservative,Communities and families,"Yes, it is important that we have specialist s...",0.0,0
31485,2016-06-06,Jon Trickett (Hemsworth) (Lab),Jon Trickett,Labour,Communities and families,I welcome what the Secretary of State has just...,0.0,1


In [8]:
df1 = df.loc[(df['Party'] == 'Conservative') & (df['party2'] == 1) & (df['ref_dummy'] == 0)]
df2 = df.loc[(df['Party'] == 'Conservative') & (df['party2'] == 1) & (df['ref_dummy'] == 1)]
df3 = df.loc[(df['Party'] == 'Labour') & (df['party2'] == 0) & (df['ref_dummy'] == 0)]
df4 = df.loc[(df['Party'] == 'Labour') & (df['party2'] == 0) & (df['ref_dummy'] == 1)]
df5 = df.loc[(df['Party'] == 'Conservative') & (df['party2'] == 0) & (df['ref_dummy'] == 1)]
df6 = df.loc[(df['Party'] == 'Conservative') & (df['party2'] == 0) & (df['ref_dummy'] == 0)]

In [15]:
bigrams1 = []
bigrams2 = []
bigrams3 = []
bigrams4 = []
bigrams5 = []
bigrams6 = []

In [10]:
def get_bigrams(text):
    #To remove stop words
    stopW = stopwords.words('english')

    #To stem
    ps = PorterStemmer()

    #Tokenize

    clean_transcript = text.lower()
    clean_transcript = word_tokenize(clean_transcript)
    #Remove digits
    clean_transcript = [i for i in clean_transcript if not re.match(r'\d+', i)]
    #Remove Stopwords and single characters
    clean_transcript = [i for i in clean_transcript if i not in stopW and len(i) > 1]
    #Stemming
    clean_transcript = [ps.stem(word) for word in clean_transcript]
    
    bigr = []
    n_len = len(clean_transcript)
    for i in range(n_len-1):
        bigr.append(clean_transcript[i] + '.' + clean_transcript[i+1])
    return bigr

In [17]:
for index, row in df1.iterrows():
    bigr = get_bigrams(row['Speech'])
    for x in bigr:
        if x not in bigrams1:
            bigrams1.append(x)
for index, row in df2.iterrows():
    bigr = get_bigrams(row['Speech'])
    for x in bigr:
        if x not in bigrams2:
            bigrams2.append(x)
for index, row in df3.iterrows():
    bigr = get_bigrams(row['Speech'])
    for x in bigr:
        if x not in bigrams3:
            bigrams3.append(x)
for index, row in df4.iterrows():
    bigr = get_bigrams(row['Speech'])
    for x in bigr:
        if x not in bigrams4:
            bigrams4.append(x)
for index, row in df5.iterrows():
    bigr = get_bigrams(row['Speech'])
    for x in bigr:
        if x not in bigrams5:
            bigrams5.append(x)
for index, row in df6.iterrows():
    bigr = get_bigrams(row['Speech'])
    for x in bigr:
        if x not in bigrams6:
            bigrams6.append(x)

In [18]:
unq1 = []
unq2 = []
unq3 = []
unq4 = []
unq5 = []
unq6 = []


In [22]:
for bigr in tqdm(bigrams1):
    if (bigr not in bigrams2) and (bigr not in bigrams3) and (bigr not in bigrams4) and (bigr not in bigrams5) and (bigr not in bigrams6):
        unq1.append(bigr)

100%|██████████| 50542/50542 [16:58<00:00, 49.62it/s] 


In [23]:
for bigr in tqdm(bigrams2):
    if (bigr not in bigrams1) and (bigr not in bigrams3) and (bigr not in bigrams4) and (bigr not in bigrams5) and (bigr not in bigrams6):
        unq2.append(bigr)

100%|██████████| 199466/199466 [1:07:33<00:00, 49.21it/s] 


In [24]:
for bigr in tqdm(bigrams3):
    if (bigr not in bigrams1) and (bigr not in bigrams2) and (bigr not in bigrams4) and (bigr not in bigrams5) and (bigr not in bigrams6):
        unq3.append(bigr)

100%|██████████| 167499/167499 [47:54<00:00, 58.27it/s] 


In [27]:
res_df = pd.DataFrame(unq1, columns =['bigram'])
res_df.to_csv('unique_bigrams_leave_conservative_ref_dummy=0.csv', index=False)

In [29]:
res_df = pd.DataFrame(unq2, columns =['bigram'])
res_df.to_csv('unique_bigrams_leave_conservative_ref_dummy=1.csv', index=False)

In [30]:
res_df = pd.DataFrame(unq3, columns =['bigram'])
res_df.to_csv('unique_bigrams_remain_labour_ref_dummy=0.csv', index=False)

In [32]:
for bigr in tqdm(bigrams4):
    if (bigr not in bigrams1) and (bigr not in bigrams2) and (bigr not in bigrams3) and (bigr not in bigrams5) and (bigr not in bigrams6):
        unq4.append(bigr)

100%|██████████| 247864/247864 [1:03:34<00:00, 64.98it/s]


In [None]:
for bigr in tqdm(bigrams5):
    if (bigr not in bigrams1) and (bigr not in bigrams2) and (bigr not in bigrams3) and (bigr not in bigrams4) and (bigr not in bigrams6):
        unq5.append(bigr)

In [None]:
for bigr in tqdm(bigrams6):
    if (bigr not in bigrams1) and (bigr not in bigrams2) and (bigr not in bigrams3) and (bigr not in bigrams4) and (bigr not in bigrams6):
        unq3.append(bigr)