In [55]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [56]:
import pandas as pd
import logging

In [57]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [58]:
vote_file = pd.read_csv('brexit_full_updated3.csv')
vote_file

Unnamed: 0,Constituency ID,full name,PANO,Constituency Name,Party abbreviation,conShare,conShare2,conShare3,bill161,leave,party_old,ref,ref_dummy,party,old_party
0,E14000530,Gerald Howarth,7,Aldershot,Con,50.59,73.403947,73.403947,1.0,1.0,1.0,0.578978,1,1.0,1.0
1,E14000531,Wendy Morton,8,Aldridge-Brownhills,Con,52.05,69.940876,69.940876,1.0,0.0,1.0,0.677963,1,0.0,0.0
2,E14000532,Graham Brady,9,Altrincham and Sale West,Con,52.99,66.503514,66.503514,1.0,1.0,1.0,0.385878,0,1.0,1.0
3,E14000533,Nigel Mills,11,Amber Valley,Con,43.98,55.840528,55.840528,1.0,1.0,1.0,0.652991,1,1.0,1.0
4,E14000534,Nick Herbert,18,Arundel and South Downs,Con,60.79,84.442284,84.442284,,0.0,1.0,0.497011,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,W07000076,Wayne David,114,Caerphilly,Lab,16.59,27.223499,27.223499,1.0,0.0,0.0,0.551360,1,0.0,
656,W07000077,Chris Evans,336,Islwyn,Lab,15.16,23.639482,23.639482,1.0,0.0,0.0,0.589399,1,0.0,
657,W07000078,Alun Cairns,589,Vale Of Glamorgan,Con,46.02,58.527280,58.527280,1.0,0.0,1.0,0.525507,1,0.0,0.0
658,W07000079,Kevin Brennan,129,Cardiff West,Lab,25.15,38.221884,38.221884,0.0,0.0,0.0,0.438226,0,0.0,


In [59]:
def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return t
        
    return topic

In [60]:
politicians_party = {}
politicians_ref = {}
politicians = vote_file['full name'].unique()
for index, row in vote_file.iterrows():
    politicians_party[row['full name']] = row['party']
    politicians_ref[row['full name']] = row['ref_dummy']

In [77]:
df1 = pd.read_csv('2016_commons.csv')
df2 = pd.read_csv('2017_commons.csv')
df = pd.concat([df1, df2])
df = df.drop(['Government'], axis=1)
df = df.loc[(df['Party'] == 'Conservative') | (df['Party'] == 'Labour')]
df = df.loc[df['Name'].isin(politicians)]
df['Date'] = pd.to_datetime(df['Date'])
mask = (df['Date'] > '2015-6-1') & (df['Date'] <= '2016-6-30')
df = df.loc[mask]
df['party2'] = df['Name'].apply(lambda x: politicians_party[x])
df['ref_dummy'] = df['Name'].apply(lambda x: politicians_ref[x])
df['Topic'] = df['Topic'].apply(lambda x: preprocess(x))
df = df.loc[df['Topic'] == 'Parliament, government and politics']
# df = df.loc[df['Topic'] == 'European Union']
# df = df.loc[df['Topic'] == 'Crime, civil law, justice and rights']
# df = df.loc[df['Topic'] == 'Health services and medicine']
# df = df.loc[df['Topic'] == 'International affairs']
cols = df.columns
cols = ['date', 'speaker', 'name', 'Party', 'Topic', 'transcript', 'party2', 'ref_dummy']
df.columns = cols
df.head()

Unnamed: 0,date,speaker,name,Party,Topic,transcript,party2,ref_dummy
785,2016-01-06,The Secretary of State for Scotland (David Mun...,David Mundell,Conservative,"Parliament, government and politics",May I begin by wishing you a very happy new ye...,0.0,0
787,2016-01-06,David Mundell,David Mundell,Conservative,"Parliament, government and politics",My understanding is that the Deputy First Mini...,0.0,0
789,2016-01-06,David Mundell,David Mundell,Conservative,"Parliament, government and politics","As I said, we are involved in an ongoing negot...",0.0,0
791,2016-01-06,David Mundell,David Mundell,Conservative,"Parliament, government and politics",I think what many people in Scotland will find...,0.0,0
793,2016-01-06,David Mundell,David Mundell,Conservative,"Parliament, government and politics",I am disappointed with the hon. Gentleman’s an...,0.0,0


In [62]:
X = []

for index, row in df.iterrows():
    party = row['Party']
    if party == 'Conservative':
        X.append([row['transcript'], 0])
    if party == 'Labour':
        X.append([row['transcript'], 1])

train_df = pd.DataFrame(X, columns=['text', 'labels'])
shuffle(train_df, random_state=42)

Unnamed: 0,text,labels
184,I am surprised that the Labour party is still ...,0
1018,It will not be possible for law enforcement ag...,0
2909,I could give a variety of responses to those p...,0
2704,"Yes, that is the burden of my speech. As the n...",0
602,"No, I do not have time. Some say that young p...",1
...,...,...
3444,I pay tribute to my hon. Friend for the work h...,0
466,I welcome the Home Secretary’s comments becaus...,1
3092,"I thank the Secretary of State for his answer,...",0
3772,This final group of amendments covers three of...,1


In [63]:
train_data,test_data = train_test_split(
    train_df,
    test_size=0.1,
    shuffle=False
    )

In [64]:
test_data

Unnamed: 0,text,labels
3830,The manufacturing sector makes up roughly one ...,1
3831,"The Government’s position on this is clear, an...",0
3832,I gather that my hon. Friend has been having l...,0
3833,I discussed those matters yesterday with the M...,0
3834,It is clearly good news that manufacturing job...,0
...,...,...
4251,My hon. Friend is right. There were 312 people...,0
4252,"Given the surge in voter registration, how can...",1
4253,"We just had this question a few minutes ago, a...",0
4254,T8. What steps will the Secretary of State t...,1


In [65]:
train_data

Unnamed: 0,text,labels
0,"I beg to move an amendment, at the end of the ...",1
1,"In this House, we generally argue for subsidia...",1
2,As has been discussed during the series of deb...,1
3,I want to ask about devolution within Wales. S...,1
4,I completely agree. That is a fantastic exampl...,1
...,...,...
3825,"For the remaining one and half minutes, I call...",1
3826,"I will be short and to the point, Madam Deputy...",0
3827,I thank the hon. Gentleman for giving advance ...,1
3828,1. What steps the Government is taking to supp...,1


In [66]:
# model_args = ClassificationArgs(sliding_window=True, overwrite_output_dir=True)

model_args = ClassificationArgs(
    num_train_epochs=1,
    output_dir="ideology_classifier/",
    best_model_dir="ideology_classifier/best-model/",
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    evaluate_during_training_steps=50000,
    save_steps=50000,
    save_model_every_epoch=True,
    overwrite_output_dir=True,
    max_seq_length=512,
    sliding_window=True,
    train_batch_size=6,
    eval_batch_size=32,
    regression=False,
    do_lower_case=True,
    #use_early_stopping=True
    )

model = ClassificationModel(
    "distilbert", "ideology_classifier/best-model/", use_cuda=False, args=model_args
)

In [67]:
model

<simpletransformers.classification.classification_model.ClassificationModel at 0x7fd246c8bad0>

In [68]:
model.train_model(train_data, eval_df=test_data)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/3830 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1172 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1300 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (727 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1087 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (872 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:   Starting fine-tuning.


Running Epoch 0 of 1:   0%|          | 0/735 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled
Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
INFO:simpletransformers.classification.classification_model: 426 features created from 426 samples.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.4372657646793731, 'tp': 57, 'tn': 279, 'fp': 27, 'fn': 63, 'eval_loss': 1.1386356472969055}
INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to ideology_classifier/.


(735,
 {'global_step': [735],
  'tp': [57],
  'tn': [279],
  'fp': [27],
  'fn': [63],
  'mcc': [0.4372657646793731],
  'train_loss': [0.00842174980789423],
  'eval_loss': [1.1386356472969055]})

In [38]:
result, model_outputs, wrong_predictions = model.eval_model(test_data)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/426 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
INFO:simpletransformers.classification.classification_model: 426 features created from 426 samples.


Running Evaluation:   0%|          | 0/15 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.46509850828420984, 'tp': 60, 'tn': 280, 'fp': 26, 'fn': 60, 'eval_loss': 0.6931240439414978}


In [41]:
outputs = [output[0] for output in model_outputs]
from scipy.special import softmax
outputs = softmax(outputs, axis=1)
outputs

array([[0.02373769, 0.97626231],
       [0.99591258, 0.00408742],
       [0.99651755, 0.00348245],
       [0.99596255, 0.00403745],
       [0.99496831, 0.00503169],
       [0.99576332, 0.00423668],
       [0.99596898, 0.00403102],
       [0.99579803, 0.00420197],
       [0.0193564 , 0.9806436 ],
       [0.99524769, 0.00475231],
       [0.18518253, 0.81481747],
       [0.99628506, 0.00371494],
       [0.96612479, 0.03387521],
       [0.99582053, 0.00417947],
       [0.32503453, 0.67496547],
       [0.99658524, 0.00341476],
       [0.99653377, 0.00346623],
       [0.99614366, 0.00385634],
       [0.99448074, 0.00551926],
       [0.66250905, 0.33749095],
       [0.65974021, 0.34025979],
       [0.9945428 , 0.0054572 ],
       [0.02003702, 0.97996298],
       [0.99520794, 0.00479206],
       [0.9789458 , 0.0210542 ],
       [0.99516114, 0.00483886],
       [0.8598    , 0.1402    ],
       [0.99548351, 0.00451649],
       [0.9959705 , 0.0040295 ],
       [0.99327388, 0.00672612],
       [0.

In [42]:
import numpy as np

In [53]:
rows = []
class_map ={
    0: 'conservative',
    1: 'labour'
}

for i in range(len(outputs)):
    output = outputs[i]
    row = [test_data.iloc[i]['text'], output[0], output[1], test_data.iloc[i]['labels'], class_map[np.argmax(output)]]
    rows.append(row)

res_df = pd.DataFrame(rows, columns=['text', 'conservative prob', 'labour prob', 'true label', 'pred label'])
res_df['true label'] = res_df['true label'].apply(lambda x: class_map[x])
res_df

Unnamed: 0,text,conservative prob,labour prob,true label,pred label
0,The manufacturing sector makes up roughly one ...,0.023738,0.976262,labour,labour
1,"The Government’s position on this is clear, an...",0.995913,0.004087,conservative,conservative
2,I gather that my hon. Friend has been having l...,0.996518,0.003482,conservative,conservative
3,I discussed those matters yesterday with the M...,0.995963,0.004037,conservative,conservative
4,It is clearly good news that manufacturing job...,0.994968,0.005032,conservative,conservative
...,...,...,...,...,...
421,My hon. Friend is right. There were 312 people...,0.995945,0.004055,conservative,conservative
422,"Given the surge in voter registration, how can...",0.027452,0.972548,labour,labour
423,"We just had this question a few minutes ago, a...",0.995272,0.004728,conservative,conservative
424,T8. What steps will the Secretary of State t...,0.054247,0.945753,labour,labour


In [54]:
res_df.to_csv('labour_conservative_cross_validation.csv', index=False)

In [78]:
X = []

for index, row in df.iterrows():
    party = row['Party']
    party2 = row['party2']
    if party == 'Conservative' and party2 == 0:
        X.append(row['transcript'])

In [79]:
predictions, raw_outputs = model.predict(X)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/1409 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (872 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (660 > 512). Running this sequence through the model will result in indexing errors
INFO:simpletransformers.classification.classification_model: 1409 features created from 1409 samples.


  0%|          | 0/50 [00:00<?, ?it/s]

In [80]:
outputs = [output[0] for output in raw_outputs]
from scipy.special import softmax
outputs = softmax(outputs, axis=1)
import numpy as np

In [81]:
rows = []
class_map ={
    0: 'conservative',
    1: 'labour'
}
for i in range(len(outputs)):
    output = outputs[i]
    row = [X[i], output[0], output[1], class_map[np.argmax(output)]]
    rows.append(row)

res_df = pd.DataFrame(rows, columns=['text', 'conservative prob', 'labour prob', 'label'])
res_df

Unnamed: 0,text,conservative prob,labour prob,label
0,May I begin by wishing you a very happy new ye...,0.999448,0.000552,conservative
1,My understanding is that the Deputy First Mini...,0.999496,0.000504,conservative
2,"As I said, we are involved in an ongoing negot...",0.999479,0.000521,conservative
3,I think what many people in Scotland will find...,0.999146,0.000854,conservative
4,I am disappointed with the hon. Gentleman’s an...,0.999090,0.000910,conservative
...,...,...,...,...
1404,"Cyber-security is incredibly important, especi...",0.999459,0.000541,conservative
1405,"As I said to the House a few moments ago, we w...",0.999238,0.000762,conservative
1406,My hon. Friend is right. There were 312 people...,0.999448,0.000552,conservative
1407,"We just had this question a few minutes ago, a...",0.999339,0.000661,conservative


In [82]:
res_df.to_csv('prediction_on_remain_conservative_16_17.csv', index=False)

In [13]:
test_df = pd.read_csv('test_key_speeches.csv')
X_test = list(test_df['text'].values)

In [14]:
predictions, raw_outputs = model.predict(X_test)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/1089 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1100 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1183 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1057 > 512). Running this sequence through the model will result in indexing errors
INFO:simpletransformers.classification.classification_model: 1089 features created from 1089 samples.


  0%|          | 0/35 [00:00<?, ?it/s]

In [15]:
raw_outputs[0]

array([[ 1.95227098, -1.99213946]])

In [16]:
outputs = [output[0] for output in raw_outputs]
outputs

[array([ 1.95227098, -1.99213946]),
 array([ 1.24871838, -1.19846106]),
 array([ 1.78973699, -1.80170774]),
 array([ 1.60332608, -1.57185733]),
 array([ 1.85678375, -1.79256666]),
 array([ 1.10241163, -1.01736414]),
 array([ 1.55285192, -1.54148531]),
 array([ 1.17348564, -1.15665245]),
 array([ 0.66505677, -0.65435606]),
 array([ 0.42139491, -0.47842768]),
 array([ 1.56490052, -1.51697445]),
 array([ 0.57299167, -0.5326702 ]),
 array([ 1.5603143 , -1.56902778]),
 array([ 1.81928408, -1.92956221]),
 array([ 1.75548112, -1.83512855]),
 array([ 1.74382436, -1.78878212]),
 array([ 1.69540989, -1.72577441]),
 array([ 1.69235003, -1.67261875]),
 array([ 1.72138274, -1.80469108]),
 array([ 1.8562814, -1.9010911]),
 array([ 1.40714908, -1.54910219]),
 array([ 1.02142251, -1.0240922 ]),
 array([ 1.41927958, -1.37062562]),
 array([ 1.60770154, -1.70911896]),
 array([ 1.51045501, -1.50061989]),
 array([ 1.65957725, -1.75072443]),
 array([ 1.52237165, -1.56905448]),
 array([ 1.41017354, -1.420876

In [17]:
from scipy.special import softmax
outputs = softmax(outputs, axis=1)

In [18]:
import numpy as np

In [19]:
rows = []
class_map ={
    0: 'conservative',
    1: 'labour'
}
print(len(X_test))
print(len(outputs))
assert(len(X_test) == len(outputs))
for i in range(len(outputs)):
    output = outputs[i]
    row = [X_test[i], output[0], output[1], class_map[np.argmax(output)]]
    rows.append(row)

res_df = pd.DataFrame(rows, columns=['text', 'conservative prob', 'labour prob', 'label'])
res_df

1089
1089


Unnamed: 0,text,conservative prob,labour prob,label
0,We all agree that it is vital that everyone is...,0.981005,0.018995,conservative
1,The IN campaign maintains that there is no pro...,0.920355,0.079645,conservative
2,It is Government policy that Turkey should joi...,0.973181,0.026819,conservative
3,It is also the policy of the European Union th...,0.959890,0.040110,conservative
4,It is also a fact that both the European Union...,0.974651,0.025349,conservative
...,...,...,...,...
1084,This chance may never come again in our lifeti...,0.969630,0.030370,conservative
1085,BAGEHOT: The prime minister is nearing the end...,0.215166,0.784834,labour
1086,BAGEHOT: He’s asking for things he knows he’s ...,0.323776,0.676224,labour
1087,BAGEHOT: You mention the Euro campaign. Are th...,0.278107,0.721893,labour


In [20]:
res_df['label'].value_counts()

conservative    1061
labour            28
Name: label, dtype: int64

In [21]:
res_df.to_csv('key_speeches_labour_conservative_prediction_updated_bert.csv', index=False)

In [22]:
train_df

NameError: name 'train_df' is not defined

In [None]:
train_df['labels'].value_counts()

In [None]:
model.eval_model(train_df)