In [136]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [137]:
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [138]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [139]:
vote_file = pd.read_csv('brexit_full_updated3.csv')
vote_file

Unnamed: 0,Constituency ID,full name,PANO,Constituency Name,Party abbreviation,conShare,conShare2,conShare3,bill161,leave,party_old,ref,ref_dummy,party,old_party
0,E14000530,Gerald Howarth,7,Aldershot,Con,50.59,73.403947,73.403947,1.0,1.0,1.0,0.578978,1,1.0,1.0
1,E14000531,Wendy Morton,8,Aldridge-Brownhills,Con,52.05,69.940876,69.940876,1.0,0.0,1.0,0.677963,1,0.0,0.0
2,E14000532,Graham Brady,9,Altrincham and Sale West,Con,52.99,66.503514,66.503514,1.0,1.0,1.0,0.385878,0,1.0,1.0
3,E14000533,Nigel Mills,11,Amber Valley,Con,43.98,55.840528,55.840528,1.0,1.0,1.0,0.652991,1,1.0,1.0
4,E14000534,Nick Herbert,18,Arundel and South Downs,Con,60.79,84.442284,84.442284,,0.0,1.0,0.497011,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,W07000076,Wayne David,114,Caerphilly,Lab,16.59,27.223499,27.223499,1.0,0.0,0.0,0.551360,1,0.0,
656,W07000077,Chris Evans,336,Islwyn,Lab,15.16,23.639482,23.639482,1.0,0.0,0.0,0.589399,1,0.0,
657,W07000078,Alun Cairns,589,Vale Of Glamorgan,Con,46.02,58.527280,58.527280,1.0,0.0,1.0,0.525507,1,0.0,0.0
658,W07000079,Kevin Brennan,129,Cardiff West,Lab,25.15,38.221884,38.221884,0.0,0.0,0.0,0.438226,0,0.0,


In [140]:
def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return t
        
    return topic

In [141]:
politicians_party = {}
politicians_ref = {}
politicians = vote_file['full name'].unique()
for index, row in vote_file.iterrows():
    politicians_party[row['full name']] = row['party']
    politicians_ref[row['full name']] = row['ref_dummy']

In [191]:
df1 = pd.read_csv('2015_commons.csv')
df2 = pd.read_csv('2016_commons.csv')
df = pd.concat([df1, df2])
df = df.drop(['Government'], axis=1)
df = df.loc[(df['Party'] == 'Conservative') | (df['Party'] == 'Labour')]
df = df.loc[df['Name'].isin(politicians)]
df['Date'] = pd.to_datetime(df['Date'])
mask = (df['Date'] > '2015-6-1') & (df['Date'] <= '2016-6-30')
df = df.loc[mask]
df['party2'] = df['Name'].apply(lambda x: politicians_party[x])
df['ref_dummy'] = df['Name'].apply(lambda x: politicians_ref[x])
df['Topic'] = df['Topic'].apply(lambda x: preprocess(x))
# df = df.loc[df['Topic'] == 'Parliament, government and politics']
# df = df.loc[df['Topic'] == 'European Union']
df.head()

Unnamed: 0,Date,Speaker,Name,Party,Topic,Speech,party2,ref_dummy
19224,2015-06-02,Mrs Emma Lewell-Buck (South Shields) (Lab),Emma Lewell-Buck,Labour,Health services and medicine,1. What assessment he has made of recent tren...,0.0,1
19225,2015-06-02,The Secretary of State for Health (Mr Jeremy H...,Jeremy Hunt,Conservative,Health services and medicine,"As you said, Mr Speaker, we shall have those t...",0.0,0
19226,2015-06-02,Mrs Lewell-Buck,Emma Lewell-Buck,Labour,Health services and medicine,I echo the comments made about the late Member...,0.0,1
19227,2015-06-02,Mr Hunt,Jeremy Hunt,Conservative,Health services and medicine,I take responsibility for everything that happ...,0.0,0
19228,2015-06-02,Mr James Gray (North Wiltshire) (Con),James Gray,Conservative,Health services and medicine,I stood against Charles Kennedy in 1992 in Ros...,1.0,1


In [195]:
df1 = df.loc[(df['Party'] == 'Conservative') & (df['party2'] == 1) & (df['ref_dummy'] == 0)]
df1.to_csv('leave_conservative_ref=0.csv_15-16', index=False)
df1 = df.loc[(df['Party'] == 'Conservative') & (df['party2'] == 1) & (df['ref_dummy'] == 1)]
df1.to_csv('leave_conservative_ref=1.csv_15-16', index=False)
df1 = df.loc[(df['Party'] == 'Labour') & (df['party2'] == 0) & (df['ref_dummy'] == 0)]
df1.to_csv('remain_labour_ref=0.csv_15-16', index=False)
df1 = df.loc[(df['Party'] == 'Labour') & (df['party2'] == 0) & (df['ref_dummy'] == 1)]
df1.to_csv('remain_labour_ref=1.csv_15-16', index=False)
df1 = df.loc[(df['Party'] == 'Conservative') & (df['party2'] == 0) & (df['ref_dummy'] == 1)]
df1.to_csv('remain_conservative_ref=1.csv_15-16', index=False)
df1 = df.loc[(df['Party'] == 'Conservative') & (df['party2'] == 0) & (df['ref_dummy'] == 0)]
df1.to_csv('remain_conservative_ref=0.csv_15-16', index=False)

In [185]:
def get_count(df):
    counts = [0, 0, 0, 0, 0, 0, 0, 0]
    for index, row in df.iterrows():
        party = row['Party']
        party2 = row['party2']
#         n_words = len(row['Speech'].split())
        ref_dummy = row['ref_dummy']
        if party == 'Conservative' and party2 == 1 and ref_dummy == 0:
            counts[0] += 1
        if party == 'Conservative' and party2 == 1 and ref_dummy == 1:
            counts[1] += 1
        if party == 'Labour' and party2 == 0 and ref_dummy == 0:
            counts[2] += 1
        if party == 'Labour' and party2 == 0 and ref_dummy == 1:
            counts[3] += 1
        if party == 'Labour' and party2 == 1 and ref_dummy == 0:
            counts[4] += 1
        if party == 'Labour' and party2 == 1 and ref_dummy == 1:
            counts[5] += 1
        if party == 'Conservative' and party2 == 0 and ref_dummy == 0:
            counts[6] += 1
        if party == 'Conservative' and party2 == 0 and ref_dummy == 1:
            counts[7] += 1
    return counts

def get_count_ref_dummy(df):
    counts = [0, 0, 0]
    for index, row in df.iterrows():
        party = row['Party']
        party2 = row['party2']
        ref_dummy = row['ref_dummy']
        if ref_dummy == 1 and party2 == 1:
            counts[0] += 1
        if ref_dummy == 0 and party2 == 0:
            counts[1] += 1
        if ref_dummy == 1 and party2 == 0:
            counts[2] += 1
    return counts

In [189]:
rows = []

counts = get_count(df)
row = ['15-16']
row.extend(counts)
rows.append(row)
df_eu = df.loc[df['Topic'] == 'European Union']
counts = get_count(df_eu)
row = ['15-16 EU']
row.extend(counts)
rows.append(row)
df_parliament = df.loc[df['Topic'] == 'Parliament, government and politics']
counts = get_count(df_parliament)
row = ['15-16 Parliament']
row.extend(counts)
rows.append(row)
cols = ['period', 'leave_conservative_ref=0', 'leave_conservative_ref=1', 
        'remain_labour_ref=0', 'remain_labour_ref=0',
        'leave_labour_ref=0', 'leave_labour_ref=1',
        'remain_conservative_ref=0', 'remain_conservative_ref=1']
res_df = pd.DataFrame(rows, columns=cols)
res_df

Unnamed: 0,period,leave_conservative_ref=0,leave_conservative_ref=1,remain_labour_ref=0,remain_labour_ref=0.1,leave_labour_ref=0,leave_labour_ref=1,remain_conservative_ref=0,remain_conservative_ref=1
0,15-16,2854,10540,6459,10294,156,511,9650,16659
1,15-16 EU,241,664,224,448,32,35,861,806
2,15-16 Parliament,360,762,274,634,9,13,682,1462


In [190]:
res_df.to_csv('leave_conservative_remain_labour_count_15_16.csv')

In [119]:
len(df)

46705

In [120]:
X = []

for index, row in df.iterrows():
    X.append([row['Speech'], row['ref_dummy'], row['Party'] == 'Conservative', row['Party'] == 'Labour'])
        
# for index, row in df.iterrows():
#     party = row['Party']
#     party2 = row['party2']
#     ref_dummy = row['ref_dummy']
#     if party == 'Conservative' and party2 == 1:
#         X.append([row['transcript'], 0])
#     if party == 'Labour' and party2 == 0:
#         X.append([row['transcript'], 1])
# for index, row in df.iterrows():
#     party = row['Party']
#     party2 = row['party2']
#     ref_dummy = row['ref_dummy']
#     if ref_dummy == 1 and party2 == 1:
#         X.append([row['transcript'], 0])
#     if ref_dummy == 0 and party2 == 0:
#         X.append([row['transcript'], 1])
        
train_df = pd.DataFrame(X, columns=['text', 'ref_dummy', 'conservative', 'labour'])
shuffle(train_df, random_state=42)

Unnamed: 0,text,ref_dummy,conservative,labour
2865,Thank you for giving me two bites at the cherr...,1,True,False
8178,I thank the right hon. Gentleman for his suppo...,0,True,False
25729,I would be delighted to have a meeting with my...,1,True,False
45595,I am grateful for the opportunity to raise the...,0,False,True
18507,Does the Secretary of State share my concern t...,1,True,False
...,...,...,...,...
11284,Clear food labelling is vital to show consumer...,1,True,False
44732,The Prime Minister has talked about the nation...,1,True,False
38158,My right hon. Friend has made his point as wel...,1,True,False
860,I certainly take this opportunity to pay tribu...,0,True,False


In [121]:
predictions, raw_outputs = model.predict(train_df['text'].values)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/46705 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1057 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Token indices sequence length is longer than the specified maximum sequence length for this model (943 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1984 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (751 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

  0%|          | 0/1704 [00:00<?, ?it/s]

In [122]:
len(raw_outputs)

46705

In [123]:
outputs = [output[0] for output in raw_outputs]
from scipy.special import softmax
outputs = softmax(outputs, axis=1)
outputs

array([[0.99265566, 0.00734434],
       [0.99506356, 0.00493644],
       [0.01151433, 0.98848567],
       ...,
       [0.99671656, 0.00328344],
       [0.0037681 , 0.9962319 ],
       [0.99684632, 0.00315368]])

In [124]:
len(outputs)

46705

In [125]:
import numpy as np
rows = []
class_map ={
    0: 'leave_conservative',
    1: 'remain_labour'
}

for i in range(len(outputs)):
    output = outputs[i]
    row = [class_map[np.argmax(output)]]
    rows.append(row)

train_df['pred'] = rows

In [126]:
train_df['pred'] = train_df['pred'].apply(lambda x: x[0])
train_df['conservative'] = train_df['conservative'].apply(lambda x: 1 if x == True else 0)
train_df['labour'] = train_df['labour'].apply(lambda x: 1 if x == True else 0)

In [127]:
prob_a = []
prob_b = []
for i in range(len(outputs)):
    output = outputs[i]
    prob_a.append(output[0])
    prob_b.append(output[1])

In [128]:
train_df['leave_conservative_prob'] = prob_a
train_df['remain_labour_prob'] = prob_b

In [129]:
train_df

Unnamed: 0,text,ref_dummy,conservative,labour,pred,leave_conservative_prob,remain_labour_prob
0,1. What assessment his Department has made of...,1,0,1,leave_conservative,0.992656,0.007344
1,"Domestic abuse is a devastating crime, and we ...",0,1,0,leave_conservative,0.995064,0.004936
2,The Secretary of State knows how devastating d...,1,0,1,remain_labour,0.011514,0.988486
3,"Yes, it is important that we have specialist s...",0,1,0,leave_conservative,0.995166,0.004834
4,I welcome what the Secretary of State has just...,1,0,1,remain_labour,0.005492,0.994508
...,...,...,...,...,...,...,...
46700,I am very grateful to my hon. Friend for givin...,1,1,0,leave_conservative,0.996561,0.003439
46701,"As so often in this place, my hon. Friend spea...",1,1,0,leave_conservative,0.943702,0.056298
46702,"Let me start by congratulating you, Madam Depu...",1,1,0,leave_conservative,0.996717,0.003283
46703,"As part of that work in the Department, will t...",1,1,0,remain_labour,0.003768,0.996232


In [130]:
train_df.to_csv('leave_conservative_remain_labour_prediction_with_ref_dummy_overall_16-17.csv', index=False)

In [54]:
train_data,test_data = train_test_split(
    train_df,
    test_size=0.1,
    shuffle=False
    )

In [72]:
# model_args = ClassificationArgs(sliding_window=True, overwrite_output_dir=True)

model_args = ClassificationArgs(
    num_train_epochs=1,
    output_dir="ideology_classifier/",
    best_model_dir="ideology_classifier/best-model/",
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    evaluate_during_training_steps=50000,
    save_steps=50000,
    save_model_every_epoch=True,
    overwrite_output_dir=True,
    max_seq_length=512,
    sliding_window=True,
    train_batch_size=6,
    eval_batch_size=32,
    regression=False,
    do_lower_case=True,
    #use_early_stopping=True
    )

model = ClassificationModel(
    "distilbert", "ideology_classifier/best-model/", use_cuda=False, args=model_args
)

In [73]:
model

<simpletransformers.classification.classification_model.ClassificationModel at 0x7fd820354e90>

In [57]:
model.train_model(train_data, eval_df=test_data)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/1827 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1172 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1179 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2874 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (808 > 512). Running this sequence through the model will result in indexing errors
INFO:simpletransformers.classification.classification_model: 2154 features created from 1827 samples.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:   Starting fine-tuning.


Running Epoch 0 of 1:   0%|          | 0/359 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled
Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
INFO:simpletransformers.classification.classification_model: 203 features created from 203 samples.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.31733781631445157, 'tp': 67, 'tn': 65, 'fp': 24, 'fn': 47, 'eval_loss': 1.3639598414301872}
INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to ideology_classifier/.


(359,
 {'global_step': [359],
  'tp': [67],
  'tn': [65],
  'fp': [24],
  'fn': [47],
  'mcc': [0.31733781631445157],
  'train_loss': [0.5263945460319519],
  'eval_loss': [1.3639598414301872]})

In [58]:
result, model_outputs, wrong_predictions = model.eval_model(test_data)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/203 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
INFO:simpletransformers.classification.classification_model: 203 features created from 203 samples.


Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.31733781631445157, 'tp': 67, 'tn': 65, 'fp': 24, 'fn': 47, 'eval_loss': 1.3639598414301872}


In [59]:
outputs = [output[0] for output in model_outputs]
from scipy.special import softmax
outputs = softmax(outputs, axis=1)
outputs

array([[0.99778022, 0.00221978],
       [0.99743944, 0.00256056],
       [0.9971397 , 0.0028603 ],
       [0.9967494 , 0.0032506 ],
       [0.15578832, 0.84421168],
       [0.13858791, 0.86141209],
       [0.00303005, 0.99696995],
       [0.99639697, 0.00360303],
       [0.99194249, 0.00805751],
       [0.00644819, 0.99355181],
       [0.01166924, 0.98833076],
       [0.68988811, 0.31011189],
       [0.99737374, 0.00262626],
       [0.99101687, 0.00898313],
       [0.9976726 , 0.0023274 ],
       [0.99757552, 0.00242448],
       [0.97809916, 0.02190084],
       [0.99714009, 0.00285991],
       [0.99587046, 0.00412954],
       [0.99711738, 0.00288262],
       [0.99778819, 0.00221181],
       [0.99676624, 0.00323376],
       [0.10699691, 0.89300309],
       [0.04208489, 0.95791511],
       [0.00257061, 0.99742939],
       [0.99618181, 0.00381819],
       [0.0111333 , 0.9888667 ],
       [0.00328352, 0.99671648],
       [0.00297119, 0.99702881],
       [0.00745836, 0.99254164],
       [0.

In [60]:
rows = []
class_map ={
    0: 'leave_conservative',
    1: 'remain_labour'
}

for i in range(len(outputs)):
    output = outputs[i]
    row = [test_data.iloc[i]['text'], output[0], output[1], test_data.iloc[i]['labels'], class_map[np.argmax(output)]]
    rows.append(row)

res_df = pd.DataFrame(rows, columns=['text', 'conservative prob', 'labour prob', 'true label', 'pred label'])
res_df['true label'] = res_df['true label'].apply(lambda x: class_map[x])
res_df

Unnamed: 0,text,conservative prob,labour prob,true label,pred label
0,A huge amount of work is being done on these m...,0.997780,0.002220,leave_conservative,leave_conservative
1,I share the right hon. Gentleman’s sentiments ...,0.997439,0.002561,leave_conservative,leave_conservative
2,I can certainly agree with the right hon. Gent...,0.997140,0.002860,leave_conservative,leave_conservative
3,The panel makes reference to certain contacts ...,0.996749,0.003251,leave_conservative,leave_conservative
4,3. What discussions she has had with the part...,0.155788,0.844212,remain_labour,remain_labour
...,...,...,...,...,...
198,T3. The National Citizen Service has been a ...,0.990986,0.009014,leave_conservative,leave_conservative
199,T5. The backward steps in gender inequality ...,0.002983,0.997017,remain_labour,remain_labour
200,T6. The National Citizen Service provides a ...,0.997479,0.002521,leave_conservative,leave_conservative
201,"Given the surge in voter registration, how can...",0.003177,0.996823,remain_labour,remain_labour


In [61]:
res_df.to_csv('leave_conservative_remain_labor_cross_validation.csv', index=False)

In [33]:
test_df = pd.read_csv('test_key_speeches.csv')
X_test = list(test_df['text'].values)

# X_test = []

# for index, row in df.iterrows():
#     party = row['Party']
#     party2 = row['party2']
#     ref_dummy = row['ref_dummy']
#     if party == 'Conservative' and party2 == 0:
#         X_test.append(row['transcript'])

# for index, row in df.iterrows():
#     party = row['Party']
#     party2 = row['party2']
#     ref_dummy = row['ref_dummy']
#     if ref_dummy == 1 and party2 == 0:
#         X_test.append(row['transcript'])
        

In [34]:
predictions, raw_outputs = model.predict(X_test)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/1089 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1100 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1183 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1057 > 512). Running this sequence through the model will result in indexing errors
INFO:simpletransformers.classification.classification_model: 1089 features created from 1089 samples.


  0%|          | 0/35 [00:00<?, ?it/s]

In [35]:
raw_outputs[0]

array([[ 1.8792845 , -1.93901658]])

In [36]:
outputs = [output[0] for output in raw_outputs]
outputs

[array([ 1.8792845 , -1.93901658]),
 array([ 0.76672596, -0.83236724]),
 array([ 1.37795842, -1.44189394]),
 array([ 1.11641192, -1.16569209]),
 array([ 1.68150496, -1.63873565]),
 array([ 0.13467282, -0.10998433]),
 array([ 1.04650772, -1.14303315]),
 array([ 0.7745142 , -0.76827919]),
 array([-0.24872787,  0.12797195]),
 array([-0.14784858,  0.01661806]),
 array([ 0.78036606, -0.73803902]),
 array([ 0.8907423, -0.902978 ]),
 array([ 0.32624602, -0.36390966]),
 array([ 1.38006973, -1.56159747]),
 array([ 1.43829238, -1.48342097]),
 array([ 0.8979845, -0.8805142]),
 array([ 1.31693804, -1.3579911 ]),
 array([ 1.49828446, -1.46070611]),
 array([ 1.48061907, -1.50694788]),
 array([ 1.67389071, -1.69915283]),
 array([ 0.10721713, -0.27489924]),
 array([ 0.30905104, -0.31662196]),
 array([ 0.77476007, -0.78375041]),
 array([ 1.03722262, -1.10815918]),
 array([ 0.31270844, -0.35244063]),
 array([ 1.19421422, -1.28326523]),
 array([ 0.04150686, -0.15028024]),
 array([ 0.68170965, -0.69946873

In [37]:
from scipy.special import softmax
outputs = softmax(outputs, axis=1)

In [38]:
import numpy as np

In [39]:
rows = []
class_map ={
    0: 'leave_conservative',
    1: 'remain_labour'
}
print(len(X_test))
print(len(outputs))
assert(len(X_test) == len(outputs))
for i in range(len(outputs)):
    output = outputs[i]
    row = [X_test[i], output[0], output[1], class_map[np.argmax(output)]]
    rows.append(row)

res_df = pd.DataFrame(rows, columns=['text', 'leave_conservative prob', 'remain_labour prob', 'label'])
res_df

1089
1089


Unnamed: 0,text,leave_conservative prob,remain_labour prob,label
0,We all agree that it is vital that everyone is...,0.978507,0.021493,leave_conservative
1,The IN campaign maintains that there is no pro...,0.831892,0.168108,leave_conservative
2,It is Government policy that Turkey should joi...,0.943739,0.056261,leave_conservative
3,It is also the policy of the European Union th...,0.907384,0.092616,leave_conservative
4,It is also a fact that both the European Union...,0.965117,0.034883,leave_conservative
...,...,...,...,...
1084,This chance may never come again in our lifeti...,0.938729,0.061271,leave_conservative
1085,BAGEHOT: The prime minister is nearing the end...,0.117943,0.882057,remain_labour
1086,BAGEHOT: He’s asking for things he knows he’s ...,0.208246,0.791754,remain_labour
1087,BAGEHOT: You mention the Euro campaign. Are th...,0.191286,0.808714,remain_labour


In [40]:
res_df['label'].value_counts()

leave_conservative    719
remain_labour         370
Name: label, dtype: int64

In [41]:
res_df.to_csv('key_speech_updated_bert.csv')

In [None]:
train_df

In [None]:
train_df['labels'].value_counts()

In [None]:
model.eval_model(train_df)