In [1]:
import csv
import sys
import numpy as np
from scipy import spatial

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

from ensemble.GiorgosMyrianthous import GiorgosMyrianthous
from ensemble.JiashuPu import JiashuPu
from ensemble.FNCBaseLine import FNCBaseLine
from ensemble.Master import Master
from ensemble.MingjieChen import MingjieChen
from ensemble.XiaoxuanWang import XiaoxuanWang
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from upperbound import compute_ub
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
import pickle

Using TensorFlow backend.


In [2]:
d = DataSet()

folds,hold_out = kfold_split(d,n_folds=2)
fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

test_dataset = DataSet("test")
d.articles.update(test_dataset.articles)

for stance in test_dataset.stances:
    stance['Stance ID'] += len(d.stances)


test_stances = test_dataset.stances
d.stances.extend(test_stances)

Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


In [3]:
master_classifier = None


In [5]:
ids = list(range(len(folds)))
all_folds = np.hstack(tuple([fold_stances[i] for i in ids]))

train = dict()
test = dict()
    
for fold in fold_stances:
    ids = list(range(len(folds)))
    del ids[fold]

    train[fold] = np.hstack(tuple([fold_stances[i] for i in ids]))
    test[fold] = fold_stances[fold]

#
slave_classifiers = [FNCBaseLine,XiaoxuanWang,JiashuPu,GiorgosMyrianthous,MingjieChen]

slv_predicted = dict()
master_train = dict()

In [None]:
fold = 1
slv_predicted[fold] = []
master_train[fold] = []
for slv in tqdm(slave_classifiers):
    print("Create classifier" + str(slv))
    cls = slv(d,all_folds)

    print("Preload training data" + str(type(cls)))
    cls.preload_features(d.stances)

    print("Train on fold " + str(fold) + " - " + str(type(cls)))
    cls.train(train[fold])

    slv_predicted[fold].append([LABELS.index(p) for p in cls.predict(test[fold])])
    del cls

master_train[fold].extend(zip(test[fold], *slv_predicted[fold]))

  0%|          | 0/5 [00:00<?, ?it/s]

Create classifier<class 'ensemble.FNCBaseLine.FNCBaseLine'>
Preload training data<class 'ensemble.FNCBaseLine.FNCBaseLine'>
Train on fold 1 - <class 'ensemble.FNCBaseLine.FNCBaseLine'>
      Iter       Train Loss   Remaining Time 
         1       19872.6571           27.68s
         2       17742.5424           28.81s
         3       16029.8706           27.56s
         4       14623.0273           27.61s
         5       13463.0553           26.68s
         6       12494.7521           26.63s
         7       11674.7715           26.17s
         8       10991.8624           26.16s
         9       10409.9693           25.68s
        10        9921.8957           25.48s
        20        7580.3190           26.01s
        30        6888.9611           25.06s
        40        6570.4626           23.07s
        50        6393.0022           21.02s
        60        6265.2808           19.52s
        70        6162.0599           17.93s
        80        6061.4682           16.43s
    

 20%|██        | 1/5 [00:27<01:49, 27.32s/it]

Create classifier<class 'ensemble.XiaoxuanWang.XiaoxuanWang'>
Preload training data<class 'ensemble.XiaoxuanWang.XiaoxuanWang'>
Train on fold 1 - <class 'ensemble.XiaoxuanWang.XiaoxuanWang'>


In [7]:
slaves = []
for slv in tqdm(slave_classifiers):
    print("Training classifier" + str(type(slv)))
    cls = slv(d,all_folds)
    cls.preload_features(d.stances)
    cls.train(all_folds)
    slaves.append(cls)

  0%|          | 0/5 [00:00<?, ?it/s]

Training classifier<class 'type'>
      Iter       Train Loss   Remaining Time 
         1       39332.1113            1.04m
         2       35206.0548            1.03m
         3       31873.0683            1.01m
         4       29144.4553            1.00m
         5       26891.9285           59.35s
         6       25015.7366           58.86s
         7       23429.3842           58.55s
         8       22110.6335           58.37s
         9       20996.5240           58.12s
        10       20043.8812           59.59s
        20       15575.8171           54.57s
        30       14241.4870           50.17s
        40       13710.3719           46.38s
        50       13404.0874           42.90s
        60       13196.2747           39.60s
        70       13040.1711           36.57s
        80       12912.6287           33.58s
        90       12808.8300           30.62s
       100       12711.9986           27.73s


 20%|██        | 1/5 [00:59<03:56, 59.05s/it]

       200       12004.0626            0.00s
Training classifier<class 'type'>


 40%|████      | 2/5 [04:03<04:50, 96.80s/it]

Training classifier<class 'type'>


 60%|██████    | 3/5 [11:34<06:45, 202.82s/it]

Training classifier<class 'type'>


 80%|████████  | 4/5 [13:06<02:49, 169.57s/it]

Training classifier<class 'type'>
207
Embeddings: 208 x 300
Epoch 1/100
14s - loss: 0.8052 - acc: 0.7337
Epoch 2/100
3s - loss: 0.7922 - acc: 0.7347
Epoch 3/100
3s - loss: 0.7908 - acc: 0.7347
Epoch 4/100
3s - loss: 0.7893 - acc: 0.7347
Epoch 5/100
3s - loss: 0.7868 - acc: 0.7347
Epoch 6/100
3s - loss: 0.7848 - acc: 0.7347
Epoch 7/100
3s - loss: 0.7820 - acc: 0.7347
Epoch 8/100
3s - loss: 0.7810 - acc: 0.7347
Epoch 9/100
3s - loss: 0.7788 - acc: 0.7347
Epoch 10/100
3s - loss: 0.7781 - acc: 0.7347
Epoch 11/100
3s - loss: 0.7770 - acc: 0.7347
Epoch 12/100
3s - loss: 0.7759 - acc: 0.7347
Epoch 13/100
3s - loss: 0.7752 - acc: 0.7347
Epoch 14/100
3s - loss: 0.7738 - acc: 0.7347
Epoch 15/100
4s - loss: 0.7734 - acc: 0.7347
Epoch 16/100
3s - loss: 0.7736 - acc: 0.7347
Epoch 17/100
3s - loss: 0.7724 - acc: 0.7347
Epoch 18/100
3s - loss: 0.7721 - acc: 0.7347
Epoch 19/100
3s - loss: 0.7718 - acc: 0.7347
Epoch 20/100
3s - loss: 0.7697 - acc: 0.7347
Epoch 21/100
3s - loss: 0.7704 - acc: 0.7347
Epo

100%|██████████| 5/5 [20:11<00:00, 246.42s/it]

3s - loss: 0.7138 - acc: 0.7446





In [9]:
slv_predicted_holdout = []
for slave in tqdm(slaves):
    slv_predicted_holdout.append([LABELS.index(p) for p in slave.predict(hold_out_stances)])

slv_predicted_test = []



for slave in tqdm(slaves):
    slv_predicted_test.append([LABELS.index(p) for p in slave.predict(test_dataset.stances)])


100%|██████████| 5/5 [00:03<00:00,  1.39it/s]
 20%|██        | 1/5 [00:00<00:01,  2.12it/s]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [17]:
tmp_slaves = [x for i,x in enumerate(slaves) if i!=1] 

slv_predicted_holdout = []
for slave in tqdm(tmp_slaves):
    slv_predicted_holdout.append([LABELS.index(p) for p in slave.predict(hold_out_stances)])

slv_predicted_test = []



for slave in tqdm(tmp_slaves):
    slv_predicted_test.append([LABELS.index(p) for p in slave.predict(test_dataset.stances)])


100%|██████████| 4/4 [00:03<00:00,  1.40it/s]
100%|██████████| 4/4 [00:07<00:00,  1.73s/it]


In [24]:
mdata = []
for j in master_train[0]:
    mdata.append([x for i,x in enumerate(j) if i!=2])
print(mdata[0])   

master = Master(d,mdata)
master.preload_features(d.stances)
master.fit(mdata)

[{'Stance': 'unrelated', 'Stance ID': 2, 'Headline': "Christian Bale passes on role of Steve Jobs, actor reportedly felt he wasn't right for part", 'Body ID': 137}, 3, 3, 3, 0]
48


In [39]:
for i,stance in enumerate(test_dataset.stances):
    stance['Stance ID'] = len(all_folds)+len(hold_out_stances)+i 


In [40]:
cpstances = list(test_dataset.stances)

for label,stance in zip(final_predictions,cpstances):
    stance['Stance'] = label
    del stance['Stance ID']

f = open('submission.csv', 'w')
f.write("Headline,Body ID,Stance")
w = csv.DictWriter(f, ["Headline","Body ID", "Stance"])
w.writerows(test_dataset.stances)
f.close()