In [1]:
import csv
import sys
import numpy as np
from scipy import spatial

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

from ensemble.GiorgosMyrianthous import GiorgosMyrianthous
from ensemble.JiashuPu import JiashuPu
from ensemble.FNCBaseLine import FNCBaseLine
from ensemble.Master import Master
from ensemble.MingjieChen import MingjieChen
from ensemble.XiaoxuanWang import XiaoxuanWang
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from upperbound import compute_ub
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
import pickle

Using TensorFlow backend.


In [2]:
# Load the default dataset and split it into 2 folds plus a hold-out set.
d = DataSet()
folds, hold_out = kfold_split(d, n_folds=2)
fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

# Load the "test" dataset and merge its articles into the main dataset.
test_dataset = DataSet("test")
d.articles.update(test_dataset.articles)

# Shift every test 'Stance ID' by the number of stances currently in `d`
# (presumably so the IDs stay distinct once both lists are merged).
# NOTE: this mutates the stance dicts in place, so re-running the cell on the
# same objects would shift the IDs again.
test_stances = test_dataset.stances
stance_id_offset = len(d.stances)
for stance in test_stances:
    stance['Stance ID'] += stance_id_offset

# Append the test stances to the main dataset's stance list.
d.stances.extend(test_stances)

Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


In [3]:
# Placeholder binding; no other visible cell reads `master_classifier`.
master_classifier = None


In [4]:
# Concatenate the stances of every fold into one array used for the final fit.
ids = list(range(len(folds)))
all_folds = np.hstack(tuple(fold_stances[i] for i in ids))

# For each fold: train[fold] holds the stances of all *other* folds,
# test[fold] holds the stances of the fold itself.
train = {}
test = {}

for fold in fold_stances:
    ids = list(range(len(folds)))
    ids.pop(fold)

    train[fold] = np.hstack(tuple(fold_stances[i] for i in ids))
    test[fold] = fold_stances[fold]

# Classifier classes used as slaves. XiaoxuanWang, JiashuPu and
# GiorgosMyrianthous are imported but left out of this list.
slave_classifiers = [FNCBaseLine, MingjieChen]

# Per-fold containers filled by the stacking loop.
slv_predicted = {}
master_train = {}

In [5]:
# Sanity check: number of stances in the combined folds and in the hold-out set.
print(len(all_folds))
print(len(hold_out_stances))

40350
9622


In [6]:
# Start from empty per-fold stores before (re-)running the stacking loop.
slv_predicted = {}
master_train = {}

In [7]:
# Build the master's training rows: for each fold, train every slave on
# train[fold] and record its predictions on test[fold].
# NOTE: the entries for a fold are created empty at the start of that fold and
# master_train[fold] is only filled after all slaves have run. Interrupting
# this cell therefore leaves the current fold's master_train entry empty and
# any later fold missing from both dicts.
for fold in fold_stances:
    slv_predicted[fold] = []
    master_train[fold] = []
    for slv in tqdm(slave_classifiers):
        # `slv` is a classifier class; a fresh instance is built for every fold.
        print("Create classifier" + str(slv))
        cls = slv(d,all_folds)

        print("Preload training data" + str(type(cls)))
        cls.preload_features(d.stances)

        print("Train on fold " + str(fold) + " - " + str(type(cls)))
        cls.train(train[fold])

        # Store this slave's predictions as integer indices into LABELS.
        slv_predicted[fold].append([LABELS.index(p) for p in cls.predict(test[fold])])
        # Drop this cell's reference to the classifier before building the next one.
        del cls

    # One row per stance of the fold: (stance, slave 0 prediction, slave 1 prediction, ...).
    master_train[fold].extend(zip(test[fold], *slv_predicted[fold]))

  0%|          | 0/2 [00:00<?, ?it/s]

Create classifier<class 'ensemble.FNCBaseLine.FNCBaseLine'>
Preload training data<class 'ensemble.FNCBaseLine.FNCBaseLine'>
Train on fold 0 - <class 'ensemble.FNCBaseLine.FNCBaseLine'>
      Iter       Train Loss   Remaining Time 
         1       19440.4406           36.45s
         2       17429.9157           35.10s
         3       15805.5363           35.57s
         4       14471.4253           34.56s
         5       13366.0453           34.46s
         6       12450.3847           34.09s
         7       11676.1519           34.71s
         8       11027.2941           34.11s
         9       10483.4901           33.76s
        10       10016.2165           33.11s
        20        7773.9634           30.27s
        30        7111.8698           28.24s
        40        6829.5898           26.10s
        50        6648.6999           25.16s
        60        6516.1311           24.84s
        70        6419.0088           24.69s
        80        6340.5562           22.41s
    

 50%|█████     | 1/2 [00:36<00:36, 36.38s/it]

Create classifier<class 'ensemble.MingjieChen.MingjieChen'>
207
Embeddings: 208 x 300



  0%|          | 0/75385 [00:00<?, ?it/s][A
  0%|          | 57/75385 [00:00<02:15, 557.56it/s][A
  0%|          | 98/75385 [00:00<02:30, 499.60it/s]

Preload training data<class 'ensemble.MingjieChen.MingjieChen'>


[A
  0%|          | 172/75385 [00:00<02:16, 552.43it/s][A
  0%|          | 231/75385 [00:00<02:13, 562.07it/s][A
  0%|          | 289/75385 [00:00<02:12, 566.90it/s][A
  0%|          | 361/75385 [00:00<02:03, 605.24it/s][A
  1%|          | 428/75385 [00:00<02:00, 620.88it/s][A
  1%|          | 489/75385 [00:00<02:01, 617.30it/s][A
  1%|          | 558/75385 [00:00<01:57, 636.10it/s][A
  1%|          | 621/75385 [00:01<02:00, 621.38it/s][A
  1%|          | 685/75385 [00:01<01:59, 626.24it/s][A
  1%|          | 747/75385 [00:01<02:27, 504.76it/s][A
  1%|          | 802/75385 [00:01<02:25, 513.28it/s][A
  1%|          | 882/75385 [00:01<02:09, 575.04it/s][A
  1%|▏         | 947/75385 [00:01<02:05, 591.89it/s][A
  1%|▏         | 1011/75385 [00:01<02:03, 603.26it/s][A
  1%|▏         | 1099/75385 [00:01<01:51, 664.90it/s][A
  2%|▏         | 1172/75385 [00:01<01:48, 681.40it/s][A
  2%|▏         | 1247/75385 [00:02<01:46, 697.98it/s][A
  2%|▏         | 1319/75385 [00:02<01:46

Train on fold 0 - <class 'ensemble.MingjieChen.MingjieChen'>
training with validation data
Train on 15829 samples, validate on 3958 samples
Epoch 1/100
3s - loss: 0.8458 - acc: 0.7269 - val_loss: 0.8012 - val_acc: 0.7281
Epoch 2/100
2s - loss: 0.8116 - acc: 0.7294 - val_loss: 0.7998 - val_acc: 0.7281
Epoch 3/100
2s - loss: 0.8102 - acc: 0.7294 - val_loss: 0.8000 - val_acc: 0.7281
Epoch 4/100
2s - loss: 0.8105 - acc: 0.7294 - val_loss: 0.7990 - val_acc: 0.7281
Epoch 5/100
2s - loss: 0.8098 - acc: 0.7294 - val_loss: 0.7973 - val_acc: 0.7281
Epoch 6/100
2s - loss: 0.8084 - acc: 0.7294 - val_loss: 0.7972 - val_acc: 0.7281
Epoch 7/100
1s - loss: 0.8076 - acc: 0.7294 - val_loss: 0.7951 - val_acc: 0.7281
Epoch 8/100
2s - loss: 0.8065 - acc: 0.7294 - val_loss: 0.7977 - val_acc: 0.7281
Epoch 9/100
1s - loss: 0.8051 - acc: 0.7294 - val_loss: 0.7922 - val_acc: 0.7281
Epoch 10/100
2s - loss: 0.8043 - acc: 0.7294 - val_loss: 0.7904 - val_acc: 0.7281
Epoch 11/100
2s - loss: 0.8021 - acc: 0.7294 - va

KeyboardInterrupt: 

In [8]:
# Train one instance of every slave classifier on all folds combined; the
# fitted instances are kept in `slaves` for the hold-out and test predictions.
slaves = []
for slv in tqdm(slave_classifiers):
    # `slv` is the classifier class itself, so log it directly. type(slv) is
    # just `type`, which printed "<class 'type'>" for every classifier.
    print("Training classifier" + str(slv))
    cls = slv(d,all_folds)
    cls.preload_features(d.stances)
    cls.train(all_folds)
    slaves.append(cls)


  0%|          | 0/2 [00:00<?, ?it/s][A

Training classifier<class 'type'>
      Iter       Train Loss   Remaining Time 
         1       39332.1113            2.88m
         2       35206.0548            2.40m
         3       31873.0683            2.22m
         4       29144.4553            2.06m
         5       26891.9285            1.96m
         6       25015.7366            1.87m
         7       23429.3842            1.80m
         8       22110.6335            1.77m
         9       20996.5240            1.72m
        10       20043.8812            1.69m





        20       15575.8171            1.52m
        30       14241.4870            1.40m
        40       13710.3719            1.28m
        50       13404.0874            1.19m
        60       13196.2747            1.14m
        70       13040.1711            1.08m
        80       12912.6287            1.00m
        90       12808.8300           55.07s
       100       12711.9986           49.88s


 50%|█████     | 1/2 [01:40<01:40, 100.57s/it]

       200       12004.0626            0.00s
Training classifier<class 'type'>
207
Embeddings: 208 x 300
training with validation data
Train on 32280 samples, validate on 8070 samples
Epoch 1/100
4s - loss: 0.8047 - acc: 0.7353 - val_loss: 0.8091 - val_acc: 0.7260
Epoch 2/100
3s - loss: 0.7872 - acc: 0.7369 - val_loss: 0.8078 - val_acc: 0.7260
Epoch 3/100
3s - loss: 0.7864 - acc: 0.7369 - val_loss: 0.8094 - val_acc: 0.7260
Epoch 4/100
3s - loss: 0.7855 - acc: 0.7369 - val_loss: 0.8063 - val_acc: 0.7260
Epoch 5/100
3s - loss: 0.7837 - acc: 0.7369 - val_loss: 0.8037 - val_acc: 0.7260
Epoch 6/100
3s - loss: 0.7818 - acc: 0.7369 - val_loss: 0.8001 - val_acc: 0.7260
Epoch 7/100
3s - loss: 0.7811 - acc: 0.7369 - val_loss: 0.8012 - val_acc: 0.7260
Epoch 8/100
3s - loss: 0.7786 - acc: 0.7369 - val_loss: 0.7986 - val_acc: 0.7260
Epoch 9/100
3s - loss: 0.7769 - acc: 0.7369 - val_loss: 0.7927 - val_acc: 0.7260
Epoch 10/100
3s - loss: 0.7772 - acc: 0.7369 - val_loss: 0.7934 - val_acc: 0.7260
Epoch

100%|██████████| 2/2 [09:27<00:00, 210.36s/it]

3s - loss: 0.7201 - acc: 0.7423 - val_loss: 0.7322 - val_acc: 0.7338





In [9]:
def predictxw(data):
    """Predict with the second slave's ``mlpc`` model, one feature row at a time.

    The feature rows come from ``slaves[1].xys(data)``. A row whose values are
    all finite is passed on its own to ``slaves[1].mlpc.predict``; any row
    containing a non-finite value is answered with ``'unrelated'`` instead.

    Returns:
        A list with one entry per feature row: either whatever
        ``mlpc.predict([row])`` returned or the string ``'unrelated'``.
    """
    slave = slaves[1]
    rows, _ = slave.xys(data)
    return [
        slave.mlpc.predict([row]) if np.all(np.isfinite(row)) else 'unrelated'
        for row in rows
    ]

#slaves[1].predict = predictxw


# Predictions of every trained slave on the hold-out stances, encoded as
# indices into LABELS (one inner list per slave, in `slaves` order).
slv_predicted_holdout = [
    [LABELS.index(p) for p in slave.predict(hold_out_stances)]
    for slave in tqdm(slaves)
]

# The same encoding for the test dataset's stances.
slv_predicted_test = [
    [LABELS.index(p) for p in slave.predict(test_dataset.stances)]
    for slave in tqdm(slaves)
]

100%|██████████| 2/2 [00:01<00:00,  1.28it/s]
100%|██████████| 2/2 [00:04<00:00,  1.87s/it]


In [10]:

#fold = 1
#mdata = []

#mdata.extend(master_train[fold])


#master = Master(d,mdata)
#master.preload_features(d.stances)
#master.fit(mdata)

#ho_predictions_predictions = master.predict(zip(hold_out_stances,*slv_predicted_holdout))
#report_score(master.xys(hold_out_stances)[1],ho_predictions_predictions)


In [11]:
# Gather the master's training rows from every fold.
# This raises KeyError if a fold is missing from master_train.
mdata = [row for fold in fold_stances for row in master_train[fold]]

# Fit the master classifier on the stacked slave predictions.
master = Master(d, mdata)
master.preload_features(d.stances)
master.fit(mdata)

# Predict the hold-out set from (stance, slave predictions...) rows and report
# the score against the labels returned by master.xys.
holdout_rows = zip(hold_out_stances, *slv_predicted_holdout)
ho_predictions_predictions = master.predict(holdout_rows)
holdout_actual = master.xys(hold_out_stances)[1]
report_score(holdout_actual, ho_predictions_predictions)


KeyError: 1

In [None]:
# Master predictions for the test set; each input row pairs a test stance with
# the encoded prediction of every slave for that stance.
final_predictions = master.predict(zip(test_dataset.stances,*slv_predicted_test))

In [None]:
#tmp_slaves = [x for i,x in enumerate(slaves) if i!=1] 

#slv_predicted_holdout = []
#for slave in tqdm(tmp_slaves):
#    slv_predicted_holdout.append([LABELS.index(p) for p in slave.predict(hold_out_stances)])

#slv_predicted_test = []
#for slave in tqdm(tmp_slaves):
#    slv_predicted_test.append([LABELS.index(p) for p in slave.predict(test_dataset.stances)])

# Fit the master on the rows of a single fold (fold 1) only.
fold = 1
mdata = list(master_train[fold])

master = Master(d, mdata)
master.preload_features(d.stances)
master.fit(mdata)

In [None]:
# Fit the master on the stacked rows of all folds.
mdata = [row for fold in fold_stances for row in master_train[fold]]

master = Master(d, mdata)
master.preload_features(d.stances)
master.fit(mdata)

In [None]:
# Number of per-slave prediction lists collected for the hold-out set.
len(slv_predicted_holdout)

In [None]:
# Number the test stances consecutively, starting right after the combined
# count of fold stances and hold-out stances.
first_test_id = len(all_folds) + len(hold_out_stances)
for stance_id, stance in enumerate(test_dataset.stances, start=first_test_id):
    stance['Stance ID'] = stance_id


In [None]:
# Attach the master's predicted label to every test stance and write the
# submission file with the columns Headline, Body ID, Stance.

# Shallow copy: the stance dicts are shared with test_dataset.stances, so the
# updates below are visible through both lists.
cpstances = list(test_dataset.stances)

for label,stance in zip(final_predictions,cpstances):
    stance['Stance'] = label
    # DictWriter rejects keys outside its field list, so drop 'Stance ID'.
    # pop() with a default keeps the cell re-runnable (del raised KeyError on
    # the second run).
    stance.pop('Stance ID', None)

# newline='' is what the csv module asks for when handing it a file object.
# The explicit encoding keeps the output independent of the platform default.
# The with-block closes the file even if writing fails.
with open('submission.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, ["Headline","Body ID", "Stance"])
    # writeheader() terminates the header row. The previous manual f.write()
    # had no trailing newline, so the header ran into the first data row.
    w.writeheader()
    w.writerows(test_dataset.stances)

In [None]:
import pandas as pd

# Alternative submission writer: read the unlabeled test stances, add the
# predicted labels as a 'Stance' column and write the result out.
# Note that this writes to the same 'submission.csv' path as the csv-module cell.
df = pd.read_csv('test_stances_unlabeled.csv').assign(Stance=final_predictions)
df.to_csv('submission.csv', index=False)

In [None]:
# Scratch cell: prints a constant (looks like a kernel liveness check).
print(1)

In [None]:
# Inspection cell comparing the shape of a master training row with a hold-out input row.
# This bare expression is evaluated but not displayed (only the cell's last expression is).
master_train[0][0]
print(master_train[0][1])
# Second (stance, slave predictions...) row of the hold-out input, shown as the cell output.
list(zip(hold_out_stances,*slv_predicted_holdout))[1]

In [None]:
# Report the hold-out score of each slave on its own: decode its integer
# predictions back to label strings and compare them with the hold-out labels.
for sprd in slv_predicted_holdout:
    holdout_actual = master.xys(hold_out_stances)[1]
    slave_labels = [LABELS[idx] for idx in sprd]
    report_score(holdout_actual, slave_labels)

In [None]:
def compute_ub2(slaves,stances):
    """Print an oracle upper bound for an ensemble of classifiers.

    For every stance the oracle is credited as if the best prediction among
    all classifiers had been chosen:

    * true label index 3 (worth 0.25; the unrelated class according to the
      original scoring comment -- confirm against ``LABELS``): 0.25 if any
      classifier predicted 3;
    * any other true label (worth 1): 1 if any classifier predicted it exactly,
      otherwise 0.25 if any classifier predicted one of the indices 0, 1 or 2.

    Args:
        slaves: iterable of classifiers exposing ``predict(stances)``, which
            must return one label from ``LABELS`` per stance.
        stances: sequence of dicts whose ``'Stance'`` value is in ``LABELS``.

    Returns:
        None. Three lines are printed: the oracle score, the maximum
        achievable score and their ratio (``0.0`` when the maximum is zero,
        e.g. for an empty ``stances``).
    """
    actual = [LABELS.index(stance['Stance']) for stance in stances]

    predicted = [
        [LABELS.index(p) for p in classifier.predict(stances)]
        for classifier in slaves
    ]

    # Transpose to one tuple of predictions per stance. With no classifiers
    # zip(*[]) yields nothing, so give every stance an empty tuple instead of
    # failing on the lookup below.
    if predicted:
        predicted = list(zip(*predicted))
    else:
        predicted = [()] * len(actual)

    oracle = 0
    maxscore = 0
    for i, cls in enumerate(actual):
        guesses = predicted[i]

        if cls == 3:
            maxscore += 0.25
            if cls in guesses:
                oracle += 0.25
        else:
            maxscore += 1
            if cls in guesses:
                oracle += 1
            elif any(g in (0, 1, 2) for g in guesses):
                # Wrong label, but one of the indices 0-2 was predicted: partial credit.
                oracle += 0.25

    print(oracle)
    print(maxscore)

    # Avoid ZeroDivisionError when there is nothing to score.
    print(oracle/maxscore if maxscore else 0.0)

# Print the oracle upper bound of the trained slaves on the hold-out stances.
compute_ub2(slaves, hold_out_stances)