In [1]:
import numpy as np
np.random.seed(1337) 

import random
random.seed(1337)

import csv
import sys

from scipy import spatial

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook as tqdm

from ensemble.GiorgosMyrianthous import GiorgosMyrianthous
from ensemble.JiashuPu import JiashuPu
from ensemble.FNCBaseLine import FNCBaseLine
from ensemble.Master import Master
from ensemble.MingjieChen import MingjieChen
from ensemble.XiaoxuanWang import XiaoxuanWang
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from upperbound import compute_ub
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
import pickle


Using TensorFlow backend.


In [2]:
# Build the default dataset and carve it into 2 folds plus a hold-out part
# before any competition-test data is merged into it.
d = DataSet()
folds, hold_out = kfold_split(d, n_folds=2)
fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

# Load the competition test set and merge its articles into the shared dataset.
test_dataset = DataSet("competition_test")
test_stances = test_dataset.stances
d.articles.update(test_dataset.articles)

# Shift every test 'Stance ID' by the number of stances already held by `d`
# (presumably so the IDs stay unique after the merge below — TODO confirm).
stance_id_offset = len(d.stances)
for stance in test_stances:
    stance['Stance ID'] += stance_id_offset

# The test stance dicts are shared (not copied) between `test_dataset` and `d`.
d.stances.extend(test_stances)

Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


In [3]:
# Placeholder binding: no cell in the visible notebook reads `master_classifier`.
master_classifier = None


In [4]:
# All folds concatenated into one array (everything except the hold-out part).
ids = list(range(len(folds)))
all_folds = np.hstack(tuple(fold_stances[i] for i in ids))

# Leave-one-fold-out splits: for each fold, train on the remaining folds and
# test on the fold itself.
train = {}
test = {}
for fold in fold_stances:
    ids = list(range(len(folds)))
    ids.pop(fold)  # `fold` is used as a list position here, i.e. an int index

    train[fold] = np.hstack(tuple(fold_stances[i] for i in ids))
    test[fold] = fold_stances[fold]

# Base ("slave") classifier classes; each is instantiated and trained later.
slave_classifiers = [
    XiaoxuanWang,
    JiashuPu,
    GiorgosMyrianthous,
    MingjieChen,
    FNCBaseLine,
]

# Per-fold containers filled by the cross-validation loop.
slv_predicted = {}
master_train = {}

In [5]:
# Sanity check: size of the concatenated folds, then of the hold-out part.
print(len(all_folds), len(hold_out_stances), sep="\n")

40350
9622


In [None]:
# Start from empty per-fold containers; re-running this cell discards any
# predictions collected so far.
slv_predicted, master_train = {}, {}

In [None]:
# For every fold, train each base classifier on the other folds and record its
# predictions (as LABELS indices) on the held-out fold. The per-fold rows
# (stance, prediction_1, ..., prediction_k) become the master's training data.
for fold in fold_stances:
    fold_predictions = slv_predicted[fold] = []
    master_train[fold] = []
    for slv in tqdm(slave_classifiers):
        print("Create classifier", slv, sep="")
        cls = slv(d, all_folds)

        print("Preload training data", type(cls), sep="")
        cls.preload_features(d.stances)

        print("Train on fold ", fold, " - ", type(cls), sep="")
        cls.train(train[fold])

        predicted_labels = cls.predict(test[fold])
        fold_predictions.append([LABELS.index(p) for p in predicted_labels])
        # Drop the reference before the next classifier is built.
        del cls

    # Transpose: one row per test stance, followed by every classifier's vote.
    master_train[fold] += zip(test[fold], *fold_predictions)

Create classifier<class 'ensemble.XiaoxuanWang.XiaoxuanWang'>


46298it [00:27, 1463.80it/s]

In [None]:
# Fit one instance of every base classifier on all folds; these fitted
# instances are the ones used for hold-out and test predictions.
slaves = []
for slv in tqdm(slave_classifiers):
    print("Training classifier", slv, sep="")
    cls = slv(d, all_folds)
    cls.preload_features(d.stances)
    cls.train(all_folds)
    slaves += [cls]

In [None]:
def predictxw(data):
    """Predict one stance label per item of ``data`` using ``slaves[1].mlpc``.

    Feature rows are obtained from ``slaves[1].xys(data)``. A row containing
    NaN or infinity is not passed to the model and is labelled 'unrelated'
    instead.

    Args:
        data: stances accepted by ``slaves[1].xys``.

    Returns:
        A list with one scalar label per feature row. Each model prediction is
        unwrapped from its one-row batch so that every entry has the same
        shape as the 'unrelated' fallback (previously the one-element result
        of ``predict`` was appended as-is).
    """
    # Only the feature rows are needed; the second value returned by xys is unused.
    Xs, _ = slaves[1].xys(data)
    prd = []
    for x in Xs:
        if np.all(np.isfinite(x)):
            # predict() receives a single-row batch; take its only element.
            # Assumes an sklearn-style predict that returns one entry per row.
            prd.append(slaves[1].mlpc.predict([x])[0])
        else:
            prd.append('unrelated')
    return prd

#slaves[1].predict = predictxw


# Predictions of every fitted base classifier on the hold-out stances,
# stored as LABELS indices (one list per classifier).
slv_predicted_holdout = []
for slave in tqdm(slaves):
    holdout_labels = slave.predict(hold_out_stances)
    slv_predicted_holdout.append([LABELS.index(label) for label in holdout_labels])

# Same for the competition test stances.
slv_predicted_test = []
for slave in tqdm(slaves):
    test_labels = slave.predict(test_dataset.stances)
    slv_predicted_test.append([LABELS.index(label) for label in test_labels])

In [None]:

#fold = 1
#mdata = []

#mdata.extend(master_train[fold])


#master = Master(d,mdata)
#master.preload_features(d.stances)
#master.fit(mdata)

#ho_predictions_predictions = master.predict(zip(hold_out_stances,*slv_predicted_holdout))
#report_score(master.xys(hold_out_stances)[1],ho_predictions_predictions)


In [None]:
# Master training rows: the per-fold (stance, base predictions...) rows of
# every fold, concatenated.
mdata = []
for fold in fold_stances:
    mdata += master_train[fold]

master = Master(d, mdata)
master.preload_features(d.stances)
master.fit(mdata)

# Score the master on the hold-out part, feeding it rows shaped like its
# training rows: a stance followed by each base classifier's prediction.
holdout_rows = zip(hold_out_stances, *slv_predicted_holdout)
ho_predictions_predictions = master.predict(holdout_rows)
holdout_truth = master.xys(hold_out_stances)[1]
report_score(holdout_truth, ho_predictions_predictions)


In [None]:
# Master predictions for the competition test set. Each input row is a test
# stance followed by every base classifier's predicted label index for it.
# Uses whichever `master` is currently bound in the kernel.
final_predictions = master.predict(zip(test_dataset.stances,*slv_predicted_test))

In [None]:
#tmp_slaves = [x for i,x in enumerate(slaves) if i!=1] 

#slv_predicted_holdout = []
#for slave in tqdm(tmp_slaves):
#    slv_predicted_holdout.append([LABELS.index(p) for p in slave.predict(hold_out_stances)])

#slv_predicted_test = []
#for slave in tqdm(tmp_slaves):
#    slv_predicted_test.append([LABELS.index(p) for p in slave.predict(test_dataset.stances)])

# Refit the master on the rows of a single fold only.
# NOTE(review): this rebinds `master` and `mdata`, so any later cell that uses
# them sees the single-fold model unless the all-folds fit is run again.
fold = 1
mdata = list(master_train[fold])

master = Master(d, mdata)
master.preload_features(d.stances)
master.fit(mdata)

In [None]:
# Fit the master on the concatenated rows of every fold.
mdata = []
for fold in fold_stances:
    mdata += master_train[fold]

master = Master(d, mdata)
master.preload_features(d.stances)
master.fit(mdata)

In [None]:
# Number of per-classifier prediction lists collected for the hold-out part
# (one list is appended per entry of `slaves`).
len(slv_predicted_holdout)

In [None]:
# Give the test stances consecutive IDs that start right after the training
# stances (all folds plus the hold-out part). Assigning rather than
# incrementing means re-running this cell yields the same IDs.
first_test_id = len(all_folds) + len(hold_out_stances)
for i, stance in enumerate(test_dataset.stances):
    stance['Stance ID'] = first_test_id + i


In [None]:
# Write the submission file: one row per test stance, with the stance's
# 'Stance' replaced by the master's predicted label.
#
# Each row is a fresh dict without 'Stance ID', so the stance dicts shared by
# `test_dataset` and `d` are left untouched and this cell can be re-run.
# (A shallow `list(...)` copy followed by `del stance['Stance ID']` would edit
# the shared dicts and fail with KeyError on the second run.)
cpstances = []
for label, stance in zip(final_predictions, test_dataset.stances):
    row = {key: value for key, value in stance.items() if key != 'Stance ID'}
    row['Stance'] = label
    cpstances.append(row)

# zip() stops at the shorter input; refuse to write a truncated submission.
if len(cpstances) != len(test_dataset.stances):
    raise ValueError(
        "got %d predictions for %d test stances"
        % (len(cpstances), len(test_dataset.stances))
    )

# newline='' is what the csv module expects for files it writes, and the
# `with` block closes the file even if writing fails.
with open('submission.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, ["Headline", "Body ID", "Stance"])
    # writeheader() ends the header with a line terminator; a bare
    # f.write("Headline,Body ID,Stance") would run into the first data row.
    w.writeheader()
    w.writerows(cpstances)

In [None]:
import pandas as pd

# Alternative way to produce the submission: attach `final_predictions` as the
# 'Stance' column of the unlabeled test CSV and save it as 'submission.csv'.
# NOTE(review): the assignment is positional, so it assumes the CSV rows are in the same order as the stances that were predicted — confirm.

df = pd.read_csv('test_stances_unlabeled.csv')
df['Stance'] = final_predictions
df.to_csv('submission.csv', index=False)

In [None]:
# Scratch cell: prints a constant and touches no notebook state.
print(1)

In [None]:
# Spot-check the master's input rows.
# Bare expression that is not the cell's last line: evaluated, never displayed.
master_train[0][0]
# Second training row of fold 0.
print(master_train[0][1])
# Last expression, so the notebook displays it: the second hold-out row in the
# (stance, base predictions...) layout that is passed to master.predict.
list(zip(hold_out_stances,*slv_predicted_holdout))[1]

In [None]:
# Score every base classifier on its own against the hold-out labels.
for sprd in slv_predicted_holdout:
    gold_labels = master.xys(hold_out_stances)[1]
    # Stored predictions are LABELS indices; map them back to label strings.
    slave_labels = [LABELS[idx] for idx in sprd]
    report_score(gold_labels, slave_labels)

In [None]:
def compute_ub2(slaves,stances):
    """Print an oracle score for an ensemble of classifiers.

    A stance earns full credit when at least one classifier predicted its true
    label, and partial credit (0.25) when the true label has an index other
    than 3 and at least one classifier predicted index 0, 1 or 2.

    Args:
        slaves: objects whose ``predict(stances)`` returns labels found in ``LABELS``.
        stances: dicts with a 'Stance' entry holding the true label.

    Prints three lines: the oracle score, the maximum attainable score, and
    their ratio. Returns nothing.
    """
    # Label index that is worth 0.25 instead of 1
    # (presumably LABELS.index('unrelated') — TODO confirm).
    low_weight_label = 3

    truths = [LABELS.index(stance['Stance']) for stance in stances]

    # One list of label indices per classifier, then transposed so that
    # votes[i] holds every classifier's prediction for stance i.
    per_classifier = [
        [LABELS.index(p) for p in classifier.predict(stances)]
        for classifier in slaves
    ]
    votes = list(zip(*per_classifier))

    oracle = 0
    maxscore = 0
    for idx, truth in enumerate(truths):
        full_credit = 0.25 if truth == low_weight_label else 1
        maxscore += full_credit

        guesses = votes[idx]
        if truth in guesses:
            # Some classifier predicted the exact label.
            oracle += full_credit
        elif truth != low_weight_label and any(g in guesses for g in (0, 1, 2)):
            # Nobody was exactly right, but some classifier predicted one of
            # the labels 0-2.
            oracle += 0.25

    print(oracle)
    print(maxscore)

    print(oracle/maxscore)

# Oracle score of the fitted base classifiers on the hold-out stances.
compute_ub2(slaves, hold_out_stances)