In [1]:
import csv
import sys
import numpy as np
from scipy import spatial

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

from ensemble.GiorgosMyrianthous import GiorgosMyrianthous
from ensemble.JiashuPu import JiashuPu
from ensemble.FNCBaseLine import FNCBaseLine
from ensemble.Master import Master
from ensemble.MingjieChen import MingjieChen
from ensemble.XiaoxuanWang import XiaoxuanWang
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from upperbound import compute_ub
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
import pickle

if __name__ == "__main__":
    # Driver for a two-level ("slave"/"master") ensemble:
    #   1. split the training stances into folds plus a hold-out set,
    #   2. for every fold, train each slave classifier on the other folds and
    #      record its predictions on the held-back fold,
    #   3. fit the master classifier on those per-fold slave predictions,
    #   4. retrain the slaves on all folds, score on the hold-out set,
    #   5. predict the test set and write the result to submission.csv.
    # Steps 2 and 4 are cached as pickles under features/.
    import os

    # Cache files for the expensive intermediate results.
    master_train_path = "features/master_train.pickle"
    slaves_path = "features/slaves.pickle"

    d = DataSet()
    folds,hold_out = kfold_split(d,n_folds=2)
    fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

    Xs = dict()
    ys = dict()

    master_classifier = None

    train = dict()
    test = dict()

    # Stances of every fold concatenated (the hold-out stances are kept apart).
    ids = list(range(len(folds)))
    all_folds = np.hstack(tuple([fold_stances[i] for i in ids]))

    # Per fold: train on all the *other* folds, test on this one.
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        train[fold] = np.hstack(tuple([fold_stances[i] for i in ids]))
        test[fold] = fold_stances[fold]

    # First-level classifiers whose predictions feed the master.
    slave_classifiers = [FNCBaseLine,XiaoxuanWang,JiashuPu,GiorgosMyrianthous,MingjieChen]

    slv_predicted = dict()
    master_train = dict()

    if not os.path.isfile(master_train_path):
        for fold in tqdm(fold_stances):
            slv_predicted[fold] = []
            master_train[fold] = []
            for slv in tqdm(slave_classifiers):
                print("Create classifier" + str(slv))
                cls = slv(d,all_folds)

                print("Preload training data" + str(type(cls)))
                cls.preload_features(d.stances)

                print("Train on fold " + str(fold) + " - " + str(type(cls)))
                cls.train(train[fold])

                # Store predictions as indices into LABELS, one list per slave.
                slv_predicted[fold].append([LABELS.index(p) for p in cls.predict(test[fold])])
                del cls

            # One master-training row per held-back stance:
            # (stance, slave_0 prediction, slave_1 prediction, ...).
            master_train[fold].extend(zip(test[fold], *slv_predicted[fold]))

        with open(master_train_path, "wb") as cache_file:
            pickle.dump(master_train, cache_file)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # produced by this script.
        with open(master_train_path, "rb") as cache_file:
            master_train = pickle.load(cache_file)

    slaves = []
    if not os.path.isfile(slaves_path):
        for slv in tqdm(slave_classifiers):
            # `slv` is already the class, so str(slv) names it; type(slv)
            # would only ever print <class 'type'>.
            print("Training classifier" + str(slv))
            cls = slv(d,all_folds)
            cls.preload_features(d.stances)
            cls.train(all_folds)
            slaves.append(cls)
            cls.delete_big_files()
        with open(slaves_path, "wb") as cache_file:
            pickle.dump(slaves, cache_file)
    else:
        # NOTE: same pickle caveat as above - trusted cache files only.
        with open(slaves_path, "rb") as cache_file:
            slaves = pickle.load(cache_file)


    for slave in slaves:
        print("Loading features for slave " + str(type(slave)))
        slave.preload_features(d.stances)
        slave.load_w2v()


    print("UPPER BOUND:::")
    compute_ub(slaves,hold_out_stances)

    # Flatten the per-fold rows into a single training set for the master.
    mdata = []
    for fold in fold_stances:
        mdata.extend(master_train[fold])
    master = Master(d,mdata)
    master.preload_features(d.stances)
    master.fit(mdata)

    # Evaluate the full stack on the hold-out stances.
    slv_predicted_holdout = []
    for slave in slaves:
        slv_predicted_holdout.append([LABELS.index(p) for p in slave.predict(hold_out_stances)])

    final_predictions = master.predict(zip(hold_out_stances,*slv_predicted_holdout))
    report_score(master.xys(hold_out_stances)[1],final_predictions)

    # Final run on the test set; its articles are merged into the training
    # dataset's article map.
    test_dataset = DataSet("test")
    d.articles.update(test_dataset.articles)

    # Offset the test Stance IDs by the number of training stances -
    # presumably so they cannot collide with training IDs in ID-keyed
    # feature caches (TODO confirm against the slave implementations).
    for stance in test_dataset.stances:
        stance['Stance ID'] += len(d.stances)


    slv_predicted_test = []
    for slave in slaves:
        slave.dataset.articles.update(test_dataset.articles)
        slave.prepare_final(d,test_dataset,all_folds)
        slave.preload_features(test_dataset.stances,"test.")
        slv_predicted_test.append([LABELS.index(p) for p in slave.predict(test_dataset.stances)])

    final_predictions = master.predict(zip(test_dataset.stances,*slv_predicted_test))

    for label,stance in zip(final_predictions,test_dataset.stances):
        stance['Stance'] = label
        # DictWriter rejects keys that are not in its fieldnames, so the
        # helper ID has to go before the rows are written.
        del stance['Stance ID']

    # The csv module writes str, so the file must be opened in text mode;
    # newline='' lets the csv writer control line endings itself.
    with open('submission.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, ["Headline","Body ID", "Stance"])
        # Header row so the file can be read back by column name.
        w.writeheader()
        # The rows are the stance dicts prepared in the loop above, not the
        # DataSet wrapper object.
        w.writerows(test_dataset.stances)
    

Using TensorFlow backend.


Reading dataset
Total stances: 49972
Total bodies: 1683
Loading features for slave <class 'ensemble.FNCBaseLine.FNCBaseLine'>
Loading features for slave <class 'ensemble.XiaoxuanWang.XiaoxuanWang'>
Loading features for slave <class 'ensemble.JiashuPu.JiashuPu'>
Loading features for slave <class 'ensemble.GiorgosMyrianthous.GiorgosMyrianthous'>
Loading features for slave <class 'ensemble.MingjieChen.MingjieChen'>
Embeddings: 208 x 300
UPPER BOUND:::
9437
9622
0.9807732280191228
49
49
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    507    |    10     |    219    |    26     |
-------------------------------------------------------------
| disagree  |    33     |    58     |    64     |     7     |
-------------------------------------------------------------
|  discuss  |    135    |    14     |   1586    |    65     |
----------------

25413it [00:29, 863.22it/s]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [2]:
# Debugging step after the ValueError above: re-run the test-feature preload
# for a single slave. Relies on `slave` and `test_dataset` still being in the
# kernel namespace from the previous cell (`slave` is whatever the interrupted
# loop last assigned), so this cell does not work on a fresh kernel.
slave.preload_features(test_dataset.stances,"test.")

100%|██████████| 25413/25413 [01:42<00:00, 248.93it/s]


In [18]:
# Print every key of `slave.tfidfs` whose stored value is not finite
# (NaN or +/-inf). `slave` comes from the kernel state left by earlier cells.
for k, tfidf_value in slave.tfidfs.items():
    if not np.isfinite(tfidf_value):
        print(k)


65654
65894
65997
66152
66376
66567
68205
68387
68630
68736
68802
69135
69829
69942
70248
70430
70434
71592
71668
71686
72077
72521
72600
73156
73305
73367
73753
73818
73983
74238
74418
74824
75003
50023
50110
50409
50437
50713
51000
51179
51241
51329
52218
52362
52709
52730
53038
53632
53811
53943
54313
54544
55065
55584
55774
56541
56622
56641
56676
56954
57058
57202
57410
57809
58631
58650
58964
59114
59177
59194
59358
59399
59421
59519
59537
59568
59786
60054
60155
60377
60393
60860
61140
61271
61983
62967
63121
63397
63730
63830
63999
64112
64269
64521
64538
65090
