In [1]:
cd ../src

/Users/williamferreira/Dropbox/mscproject/src


In [2]:
import numpy as np

In [7]:
from model.classifiers.lr_predictors import LogitPredictor, CompoundPredictor
from model.utils import get_dataset, split_data, RunCV, run_test

from model.baseline.transforms import (
    RefutingWordsTransform,
    QuestionMarkTransform,
    HedgingWordsTransform,
    InteractionTransform,
    NegationOfRefutingWordsTransform,
    BoWTransform,
    PolarityTransform,
    BrownClusterPairTransform
)

from model.ext.transforms import (
    AlignedPPDBSemanticTransform,
    NegationAlignmentTransform,
    Word2VecSimilaritySemanticTransform,
    DependencyRootDepthTransform,
    SVOTransform
)

In [8]:
# Registry of feature-transform factories keyed by a short name. Every value
# is a zero-argument callable (see `transforms[t]()` further down) that
# produces a new transform object.
transforms = dict(
    BoW=lambda: BoWTransform(),
    Q=QuestionMarkTransform,
    W2V=Word2VecSimilaritySemanticTransform,
    PPDB=AlignedPPDBSemanticTransform,
    NegAlgn=NegationAlignmentTransform,
    RootDep=DependencyRootDepthTransform,
    SVO=SVOTransform,
)

In [9]:
# Names of the transforms to include, in the order they are handed to the
# predictor. The feature-boundary bookkeeping below walks this list in the
# same order, so presumably it also fixes the feature-column layout.
inc_transforms = ['Q', 'BoW', 'W2V', 'PPDB', 'RootDep', 'NegAlgn', 'SVO']

In [10]:
# Predictor class that is instantiated as `p` below. Presumably kept as a
# separate name so an alternative (e.g. CompoundPredictor) can be swapped in
# from one place -- confirm.
predictor = LogitPredictor

In [11]:
# Load the train and test CSVs through get_dataset. Only the training set is
# split into X and y here; the test set is passed to run_test unsplit.
TRAIN_CSV = 'url-versions-2015-06-14-clean-train.csv'
TEST_CSV = 'url-versions-2015-06-14-clean-test.csv'

train_data = get_dataset(TRAIN_CSV)
X, y = split_data(train_data)
test_data = get_dataset(TEST_CSV)

In [12]:
# Build the predictor from the selected transform factories, then let
# run_test train it on (X, y) and report its performance on the test set.
selected_transforms = [transforms[name] for name in inc_transforms]
p = predictor(selected_transforms)
test_score = run_test(X, y, test_data, p, display=True)

>> Training classifier <<

>> Classifying test data <<

Confusion matrix:
           for  against  observing
for        219        3         24
against     15       64         12
observing   75       11        101

Measures:
accuracy: 0.7328

Per class:
            accuracy  precision     recall         F1
for        0.7767176  0.7087379  0.8902439  0.7891892
against    0.9217557  0.8205128  0.7032967  0.7573964
observing  0.7671756  0.7372263   0.540107  0.6234568


  if e in SVOTransform._entailment_map.keys() and x == w]


In [13]:
# For every selected transform, fit a fresh instance on X and record how many
# feature columns its output has.
feature_sizes = []
for name in inc_transforms:
    fitted = transforms[name]().fit(X)
    feature_sizes.append(fitted.transform(X).shape[1])

In [14]:
# Cumulative column offsets: a leading 0 followed by the running total of the
# per-transform feature counts.
boundaries = np.concatenate(([0], np.cumsum(feature_sizes)))

In [15]:
# Map each (start, stop) column range to the name of the transform it belongs
# to; consecutive entries of `boundaries` delimit one transform each.
feature_boundaries = {
    (start, stop): name
    for start, stop, name in zip(boundaries[:-1], boundaries[1:], inc_transforms)
}

In [16]:
# Display the (start, stop) column range -> transform name mapping.
feature_boundaries

{(0, 1): 'Q',
 (1, 501): 'BoW',
 (501, 502): 'W2V',
 (502, 503): 'PPDB',
 (503, 505): 'RootDep',
 (505, 506): 'NegAlgn',
 (506, 518): 'SVO'}

In [17]:
def in_range(x, r):
    """Tell whether `x` lies in the half-open interval [r[0], r[1]).

    `r` is any indexable pair; the lower bound is inclusive and the upper
    bound is exclusive.
    """
    above_lower = r[0] <= x
    # The upper bound is only consulted once the lower bound check passed.
    return above_lower and x < r[1]

In [18]:
def map_important_features(f, boundaries=None):
    """Group feature column indices by the transform whose range contains them.

    Parameters
    ----------
    f : iterable of int
        Feature column indices to classify.
    boundaries : dict, optional
        Mapping of ``(start, stop)`` half-open column ranges to transform
        names. Defaults to the notebook-level ``feature_boundaries``.

    Returns
    -------
    dict
        Transform name -> set of the indices from `f` that fall inside that
        transform's range. Transforms with no matching index are absent, and
        indices outside every range are dropped.
    """
    if boundaries is None:
        boundaries = feature_boundaries
    grouped = {}
    for i in f:
        for (start, stop), name in boundaries.items():
            # Half-open test: start is inclusive, stop is exclusive.
            if start <= i < stop:
                grouped.setdefault(name, set()).add(i)
    return grouped

In [19]:
# Class labels of the fitted classifier; positions in this array are used
# below to pick rows of p.classifier.coef_.
p.classifier.classes_

array(['against', 'for', 'observing'], dtype=object)

In [50]:
# Position of the 'for' label within classes_.
np.where(p.classifier.classes_ == 'for')[0][0]

1

In [125]:
# Column indices whose coefficient in row 1 of coef_ is negative, followed by
# the names of the transforms those columns belong to.
# NOTE(review): row 1 is presumably the 'for' class, going by the classes_
# lookup above -- confirm before relying on the hard-coded index.
important_features = np.flatnonzero(p.classifier.coef_[1] < 0)
map_important_features(important_features).keys()

['Q', 'BoW']

In [52]:
# Stand-alone bag-of-words transform fitted on X; get_bow reads its
# vocabulary through bowt.cv.
bowt = BoWTransform()
bowt.fit(X).transform(X)

<2071x500 sparse matrix of type '<type 'numpy.int64'>'
	with 15529 stored elements in Compressed Sparse Row format>

In [122]:
def get_bow(label, depth=20, classifier=None, feature_names=None, bow_range=(1, 501)):
    """Return the bag-of-words terms with the largest coefficients for `label`.

    Parameters
    ----------
    label : str
        Class label to look up in ``classifier.classes_``.
    depth : int, optional
        Number of top-ranked terms to return.
    classifier : object, optional
        Fitted classifier exposing ``classes_`` and a 2-D ``coef_``.
        Defaults to the notebook-level ``p.classifier``.
    feature_names : sequence of str, optional
        Vocabulary terms, one per bag-of-words column. Defaults to
        ``bowt.cv.get_feature_names()``.
    bow_range : tuple of (int, int), optional
        Half-open ``(start, stop)`` column range of the bag-of-words block
        inside ``coef_``. The default matches the 'BoW' entry shown by
        ``feature_boundaries``.

    Returns
    -------
    numpy.ndarray
        Up to `depth` terms, ordered from largest to smallest coefficient.

    Raises
    ------
    IndexError
        If `label` is not one of the classifier's classes.

    NOTE(review): by default the vocabulary comes from the separately fitted
    ``bowt``, not from the transform inside ``p``; this assumes both were
    fitted on the same data -- confirm.
    """
    clf = p.classifier if classifier is None else classifier
    if feature_names is None:
        feature_names = bowt.cv.get_feature_names()
    start, stop = bow_range
    idx = np.where(clf.classes_ == label)[0][0]
    # Slicing coef_ to the bag-of-words block already re-bases positions to
    # 0..(stop - start - 1), i.e. direct vocabulary indices. The previous
    # extra "- 1" shifted every term by one and sent position 0 to the last
    # vocabulary entry.
    bow_idx = np.argsort(clf.coef_[idx, start:stop])[::-1][:depth]
    return np.array(feature_names)[bow_idx]

In [150]:
# Check whether the token appears anywhere in the ranked bag-of-words terms
# for the 'against' class.
# NOTE(review): the entries displayed from bowt.cv.vocabulary_ are all
# lower-case, so a capitalised 'Refute' presumably can never match --
# confirm whether 'refute' was intended.
'Refute' in get_bow('against', 500)

False

In [47]:
# Column indices from important_features that fall inside the 'BoW' range.
map_important_features(important_features)['BoW']

{8,
 12,
 19,
 22,
 28,
 32,
 33,
 35,
 36,
 37,
 39,
 42,
 46,
 50,
 65,
 69,
 70,
 80,
 87,
 88,
 89,
 91,
 102,
 109,
 113,
 116,
 129,
 131,
 133,
 135,
 147,
 148,
 149,
 150,
 152,
 153,
 154,
 173,
 174,
 177,
 179,
 180,
 181,
 194,
 199,
 203,
 204,
 209,
 215,
 219,
 220,
 226,
 229,
 231,
 232,
 236,
 239,
 243,
 252,
 262,
 263,
 265,
 267,
 272,
 275,
 279,
 282,
 283,
 284,
 296,
 300,
 317,
 329,
 332,
 343,
 345,
 346,
 347,
 348,
 349,
 356,
 359,
 361,
 363,
 364,
 381,
 382,
 390,
 396,
 397,
 398,
 400,
 401,
 411,
 415,
 416,
 422,
 425,
 427,
 432,
 438,
 442,
 446,
 455,
 461,
 473,
 474,
 480,
 484}

In [166]:
# Rebind `bowt` to a fresh BoWTransform; this replaces the instance that
# get_bow reads its vocabulary from.
bowt = BoWTransform()

In [168]:
# Fit the transform on the current X and display the transformed matrix.
# NOTE(review): the recorded outputs of the two bowt fits in this notebook
# show different row counts (2071 vs 2086), so X was apparently changed
# between runs; the vocabulary may no longer line up with the coefficients
# of `p` -- confirm by re-running from a fresh kernel.
bowt.fit(X).transform(X)

<2086x500 sparse matrix of type '<type 'numpy.int64'>'
	with 15858 stored elements in Compressed Sparse Row format>

In [170]:
# Term -> column index mapping of the fitted bag-of-words vocabulary.
# NOTE(review): this displays the whole mapping; consider showing only a
# slice to keep the notebook output short.
bowt.cv.vocabulary_

{u'000': 0,
 u'10': 1,
 u'100': 2,
 u'12': 3,
 u'12 inch': 4,
 u'2015': 5,
 u'300': 6,
 u'500': 7,
 u'abdel': 8,
 u'about': 9,
 u'abu': 10,
 u'abu bakr': 11,
 u'accidentally': 12,
 u'afghan': 13,
 u'afghan soldiers': 14,
 u'after': 15,
 u'after he': 16,
 u'against': 17,
 u'air': 18,
 u'airport': 19,
 u'airstrike': 20,
 u'airstrikes': 21,
 u'al': 22,
 u'al baghdadi': 23,
 u'all': 24,
 u'alleged': 25,
 u'allegedly': 26,
 u'amazon': 27,
 u'american': 28,
 u'an': 29,
 u'and': 30,
 u'angry': 31,
 u'ankles': 32,
 u'app': 33,
 u'apple': 34,
 u'apple watch': 35,
 u'are': 36,
 u'arrest': 37,
 u'arrested': 38,
 u'as': 39,
 u'at': 40,
 u'at least': 41,
 u'attack': 42,
 u'audio': 43,
 u'baby': 44,
 u'back': 45,
 u'baghdadi': 46,
 u'bakr': 47,
 u'bakr al': 48,
 u'bale': 49,
 u'bank': 50,
 u'bank hank': 51,
 u'banksy': 52,
 u'batmobile': 53,
 u'battery': 54,
 u'battery life': 55,
 u'be': 56,
 u'bear': 57,
 u'bear attack': 58,
 u'beats': 59,
 u'beats music': 60,
 u'because': 61,
 u'been': 62,
 u'befo

In [181]:
# Total number of strictly positive coefficients over all classes. Counting
# across the whole coef_ matrix avoids hard-coding the number of classes.
np.count_nonzero(p.classifier.coef_ > 0)

353