In [1]:
import gensim
import numpy as np
import sklearn
from sklearn import metrics
from sklearn import svm
from sklearn.externals import joblib

In [3]:
# Load the pretrained word2vec vectors from the GoogleNews binary file.
# NOTE(review): `model` is not referenced by any cell visible in this
# section — confirm it is still needed.
model = gensim.models.KeyedVectors.load_word2vec_format('vectors/GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
from sklearn.externals import joblib
import pandas as pd
from sklearn.metrics import accuracy_score

# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load model files that come from a trusted source.
clf = joblib.load('models/svm.pkl')

# Both evaluation splits are read as headerless, tab-separated files.
tsv_format = dict(sep='\t', header=None)
test_vectorized = pd.read_csv('lexical_entailment/bless2011/data_lex_test_vectorized.tsv', **tsv_format)
val_vectorized = pd.read_csv('lexical_entailment/bless2011/data_lex_val_vectorized.tsv', **tsv_format)

# Test and validation in one go bc i'm lazy
for test_name, test_df in zip(['test', 'val'], [test_vectorized, val_vectorized]):
	orig_rows, orig_cols = test_df.shape

	# Remove rows with NaN
	test_df.dropna(axis=0, inplace=True)

	# Count number of rows removed
	diff = orig_rows - test_df.shape[0]

	X = test_df.iloc[:, :-1]
	y = test_df.iloc[:, -1]

	preds = clf.predict(X)

	num_correct = accuracy_score(y, preds, normalize=False)

	print test_name, ": percentage non-nan correct:", num_correct/float(test_df.shape[0]) 
	print test_name, ": percentage correct overall", num_correct/float(orig_rows)
    

test : percentage non-nan correct: 0.923381396664
test : percentage correct overall 0.894794520548
val : percentage non-nan correct: 0.931343283582
val : percentage correct overall 0.871508379888


In [85]:
# Read the `_asym` training vectors and discard any row containing NaN.
train = pd.read_csv(
    'lexical_entailment/bless2011/data_lex_train_vectorized_asym.tsv',
    sep='\t',
    header=None
).dropna(axis=0)

# Every column but the last is a feature; the last column is the target.
X, y = train.iloc[:, :-1], train.iloc[:, -1]

# Linear SVM with class weights set to 'balanced'.
clf = svm.LinearSVC(class_weight='balanced')
clf.fit(X, y)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [99]:
# Read three headerless, tab-separated files in order: the `_asym` test
# vectors, the non-vectorized test file, and the `_asym` validation vectors.
test_vectorized, test, val_vectorized = (
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in (
        'lexical_entailment/bless2011/data_lex_test_vectorized_asym.tsv',
        'lexical_entailment/bless2011/data_lex_test.tsv',
        'lexical_entailment/bless2011/data_lex_val_vectorized_asym.tsv',
    )
)

In [107]:
# NOTE: train_df is bound to the *validation* frame. Both names alias the
# frames loaded earlier, so the in-place dropna below also changes
# val_vectorized and test_vectorized.
train_df, test_df = val_vectorized, test_vectorized
for frame in (train_df, test_df):
    frame.dropna(axis=0, inplace=True)

# Every column but the last is a feature; the last column is the target.
X, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]

In [101]:
# Predict with `clf` on whatever `X` is bound to when this cell runs.
# NOTE(review): the execution counts around this cell are out of order
# (In [107] appears before In [101]) — confirm which X produced the saved
# metrics before relying on them.
preds = clf.predict(X)

In [102]:
print "precision", metrics.precision_score(y, preds)
print "recall", metrics.recall_score(y, preds)
print "f1", metrics.f1_score(y, preds)
print "True", metrics.accuracy_score(y[y == 1], preds[y == 1])
print "False", metrics.accuracy_score(y[y == 0], preds[y == 0])

precision 0.806451612903
recall 0.892857142857
f1 0.847457627119
True 0.892857142857
False 0.980456026059


In [103]:
# Take the first row of the fitted classifier's weights, truncated to its
# first 301 entries; `reject` reads this as the direction to remove.
# NOTE(review): 301 is a magic number — presumably the feature count (the
# frame shown at the end of this section has columns 0..300); confirm.
p_hat = clf.coef_[0][:301]

In [104]:
def normalize(v):
    '''Scale a vector to unit length, using the norm from np.linalg.norm.

    A vector whose norm is exactly zero is returned unchanged, which avoids
    dividing by zero.
    '''
    magnitude = np.linalg.norm(v)
    return v if magnitude == 0 else v / magnitude
def reject(A, B=None):
    '''Remove from A its component along B (the vector rejection of A from B).

    Parameters
    ----------
    A : array-like
        Vector to clean; must have the same length as B.
    B : array-like, optional
        Direction to remove. When omitted, the notebook-level ``p_hat`` is
        used, which matches the original single-argument behaviour.

    Returns
    -------
    A minus its projection onto the normalized B. If B has zero norm the
    projection is zero, so the result equals A.
    '''
    if B is None:
        B = p_hat
    # Normalize once and reuse it; the direction is the same for both uses.
    unit = normalize(B)
    # np.dot is the public function; the np.linalg.linalg module path used
    # previously is private and deprecated in NumPy 2.0.
    project = np.dot(A, unit) * unit
    return A - project


In [105]:
# Apply `reject` to every row (axis=1) of X, removing each row's component
# along p_hat.
X_hat = X.apply(reject, axis=1)

In [108]:
# Fit a second linear SVM on the rejected features X_hat.
# NOTE(review): this estimator uses the default class_weight, whereas `clf`
# was built with class_weight='balanced' — confirm the difference is
# intended, since it may confound the comparison between the two models.
clf_hat = svm.LinearSVC()
clf_hat.fit(X_hat, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [109]:
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]
preds_hat = clf_hat.predict(X_test)
print "precision", metrics.precision_score(y_test, preds_hat)
print "recall", metrics.recall_score(y_test, preds_hat)
print "f1", metrics.f1_score(y_test, preds_hat)
print "True", metrics.accuracy_score(y_test[y_test == 1], preds_hat[y_test == 1])
print "False", metrics.accuracy_score(y_test[y_test == 0], preds_hat[y_test == 0])

precision 0.416243654822
recall 0.30258302583
f1 0.350427350427
True 0.30258302583
False 0.964788732394


In [110]:
print "precision", metrics.precision_score(y, preds)
print "recall", metrics.recall_score(y, preds)
print "f1", metrics.f1_score(y, preds)
print "True", metrics.accuracy_score(y[y == 1], preds[y == 1])
print "False", metrics.accuracy_score(y[y == 0], preds[y == 0])

precision 0.806451612903
recall 0.892857142857
f1 0.847457627119
True 0.892857142857
False 0.980456026059


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,-0.213724,0.809331,0.297112,-0.186071,-0.034907,-0.290729,-0.379729,0.036201,0.870842,0.365394,...,-0.188102,0.243477,-0.023277,0.428177,0.360719,-0.180553,-0.337395,-0.246545,-0.150584,5.494806
1,-0.140880,0.407426,-0.406884,-0.099375,0.311697,-0.298782,-0.399137,0.139445,0.195001,-0.006782,...,-0.253005,0.385840,0.278223,0.397531,0.107176,0.043717,0.147003,0.361762,-0.085582,5.279054
2,-0.525884,0.139646,0.303797,0.135053,0.453898,0.158061,-0.203340,-0.205564,-0.047114,0.445377,...,-0.435776,0.308196,-0.319750,0.618561,-0.087400,-0.413050,0.288664,-0.243981,-0.181413,4.619717
3,-0.180898,0.114753,-0.070061,0.252252,0.597400,0.131144,-0.266746,-0.357261,0.092470,0.090985,...,-0.075655,-0.076753,0.226977,0.253251,-0.015521,0.188602,0.337638,0.539969,0.025152,4.216821
4,-0.031170,0.042146,-0.265418,-0.025770,-0.652979,0.441640,-0.002094,0.128319,-0.254889,0.238307,...,0.273346,-0.300729,0.327014,0.160591,-0.282203,0.011332,0.542531,-0.487266,-0.334310,4.639187
5,-0.331152,0.176985,-0.016469,-0.514723,0.298830,-0.347500,-0.378709,-0.224590,0.173618,0.122657,...,0.023464,0.073378,0.380237,0.350616,-0.646019,-0.103318,-0.108761,0.198855,0.228076,4.326398
6,0.196697,-0.174347,-0.035025,0.556092,0.058306,0.006863,0.455478,0.127429,-0.145586,0.009840,...,0.181997,0.119448,-0.004595,0.091617,0.077430,-0.426574,0.133759,0.010318,-0.150641,3.267818
7,-0.036423,0.045991,-0.023890,0.018120,0.004711,0.221178,0.077661,-0.068630,0.069875,0.152835,...,-0.153189,0.254330,0.104943,0.011918,-0.146666,0.260379,-0.035434,-0.081850,-0.238137,2.857145
9,0.210897,-0.080215,0.501304,0.022090,-0.403309,0.163079,0.046596,-0.313518,-0.092768,0.089539,...,-0.316193,-0.040721,-0.472075,0.104228,0.008191,-0.056873,-0.188107,0.309801,-0.144986,4.666057
10,0.037582,0.087906,-0.373136,0.159503,0.078908,-0.139189,-0.190586,0.403099,-0.003747,0.022527,...,-0.070527,-0.275853,-0.356868,0.174104,-0.074302,-0.238589,0.065967,0.044722,0.071920,3.593413
