In [10]:
import csv
import json
import os
import pdb
import pickle
import random
import re
from itertools import islice

import numpy as np
from gensim.models import Word2Vec
from sklearn import preprocessing
from sklearn.svm import LinearSVC, SVC

from constants import SUFFIX_DBPEDIA


In [78]:
# All input/output files share the tokenized corpus path as a common prefix.
tokenized_text = 'sample_data/sample_text.txt'


def _with_suffix(suffix):
    """Return ``tokenized_text`` with ``suffix`` appended."""
    return tokenized_text + suffix


supersequence_path = _with_suffix('_superspan_sequence.json')
# Loaded below with Word2Vec.load.
model_save_path = _with_suffix('_embedding.bin')
# Tab-separated "<phrase>\t<feature vector>" file read into feature_dict.
concept_feature_path = _with_suffix('_econ_feature.txt')
# Destination of the pickled classifier probabilities.
concept_score_path = _with_suffix('_score_list.bin')


In [22]:
# Path of the DBpedia annotation file: the corpus path plus the project-wide
# suffix constant. The file is read line by line as JSON further down.
dbpediaFile = tokenized_text + SUFFIX_DBPEDIA

In [23]:
# Load the previously trained gensim Word2Vec model from disk.
# NOTE(review): `model` is not referenced by any later cell visible here —
# confirm whether this (potentially slow) load is still needed.
model = Word2Vec.load(model_save_path)

In [24]:
def get_feature(row):
    """Parse the feature vector from one tab-split record of the feature file.

    Args:
        row: Sequence of fields obtained by splitting a line on tabs. A valid
            record has exactly two fields; the second holds whitespace-separated
            numbers wrapped in one leading and one trailing character
            (presumably the brackets of a printed array).

    Returns:
        A list of exactly four floats, or ``None`` when the record does not
        have two fields, contains a non-numeric token, or does not have four
        components.
    """
    if len(row) != 2:
        return None
    # Drop the first and last character of the vector text, then any padding
    # left inside them, so that splitting yields only the numeric tokens.
    inner = row[1].strip()[1:-1].strip()
    try:
        values = [float(token) for token in re.split(r'\s+', inner)]
    except ValueError as e:
        # Report the malformed value and let the caller decide what to do;
        # dropping into a debugger here would hang a non-interactive run.
        print(e)
        return None
    return values if len(values) == 4 else None
        
# Map each phrase to its parsed feature vector. The file holds one
# "<phrase>\t<vector>" record per line.
feature_dict = {}
with open(concept_feature_path) as fin:
    for line in fin:
        row = line.split('\t')
        feature = get_feature(row)
        if feature is None:
            # Skip only the malformed record. Stopping at the first bad line
            # would silently discard every valid record that follows it.
            continue
        feature_dict[row[0]] = feature

In [28]:
# Load the DBpedia annotation file: one JSON document per line. The context
# manager closes the file handle; blank lines are skipped because
# json.loads('') raises.
with open(dbpediaFile) as fin:
    dbpediaObjs = [json.loads(line) for line in fin if line.strip()]

In [33]:
from util.common import flatten
from collections import Counter
# Count occurrences of each annotation's 'text' value across all loaded
# DBpedia objects. The loop variable is deliberately not called `np`: that
# name is the numpy alias used by other cells, and under Python 2 scoping a
# list-comprehension variable would overwrite it.
dbpedia_phrase_dict = Counter(mention['text'] for mention in flatten(dbpediaObjs))


In [61]:
# Get distant-supervision training labels: phrases that appear in DBpedia are
# positives; a random sample of the other phrases serves as negatives.

SIZE = 1000  # width of the negative sampling window and cap on positives
SEED = 0     # fixed so a fresh Restart & Run All selects the same negatives

rng = random.Random(SEED)
all_phrases = list(feature_dict)

# Negatives: pick a random contiguous window of SIZE phrases and keep each one
# with probability 0.5. max(..., 0) keeps the window valid when fewer than
# SIZE phrases exist (randint would otherwise raise on a negative bound).
start = rng.randint(0, max(len(all_phrases) - SIZE, 0))
end = start + SIZE
neg_phrase = [
    phrase for phrase in all_phrases[start:end]
    # A DBpedia phrase must never be a negative: it would otherwise enter the
    # training set twice with contradictory labels.
    if phrase not in dbpedia_phrase_dict and rng.random() > 0.5
]

# Positives: the first SIZE DBpedia phrases that have a feature vector.
pos_phrase = list(islice(
    (phrase for phrase in dbpedia_phrase_dict if phrase in feature_dict),
    SIZE,
))

In [65]:
# Training data: feature vectors of the positives (label 1) followed by those
# of the negatives (label 0), with y aligned row-for-row to X.
X = [feature_dict[phrase] for phrase in pos_phrase + neg_phrase]
y = [1] * len(pos_phrase) + [0] * len(neg_phrase)

In [66]:
# L2-normalise the training vectors (scikit-learn's `normalize` works per
# sample by default). Only used for the preview below; the classifier in the
# visible cells is fitted on the raw X.
X_normalized = preprocessing.normalize(X, norm='l2')

In [67]:
# Rescale every feature column with a MinMaxScaler (default settings) fitted
# on the training vectors. As with X_normalized, the visible cells only
# display this result; the classifier is fitted on the raw X.
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X)

In [68]:
# Preview the first ten rows of the raw, L2-normalised and min-max-scaled
# feature matrices, in that order.
for preview in (X, X_normalized, X_train_minmax):
    display(preview[:10])

[[15.0, 0.59547371, 2.0, -3.0],
 [11.0, 0.55671461, 2.0, -3.0],
 [35.0, 0.62087229, 3.0, -4.0],
 [7.0, 0.55341768, 0.0, -5.0],
 [2.0, 0.53632933, 2.0, -1.0],
 [19.0, 0.56752435, 1.0, -7.0],
 [22.0, 0.55151887, 0.0, -6.0],
 [12.0, 0.59103597, 0.0, -3.0],
 [12.0, 0.54997821, 1.0, -6.0],
 [13.0, 0.54737665, 3.0, -4.0]]

array([[ 0.97158209,  0.03857011,  0.12954428, -0.19431642],
       [ 0.94915824,  0.0480373 ,  0.17257423, -0.25886134],
       [ 0.98979689,  0.01755821,  0.08483973, -0.11311964],
       [ 0.81205474,  0.06420078,  0.        , -0.5800391 ],
       [ 0.65626175,  0.17598621,  0.65626175, -0.32813088],
       [ 0.93683388,  0.02798295,  0.04930705, -0.34514932],
       [ 0.96448178,  0.02417863,  0.        , -0.26304048],
       [ 0.9690369 ,  0.04772797,  0.        , -0.24225922],
       [ 0.89120862,  0.04084544,  0.07426738, -0.44560431],
       [ 0.93262569,  0.03926904,  0.21522131, -0.28696175]])

array([[0.3       , 0.60111303, 0.4       , 0.86956522],
       [0.22      , 0.56198687, 0.4       , 0.86956522],
       [0.7       , 0.62675215, 0.6       , 0.82608696],
       [0.14      , 0.55865872, 0.        , 0.7826087 ],
       [0.04      , 0.54140854, 0.4       , 0.95652174],
       [0.38      , 0.57289898, 0.2       , 0.69565217],
       [0.44      , 0.55674193, 0.        , 0.73913043],
       [0.24      , 0.59663327, 0.        , 0.86956522],
       [0.24      , 0.55518668, 0.2       , 0.73913043],
       [0.26      , 0.55256048, 0.6       , 0.82608696]])

In [70]:
# Alternative classifier kept for reference (LinearSVC has no predict_proba):
# clf = LinearSVC()
# Linear-kernel SVC with probability estimates enabled, so predict_proba is
# available later on.
clf = SVC(probability=True, kernel='linear')
# NOTE(review): the fit uses the raw X, not X_normalized / X_train_minmax —
# confirm this is intended given the very different feature scales.
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [71]:
# Inspect the fitted linear model: one weight per feature, then the bias term.
print(clf.coef_)
print(clf.intercept_)

[[ 2.22495493e-05 -2.57730508e-04  4.14485162e-05 -1.99984558e+00]]
[-1.00012167]


In [72]:
# Every phrase used for training, as a set for fast membership tests.
train_phrase = set(pos_phrase).union(neg_phrase)

In [73]:
# Everything that was not used for training becomes the scoring set.
# X_test_phrase[i] is the phrase whose feature vector is X_test[i].
held_out = [
    (phrase, feature)
    for phrase, feature in feature_dict.items()
    if phrase not in train_phrase
]
X_test_phrase = [phrase for phrase, _ in held_out]
X_test = [feature for _, feature in held_out]

In [74]:
# Score the held-out phrases with the fitted classifier.
# y_pred: predicted class labels (0 or 1, as used in y).
y_pred = clf.predict(X_test)
# y_score: decision-function values for each sample.
y_score = clf.decision_function(X_test)
# y_prob: per-class probability estimates; column 1 is read below as the
# score for label 1.
y_prob = clf.predict_proba(X_test)

In [79]:
# Persist the label-1 probability of every held-out phrase. The array is
# positional: entry i belongs to X_test_phrase[i]. The context manager makes
# sure the file is flushed and closed once the dump completes.
with open(concept_score_path, 'wb') as fout:
    pickle.dump(y_prob[:, 1], fout)