In [486]:
from gensim.models import Word2Vec

import csv
import numpy as np

import scipy

from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer

from collections import defaultdict

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import sys
PREPROCESS_PATH = '../lib/twitter_preprocess'
sys.path.append(PREPROCESS_PATH)
from twitter_preprocess import tokenize

In [113]:
# Build a {token: vector} dict from the 25d GloVe Twitter file: the first
# whitespace-separated field of each line is the key, the remaining fields
# are parsed as floats.
glove_path_25 = 'glove.twitter.27B/glove.twitter.27B.25d.txt'
with open(glove_path_25, 'rb') as lines:
    w2v_25 = dict(
        (fields[0], np.array([float(value) for value in fields[1:]]))
        for fields in (line.split() for line in lines)
    )

In [344]:
# Build a {token: vector} dict from the 100d GloVe Twitter file: the first
# whitespace-separated field of each line is the key, the remaining fields
# are parsed as floats.
glove_path_100 = 'glove.twitter.27B/glove.twitter.27B.100d.txt'
with open(glove_path_100, 'rb') as lines:
    w2v_100 = dict(
        (fields[0], np.array([float(value) for value in fields[1:]]))
        for fields in (line.split() for line in lines)
    )

In [384]:
# Build a {token: vector} dict from the 200d GloVe Twitter file: the first
# whitespace-separated field of each line is the key, the remaining fields
# are parsed as floats.
glove_path_200 = 'glove.twitter.27B/glove.twitter.27B.200d.txt'
with open(glove_path_200, 'rb') as lines:
    w2v_200 = dict(
        (fields[0], np.array([float(value) for value in fields[1:]]))
        for fields in (line.split() for line in lines)
    )

In [151]:
# English stop-word set from NLTK. The only visible reference to it is the
# commented-out filter line in FeatureExtractor.preprocess, so it currently
# has no effect on the extracted features.
stop_words = set(stopwords.words('english'))

In [394]:
class FeatureExtractor(object):
    """Shared loading, tokenizing and mean-embedding logic for tweet features.

    Subclasses must set ``self.word2vec`` (a mapping from token to vector)
    before calling :meth:`feature_extract`.
    """

    def __init__(self, train_path, test_path, context_path=None, word2vec=None):
        # Deliberate no-op: the arguments are accepted but not stored here.
        pass

    def load_context(self, context_path):
        """Return the field at index 4 of every row of a ';'-delimited CSV.

        Args:
            context_path: path of the CSV file to read.

        Returns:
            list of the index-4 field of each row, in file order.
        """
        # "rb" is the mode the Python 2 csv module expects; the ``with``
        # block guarantees the handle is closed once the rows are read.
        with open(context_path, "rb") as handle:
            return [row[4] for row in csv.reader(handle, delimiter=';')]

    def load_data(self, data_path):
        """Read a ';'-delimited CSV of labelled examples.

        Args:
            data_path: path of the CSV file to read.

        Returns:
            ``(data, labels)`` where ``data`` is the list of index-1 fields
            and ``labels`` is a numpy array of the index-0 fields cast to int.
        """
        with open(data_path, "rb") as handle:
            rows = list(csv.reader(handle, delimiter=';'))
        labels = np.array([int(row[0]) for row in rows])
        data = [row[1] for row in rows]
        return data, labels

    def preprocess(self, data):
        """Tokenize every tweet in ``data``.

        Each tweet is first passed through ``tokenize`` (imported from
        ``twitter_preprocess``) and the result is split into tokens with
        NLTK's ``TweetTokenizer``.

        Returns:
            list with one token list per input tweet.
        """
        tokenizer = TweetTokenizer()
        return [tokenizer.tokenize(tokenize(tweet)) for tweet in data]

    def feature_extract(self, preproc_data):
        """Represent each tokenized tweet by the mean of its word vectors.

        Tokens missing from ``self.word2vec`` are skipped. A tweet with no
        known token is represented by a zero vector of the embedding size.

        Args:
            preproc_data: iterable of token lists.

        Returns:
            numpy array with one row per tweet.
        """
        word2vec = self.word2vec
        # The embedding size is read from an arbitrary stored vector so the
        # zero-vector fallback always matches the embedding in use.
        # ``next(iter(...))`` avoids materialising the (large) value list.
        if len(word2vec):
            dim = len(word2vec[next(iter(word2vec))])
        else:
            dim = 0
        fallback = [np.zeros(dim)]
        return np.array([
            np.mean([word2vec[w] for w in words if w in word2vec] or fallback,
                    axis=0)
            for words in preproc_data
        ])

In [395]:
class Word2VecFeatureExtractor(FeatureExtractor):
    """Feature extractor whose embedding is a Word2Vec model trained on a context corpus.

    On construction it trains the model, then exposes ``train_vecs``,
    ``train_labels``, ``test_vecs`` and ``test_labels``.
    """

    def __init__(self, train_path, test_path, context_path, size):
        # Train Word2Vec on the tokenized context corpus and keep a plain
        # {token: vector} dict of the learned embedding.
        corpus_tokens = self.preprocess(self.load_context(context_path))
        w2v_model = Word2Vec(corpus_tokens, size=size)
        vocabulary = w2v_model.wv.index2word
        vectors = w2v_model.wv.syn0
        self.word2vec = dict(zip(vocabulary, vectors))

        # Training split: tokenize, then average the word vectors per tweet.
        train_texts, train_targets = self.load_data(train_path)
        self.train_vecs = self.feature_extract(self.preprocess(train_texts))
        self.train_labels = train_targets

        # Test split, processed the same way.
        test_texts, test_targets = self.load_data(test_path)
        self.test_vecs = self.feature_extract(self.preprocess(test_texts))
        self.test_labels = test_targets

In [396]:
class GloveFeatureExtractor(FeatureExtractor):
    """Feature extractor backed by a pre-loaded {token: vector} mapping.

    On construction it exposes ``train_vecs``, ``train_labels``,
    ``test_vecs`` and ``test_labels``.
    """

    def __init__(self, train_path, test_path, context_path, glove):
        # ``context_path`` is accepted but not used by this extractor.
        self.word2vec = glove

        # Training split: tokenize, then average the word vectors per tweet.
        train_texts, train_targets = self.load_data(train_path)
        self.train_vecs = self.feature_extract(self.preprocess(train_texts))
        self.train_labels = train_targets

        # Test split, processed the same way.
        test_texts, test_targets = self.load_data(test_path)
        self.test_vecs = self.feature_extract(self.preprocess(test_texts))
        self.test_labels = test_targets

In [397]:
class TfidfEmbeddingVectorizer(object):
    """Embed tokenized documents as the mean of IDF-weighted word vectors.

    Args:
        word2vec: mapping from token to vector.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None  # filled in by fit()
        # Read the vector length from an arbitrary entry. ``next(iter(...))``
        # works on both Python 2 and 3 (``dict.itervalues`` exists only on
        # Python 2) and does not copy the values. An empty mapping yields
        # dim 0 instead of raising StopIteration.
        if len(word2vec):
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X):
        """Learn per-token IDF weights from the tokenized documents ``X``.

        Returns:
            self, so calls can be chained.
        """
        # The identity analyzer makes TfidfVectorizer treat each document as
        # an already-tokenized sequence.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one row per document: the mean of its weighted word vectors.

        Tokens missing from ``self.word2vec`` are skipped. A document with no
        known token becomes a zero vector of length ``self.dim``. Expects
        ``self.word2weight`` to have been populated (normally by ``fit``).
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [464]:
class Classifier(object):
    """Wraps a feature extractor's train/test arrays around an estimator.

    The base class leaves ``self.clf`` as ``None``; a subclass is expected to
    assign an object with ``fit`` and ``predict`` methods.
    """

    def __init__(self, fe):
        self.clf = None
        self.train_vecs, self.train_labels = fe.train_vecs, fe.train_labels
        self.test_vecs, self.test_labels = fe.test_vecs, fe.test_labels

    def sample_data(self, sample_size):
        """Shrink the training set to ``sample_size`` rows drawn without replacement.

        A falsy ``sample_size`` (``None`` or 0) leaves the training set as is.
        """
        if sample_size:
            n_train = self.train_vecs.shape[0]
            chosen = np.random.choice(n_train, sample_size, replace=False)
            self.train_vecs = self.train_vecs[chosen]
            self.train_labels = self.train_labels[chosen]

    def fit_predict(self):
        """Fit ``self.clf`` on the training data and store test predictions in ``self.preds``."""
        self.clf.fit(self.train_vecs, self.train_labels)
        self.preds = self.clf.predict(self.test_vecs)

    def evaluate(self):
        """Score ``self.preds`` against ``self.test_labels``, treating 1 as the positive class.

        Returns:
            ``(precision, recall, f1)``. Precision and recall are 0.0 when
            their denominator is zero; f1 is 0 when either of them is zero.
        """
        assert(len(self.test_labels)==len(self.preds)), "Prediction length differs from test lengths"
        tp = fp = tn = fn = 0
        for guess, truth in zip(self.preds, self.test_labels):
            if guess == 1:
                if truth == 1:
                    tp += 1
                elif truth == 0:
                    fp += 1
                else:
                    fn += 1
            elif guess == 0 and truth == 0:
                tn += 1
            else:
                # Every remaining combination is counted as a false negative.
                fn += 1

        predicted_pos = tp + fp
        actual_pos = tp + fn
        if predicted_pos > 0:
            precision = tp / float(predicted_pos)
        else:
            precision = float(tp)
        if actual_pos > 0:
            recall = tp / float(actual_pos)
        else:
            recall = float(tp)

        if precision > 0 and recall > 0:
            f1 = 2 / (1/precision + 1/recall)
        else:
            f1 = 0
        return precision, recall, f1

In [465]:
class SVMClassifier(Classifier):
    """Classifier that uses a linear-kernel ``svm.SVC`` as its estimator.

    Args:
        fe: feature extractor exposing ``train_vecs``, ``train_labels``,
            ``test_vecs`` and ``test_labels``.
        C: value passed as the ``C`` argument of ``svm.SVC`` (default 0.3).
    """

    def __init__(self, fe, C=0.3):
        # The base initialiser copies the train/test arrays from ``fe``.
        super(SVMClassifier, self).__init__(fe)
        # Honour the caller-supplied C rather than a fixed constant.
        self.clf = svm.SVC(C=C, kernel='linear')

In [492]:
def glove_routine(context_path, train_path, test_path, sample_size=None, iters=1):
    """Evaluate a linear SVM on mean GloVe features for three embedding sizes.

    Uses the module-level ``w2v_25``, ``w2v_100`` and ``w2v_200`` mappings.

    Args:
        context_path: unused; GloVe vectors are pre-trained, so ``None`` is
            passed to the extractor. Kept so the signature matches
            ``word2vec_routine``.
        train_path: path of the training CSV.
        test_path: path of the test CSV.
        sample_size: if truthy, each run fits on a random subsample of that
            many training rows; otherwise the full training set is used.
        iters: number of fit/evaluate runs per embedding.

    Returns:
        ``(dim25, dim100, dim200)``: three lists, each holding one
        ``(precision, recall, f1)`` tuple per run.
    """
    # Features are extracted once per embedding and reused across runs.
    # The vectors come straight from the module-level dicts; no local alias
    # is created, so the loaded ``w2v_25`` is never shadowed.
    extractors = [
        GloveFeatureExtractor(train_path, test_path, None, glove)
        for glove in (w2v_25, w2v_100, w2v_200)
    ]
    results = ([], [], [])

    for _ in range(iters):
        # Within a run the embeddings are processed in 25, 100, 200 order.
        for extractor, scores in zip(extractors, results):
            clf = SVMClassifier(extractor, C=0.3)
            clf.sample_data(sample_size)
            clf.fit_predict()
            p, r, f1 = clf.evaluate()
            scores.append((p, r, f1))

    return results

In [493]:
def word2vec_routine(context_path, train_path, test_path, sample_size=None, iters=1):
    """Evaluate a linear SVM on mean Word2Vec features of size 25, 100 and 200.

    Args:
        context_path: CSV whose text is used to train each Word2Vec model.
        train_path: path of the training CSV.
        test_path: path of the test CSV.
        sample_size: if truthy, each run fits on a random subsample of that
            many training rows; otherwise the full training set is used.
        iters: number of fit/evaluate runs per embedding size.

    Returns:
        ``(dim25, dim100, dim200)``: three lists, each holding one
        ``(precision, recall, f1)`` tuple per run.
    """
    # One extractor (and therefore one trained Word2Vec model) per size,
    # built in 25, 100, 200 order before any classifier is fitted.
    extractors = [
        Word2VecFeatureExtractor(train_path, test_path, context_path, size=size)
        for size in (25, 100, 200)
    ]
    results = ([], [], [])

    for _ in range(iters):
        for extractor, scores in zip(extractors, results):
            model = SVMClassifier(extractor, C=0.3)
            model.sample_data(sample_size)
            model.fit_predict()
            precision, recall, f1 = model.evaluate()
            scores.append((precision, recall, f1))

    return results

In [494]:
# Input files for the Kaepernick experiment. context_path is the corpus that
# word2vec_routine trains its embeddings on; train_path/test_path are the
# labelled CSVs read by FeatureExtractor.load_data.
context_path = 'data/KaepernickStorm.csv'
train_path = 'data/KaepernickTrain.csv'
test_path = 'data/KaepernickTest.csv'

In [495]:
# Single GloVe run with the defaults (sample_size=None, iters=1), i.e. one
# fit on the full training set; displays the three result lists.
glove_routine(context_path, train_path, test_path)

([(0.6527415143603134, 0.8865248226950354, 0.7518796992481203)],
 [(0.7781350482315113, 0.8581560283687943, 0.8161888701517708)],
 [(0.7993421052631579, 0.8617021276595744, 0.8293515358361774)])

In [514]:
# Single Word2Vec run with the defaults (sample_size=None, iters=1), i.e. one
# fit on the full training set; displays the three result lists.
word2vec_routine(context_path, train_path, test_path)

([(0.7609427609427609, 0.8014184397163121, 0.7806563039723662)],
 [(0.7792642140468228, 0.8262411347517731, 0.802065404475043)],
 [(0.7766666666666666, 0.8262411347517731, 0.8006872852233676)])

In [522]:
# 30 GloVe runs, each fitted on a random 30-row subsample of the training set;
# every list collects one (precision, recall, f1) tuple per run.
dim25, dim100, dim200 = glove_routine(context_path, train_path, test_path, sample_size=30, iters=30)

In [523]:
#print scipy.stats.hmean(dim25, axis=0)
print np.mean(dim25, axis=0)
print np.std(dim25, axis=0) * 2

print
#print scipy.stats.hmean(dim100, axis=0)
print np.mean(dim100, axis=0)
print np.std(dim100, axis=0) * 2

print
#print scipy.stats.hmean(dim200, axis=0)
print np.mean(dim200, axis=0)
print np.std(dim200, axis=0) * 2

[ 0.64145413  0.83922902  0.70336741]
[ 0.14557557  0.39825438  0.14454008]

[ 0.66215591  0.8537415   0.71241291]
[ 0.18644099  0.4483573   0.24083377]

[ 0.64997976  0.92040816  0.75120677]
[ 0.1257234   0.2811678   0.12848133]


In [524]:
# 30 Word2Vec runs, each fitted on a random 30-row subsample of the training
# set; every list collects one (precision, recall, f1) tuple per run.
# Note: this rebinds dim25/dim100/dim200, replacing the GloVe results.
dim25, dim100, dim200 = word2vec_routine(context_path, train_path, test_path, sample_size=30, iters=30)

In [525]:
print np.mean(dim25, axis=0)
print np.std(dim25, axis=0) * 2

print
print np.mean(dim100, axis=0)
print np.std(dim100, axis=0) * 2

print
print np.mean(dim200, axis=0)
print np.std(dim200, axis=0) * 2

[ 0.70722512  0.88798186  0.7781283 ]
[ 0.13115082  0.2320046   0.07951363]

[ 0.68789257  0.9154195   0.77451519]
[ 0.11958691  0.27384217  0.1119187 ]

[ 0.6843087   0.90929705  0.77448646]
[ 0.11809007  0.18948017  0.06013639]


In [480]:
# Show the first five (precision, recall, f1) tuples held in dim200.
# NOTE(review): this cell's execution count (480) is lower than that of the
# runs above, so the output may come from an earlier run - confirm.
print dim200[:5]

[(0.7005649717514124, 0.8794326241134752, 0.779874213836478), (0.6518324607329843, 0.8829787234042553, 0.7499999999999999), (0.6597938144329897, 0.45390070921985815, 0.5378151260504201), (0.7589928057553957, 0.74822695035461, 0.7535714285714286), (0.6150341685649203, 0.9574468085106383, 0.7489597780859918)]


In [521]:
# Rebind the three path variables to the Melania Trump data files.
# NOTE(review): this cell's execution count (521) precedes the sampled runs
# shown above (522-525), so those outputs may have been produced with these
# paths rather than the Kaepernick ones - confirm by re-running top to bottom.
context_path = 'data/MelaniaTrumpStorm.csv'
train_path = 'data/MelaniaTrumpTrain.csv'
test_path = 'data/MelaniaTrumpTest.csv'