In [149]:
from gensim.models import Word2Vec

import csv
import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer

from collections import defaultdict

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import sys
PREPROCESS_PATH = '../lib/twitter_preprocess'
sys.path.append(PREPROCESS_PATH)
from twitter_preprocess import tokenize

In [113]:
# Load the GloVe vectors from the 25d file into a {token: vector} dict.
glove_path_25 = 'glove.twitter.27B/glove.twitter.27B.25d.txt'
with open(glove_path_25, 'rb') as lines:
    # Each row is "<token> <v1> <v2> ..."; split it once and reuse the fields.
    w2v_25 = {
        fields[0]: np.array([float(value) for value in fields[1:]])
        for fields in (line.split() for line in lines)
    }

In [344]:
# Load the GloVe vectors from the 100d file into a {token: vector} dict.
glove_path_100 = 'glove.twitter.27B/glove.twitter.27B.100d.txt'
with open(glove_path_100, 'rb') as lines:
    # Each row is "<token> <v1> <v2> ..."; split it once and reuse the fields.
    w2v_100 = {
        fields[0]: np.array([float(value) for value in fields[1:]])
        for fields in (line.split() for line in lines)
    }

In [384]:
# Load the GloVe vectors from the 200d file into a {token: vector} dict.
glove_path_200 = 'glove.twitter.27B/glove.twitter.27B.200d.txt'
with open(glove_path_200, 'rb') as lines:
    # Each row is "<token> <v1> <v2> ..."; split it once and reuse the fields.
    w2v_200 = {
        fields[0]: np.array([float(value) for value in fields[1:]])
        for fields in (line.split() for line in lines)
    }

In [151]:
# English stop words from NLTK, held in a set for fast membership tests.
# Only referenced by the commented-out filter in FeatureExtractor.preprocess.
stop_words = set(stopwords.words('english'))

In [394]:
class FeatureExtractor(object):
    """Shared CSV loading, tokenizing and mean-embedding logic for tweets.

    ``feature_extract`` reads ``self.word2vec``, a mapping from token to
    vector; subclasses set it before extracting features.
    """

    def __init__(self, train_path, test_path, context_path=None, word2vec=None):
        """Remember the embedding lookup; the path arguments are not used here."""
        self.word2vec = word2vec

    def load_context(self, context_path):
        """Return the text field (column index 4) of every row of a CSV.

        The file is read as ';'-delimited. It is opened in binary mode, which
        is what the Python 2 ``csv`` module expects.
        """
        # The context manager closes the handle once all rows are consumed.
        with open(context_path, "rb") as handle:
            return [row[4] for row in csv.reader(handle, delimiter=';')]

    def load_data(self, data_path):
        """Read a ';'-delimited CSV of labelled tweets.

        Returns:
            (data, labels): ``data`` is the list of column-1 strings and
            ``labels`` is a NumPy array of column 0 converted with ``int``.
        """
        with open(data_path, "rb") as handle:
            rows = list(csv.reader(handle, delimiter=';'))
        labels = np.array([int(row[0]) for row in rows])
        data = [row[1] for row in rows]
        return data, labels

    def preprocess(self, data):
        """Turn raw tweets into lists of tokens.

        Each tweet first goes through ``tokenize`` (imported from
        twitter_preprocess) and the result is split with NLTK's
        ``TweetTokenizer``. Returns one token list per tweet.
        """
        tokenizer = TweetTokenizer()
        # Comprehensions instead of map(): always a real, re-iterable list.
        normalized = [tokenize(tweet) for tweet in data]
        preproc_tweets = [tokenizer.tokenize(tweet) for tweet in normalized]
        # Stop-word filtering is currently disabled:
        #preproc_tweets = map(lambda t: filter(lambda w: w not in stop_words, t), preproc_tweets)
        return preproc_tweets

    def feature_extract(self, preproc_data):
        """Average the embeddings of the known tokens of each tweet.

        Args:
            preproc_data: iterable of token lists.

        Returns:
            Array with one row per tweet. A tweet with no token present in
            ``self.word2vec`` gets an all-zero row of the embedding width.
        """
        word2vec = self.word2vec
        # Width of the zero fallback, taken from an arbitrary stored vector
        # (0 when the lookup is empty). The original referenced an undefined
        # ``dim`` here and raised NameError for out-of-vocabulary tweets.
        dim = len(word2vec[next(iter(word2vec))]) if word2vec else 0
        zero_vec = np.zeros(dim)

        rows = []
        for words in preproc_data:
            known = [word2vec[w] for w in words if w in word2vec]
            rows.append(np.mean(known, axis=0) if known else zero_vec)
        return np.array(rows)

In [395]:
class Word2VecFeatureExtractor(FeatureExtractor):
    """Mean-embedding features from a Word2Vec model trained on a context corpus."""

    def __init__(self, train_path, test_path, context_path, size=100):
        """Train Word2Vec on the context file, then vectorize train and test.

        Args:
            train_path: CSV read with ``load_data`` for training rows.
            test_path: CSV read with ``load_data`` for test rows.
            context_path: CSV read with ``load_context``; its texts are the
                Word2Vec training corpus.
            size: embedding width passed to ``Word2Vec``. Defaults to 100 so
                the notebook's call that omits ``size`` (the "100-d" run) works
                instead of raising TypeError.

        Sets ``word2vec``, ``train_vecs``, ``train_labels``, ``test_vecs`` and
        ``test_labels`` on the instance.
        """
        context = self.load_context(context_path)
        preproc_context = self.preprocess(context)
        model = Word2Vec(preproc_context, size=size)
        # Plain dict of token -> vector so feature_extract can use `in` and [].
        self.word2vec = dict(zip(model.wv.index2word, model.wv.syn0))

        train, train_labels = self.load_data(train_path)
        preproc_train = self.preprocess(train)
        self.train_vecs = self.feature_extract(preproc_train)
        self.train_labels = train_labels

        test, test_labels = self.load_data(test_path)
        preproc_test = self.preprocess(test)
        self.test_vecs = self.feature_extract(preproc_test)
        self.test_labels = test_labels

In [396]:
class GloveFeatureExtractor(FeatureExtractor):
    """Mean-embedding features backed by an already-loaded GloVe lookup."""

    def __init__(self, train_path, test_path, context_path, glove):
        """Vectorize the train and test CSVs with the ``glove`` mapping.

        ``context_path`` is accepted but not used: no model is trained here.
        Sets ``word2vec``, ``train_vecs``, ``train_labels``, ``test_vecs`` and
        ``test_labels`` on the instance.
        """
        self.word2vec = glove

        # Training split: load -> tokenize -> average embeddings.
        train_texts, train_labels = self.load_data(train_path)
        self.train_vecs = self.feature_extract(self.preprocess(train_texts))
        self.train_labels = train_labels

        # Test split goes through the identical pipeline.
        test_texts, test_labels = self.load_data(test_path)
        self.test_vecs = self.feature_extract(self.preprocess(test_texts))
        self.test_labels = test_labels

In [397]:
class TfidfEmbeddingVectorizer(object):
    """Average word vectors per document, each scaled by its IDF weight."""

    def __init__(self, word2vec):
        """Store the token -> vector mapping and record the vector width.

        ``dim`` is the length of an arbitrary stored vector, or 0 when the
        mapping is empty.
        """
        self.word2vec = word2vec
        self.word2weight = None
        # Key-based lookup works on both Python 2 and 3 dicts, unlike the
        # Python-2-only itervalues().next(), and copies nothing.
        self.dim = len(word2vec[next(iter(word2vec))]) if word2vec else 0

    def fit(self, X):
        """Learn IDF weights from ``X``, an iterable of token lists.

        Returns ``self`` so calls can be chained.
        """
        # Documents are already tokenized, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one weighted mean vector per token list in ``X``.

        Tokens missing from ``word2vec`` are skipped; a document with no known
        token yields a zero vector of length ``dim``. ``fit`` must have been
        called first so that ``word2weight`` is populated.
        """
        rows = []
        for words in X:
            weighted = [self.word2vec[w] * self.word2weight[w]
                        for w in words if w in self.word2vec]
            rows.append(np.mean(weighted or [np.zeros(self.dim)], axis=0))
        return np.array(rows)

In [431]:
class Classifier(object):
    """Base wrapper: holds train/test vectors, fits ``self.clf`` and scores it.

    Subclasses are expected to assign an estimator with ``fit``/``predict``
    methods to ``self.clf``.
    """

    def __init__(self, fe):
        """Copy the train/test vectors and labels from a feature extractor."""
        self.train_vecs = fe.train_vecs
        self.train_labels = fe.train_labels
        self.test_vecs = fe.test_vecs
        self.test_labels = fe.test_labels
        self.clf = None

    def sample_data(self, sample_size, seed=None):
        """Shrink the training set to ``sample_size`` rows drawn without replacement.

        Args:
            sample_size: number of rows to keep; a falsy value (None, 0)
                leaves the training data untouched.
            seed: optional integer making the draw reproducible. When omitted,
                NumPy's global random state is used, as before.
        """
        if not sample_size:
            return
        # A private generator is created only when a seed is requested, so
        # unseeded calls behave exactly like the original implementation.
        rng = np.random if seed is None else np.random.RandomState(seed)
        sample_idxs = rng.choice(self.train_vecs.shape[0], sample_size, False)
        self.train_vecs = self.train_vecs[sample_idxs]
        self.train_labels = self.train_labels[sample_idxs]

    def fit_predict(self):
        """Fit ``self.clf`` on the training data; store test predictions in ``self.preds``."""
        clf = self.clf
        clf.fit(self.train_vecs, self.train_labels)
        self.preds = clf.predict(self.test_vecs)

    def evaluate(self):
        """Print and return (precision, recall, f1) for the positive class (label 1).

        Requires ``fit_predict`` to have populated ``self.preds``. Any
        (prediction, label) pair that is not (1, 1), (1, 0) or (0, 0) is
        counted as a false negative. All three metrics are floats, and each is
        0.0 when its denominator would be zero.
        """
        test_labels = self.test_labels
        preds = self.preds
        assert(len(test_labels)==len(preds)), "Prediction length differs from test lengths"
        tp, fp, tn, fn = 0, 0, 0, 0
        for pred, truth in zip(preds, test_labels):
            if pred == 1 and truth == 1: tp += 1
            elif pred == 1 and truth == 0: fp += 1
            elif pred == 0 and truth == 0: tn += 1
            else: fn += 1
        precision = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
        # Harmonic mean; kept as a float in the degenerate case for a
        # consistent return type.
        f1 = 2 / (1/precision + 1/recall) if precision > 0 and recall > 0 else 0.0
        # Parenthesised single-argument form prints identically on Python 2
        # and is also valid Python 3.
        print("Precision: {}, Recall: {}, F1: {}".format(precision, recall, f1))
        return precision, recall, f1

In [432]:
class SVMClassifier(Classifier):
    """Classifier whose estimator is a linear-kernel ``svm.SVC``."""

    def __init__(self, fe, C=0.3):
        """Copy data from ``fe`` and build the SVM.

        Args:
            fe: feature extractor exposing train/test vectors and labels.
            C: value forwarded to ``svm.SVC`` as its ``C`` argument
                (previously ignored in favour of a hard-coded 0.3).
        """
        # Reuse the base initializer instead of duplicating the attribute copies.
        super(SVMClassifier, self).__init__(fe)
        self.clf = svm.SVC(C=C, kernel='linear')

In [433]:
def glove_routine(context_path, train_path, test_path, sample_size=None):
    """Train and evaluate a linear SVM on 25-, 100- and 200-d GloVe features.

    Uses the module-level ``w2v_25``, ``w2v_100`` and ``w2v_200`` lookups.
    For each one a banner is printed, followed by the metrics printed by
    ``SVMClassifier.evaluate`` and a blank line.

    Args:
        context_path: unused; present for signature parity with
            ``word2vec_routine``.
        train_path: training CSV path.
        test_path: test CSV path.
        sample_size: if truthy, number of training rows to sample before
            fitting.
    """
    # The 25-d run previously aliased an undefined global `w2v`, which both
    # failed on a fresh kernel and shadowed the loaded `w2v_25`.
    runs = (("25", w2v_25), ("100", w2v_100), ("200", w2v_200))
    for dim_label, glove in runs:
        print("################# GLOVE {}-DIM ##################".format(dim_label))
        fe = GloveFeatureExtractor(train_path, test_path, None, glove)
        clf = SVMClassifier(fe, C=0.3)
        clf.sample_data(sample_size)
        clf.fit_predict()
        clf.evaluate()
        print("")

In [434]:
def word2vec_routine(context_path, train_path, test_path, sample_size=None, iters=1):
    """Repeatedly train Word2Vec at sizes 25/100/200 and score a linear SVM.

    Args:
        context_path: CSV whose texts train each Word2Vec model.
        train_path: training CSV path.
        test_path: test CSV path.
        sample_size: if truthy, number of training rows to sample before
            fitting.
        iters: number of repetitions (an integer count).

    Returns:
        (dim25, dim100, dim200): three lists, each holding one
        (precision, recall, f1) tuple per repetition.
    """
    sizes = (25, 100, 200)
    results = dict((size, []) for size in sizes)
    # `iters` is a count; iterating the integer itself raised TypeError.
    for _ in range(iters):
        for size in sizes:
            fe = Word2VecFeatureExtractor(train_path, test_path, context_path, size=size)
            clf = SVMClassifier(fe, C=0.3)
            clf.sample_data(sample_size)
            clf.fit_predict()
            results[size].append(clf.evaluate())
    return results[25], results[100], results[200]

In [435]:
# Kaepernick dataset: the context corpus (its texts train Word2Vec) plus the
# labelled train/test CSVs consumed by the feature extractors.
context_path = 'data/KaepernickStorm.csv'
train_path = 'data/KaepernickTrain.csv'
test_path = 'data/KaepernickTest.csv'

In [436]:
# GloVe comparison on the current paths, using every training row
# (sample_size is left at its default of None).
glove_routine(context_path, train_path, test_path)

################# GLOVE 25-DIM ##################
Precision: 0.65274151436, Recall: 0.886524822695, F1: 0.751879699248

################# GLOVE 100-DIM ##################
Precision: 0.778135048232, Recall: 0.858156028369, F1: 0.816188870152

################# GLOVE 200-DIM ##################
Precision: 0.799342105263, Recall: 0.86170212766, F1: 0.829351535836



In [437]:
# Word2Vec counterpart of the GloVe run: same paths, default sample_size
# (no sub-sampling) and default iters.
word2vec_routine(context_path, train_path, test_path)

################# W2V 25-DIM ##################
Precision: 0.78892733564, Recall: 0.808510638298, F1: 0.798598949212

################# W2V 100-DIM ##################
Precision: 0.766990291262, Recall: 0.840425531915, F1: 0.802030456853

################# W2V 200-DIM ##################
Precision: 0.778145695364, Recall: 0.833333333333, F1: 0.804794520548



In [438]:
# Same GloVe comparison, but each classifier is fitted on only 30 training
# rows drawn at random without replacement.
glove_routine(context_path, train_path, test_path, sample_size=30)

################# GLOVE 25-DIM ##################
Precision: 0.564, Recall: 1.0, F1: 0.721227621483

################# GLOVE 100-DIM ##################
Precision: 0.823529411765, Recall: 0.0496453900709, F1: 0.0936454849498

################# GLOVE 200-DIM ##################
Precision: 0.564, Recall: 1.0, F1: 0.721227621483



In [439]:
# NOTE(review): the GloVe cell just before this passes sample_size=30, but this
# call does not, so it runs on the full training set - confirm whether
# sample_size=30 was intended here.
word2vec_routine(context_path, train_path, test_path)

################# W2V 25-DIM ##################
Precision: 0.777385159011, Recall: 0.780141843972, F1: 0.778761061947

################# W2V 100-DIM ##################
Precision: 0.76897689769, Recall: 0.826241134752, F1: 0.796581196581

################# W2V 200-DIM ##################
Precision: 0.766666666667, Recall: 0.815602836879, F1: 0.790378006873



Evaluating 100-d Word2Vec Model on UserTrend's Local Month Twitter corpus

In [374]:
# 100-d Word2Vec: pass the size explicitly so the call matches the section
# heading and does not depend on a constructor default.
fe = Word2VecFeatureExtractor(train_path, test_path, context_path, size=100)
clf = SVMClassifier(fe, C=0.3)

In [375]:
# Fit the SVM built in the previous cell, predict on the test vectors, then
# print precision / recall / F1.
clf.fit_predict()
clf.evaluate()

Precision: 0.771241830065, Recall: 0.836879432624, F1: 0.802721088435


Evaluating 25-d Word2Vec Model on UserTrend's Local Month Twitter corpus

In [378]:
# 25-d Word2Vec trained on the context corpus, scored with the linear SVM
# on the full training set.
fe = Word2VecFeatureExtractor(train_path, test_path, context_path, size=25)
clf = SVMClassifier(fe, C=0.3)
clf.fit_predict()
clf.evaluate()

Precision: 0.805653710247, Recall: 0.808510638298, F1: 0.807079646018


In [383]:
# 200-d Word2Vec trained on the context corpus, scored with the linear SVM
# on the full training set.
fe = Word2VecFeatureExtractor(train_path, test_path, context_path, size=200)
clf = SVMClassifier(fe, C=0.3)
clf.fit_predict()
clf.evaluate()

Precision: 0.766666666667, Recall: 0.815602836879, F1: 0.790378006873


Evaluating 25-d GloVe Model on Pre-trained Twitter corpus

In [380]:
# 25-d GloVe: use the vectors loaded into w2v_25 directly. The former
# `w2v_25 = w2v` alias relied on a name that no cell in this notebook defines
# and overwrote the loaded lookup.
fe = GloveFeatureExtractor(train_path, test_path, None, w2v_25)
clf = SVMClassifier(fe, C=0.3)
clf.fit_predict()
clf.evaluate()

Precision: 0.65274151436, Recall: 0.886524822695, F1: 0.751879699248


Evaluating 100-d GloVe Model on Pre-trained Twitter corpus

In [382]:
# 100-d GloVe features scored with the linear SVM on the full training set;
# the context path is passed as None.
fe = GloveFeatureExtractor(train_path, test_path, None, w2v_100)
clf = SVMClassifier(fe, C=0.3)
clf.fit_predict()
clf.evaluate()

Precision: 0.778135048232, Recall: 0.858156028369, F1: 0.816188870152


Evaluating 200-d GloVe Model on Pre-trained Twitter corpus

In [385]:
# 200-d GloVe features scored with the linear SVM on the full training set;
# the context path is passed as None.
fe = GloveFeatureExtractor(train_path, test_path, None, w2v_200)
clf = SVMClassifier(fe, C=0.3)
clf.fit_predict()
clf.evaluate()

Precision: 0.799342105263, Recall: 0.86170212766, F1: 0.829351535836


In [421]:
# Switch the three path variables to the MelaniaTrump dataset; cells below
# that read context_path / train_path / test_path now use these files.
context_path = 'data/MelaniaTrumpStorm.csv'
train_path = 'data/MelaniaTrumpTrain.csv'
test_path = 'data/MelaniaTrumpTest.csv'

In [422]:
# GloVe comparison on the current paths with 30 sampled training rows per run.
# One loop replaces three copy-pasted stanzas, and the 25-d run uses the
# loaded w2v_25 instead of aliasing `w2v`, which no cell in this notebook
# defines.
for dim_label, glove in (("25", w2v_25), ("100", w2v_100), ("200", w2v_200)):
    print("################# GLOVE {}-DIM ##################".format(dim_label))
    fe = GloveFeatureExtractor(train_path, test_path, None, glove)
    clf = SVMClassifier(fe, C=0.3)
    clf.sample_data(30)
    clf.fit_predict()
    clf.evaluate()
    print("")

################# GLOVE 25-DIM ##################
Precision: 0.767441860465, Recall: 0.673469387755, F1: 0.717391304348

################# GLOVE 100-DIM ##################
Precision: 0.75, Recall: 0.0612244897959, F1: 0.11320754717

################# GLOVE 200-DIM ##################
Precision: 0.59756097561, Recall: 1.0, F1: 0.748091603053



In [423]:
# Word2Vec comparison on the current paths: for each embedding size, train on
# the context corpus, fit the SVM on 30 sampled training rows and print the
# metrics followed by a blank line.
for w2v_size in (25, 100, 200):
    print("################# W2V {}-DIM ##################".format(w2v_size))
    fe = Word2VecFeatureExtractor(train_path, test_path, context_path, size=w2v_size)
    clf = SVMClassifier(fe, C=0.3)
    clf.sample_data(30)
    clf.fit_predict()
    clf.evaluate()
    print("")

################# W2V 25-DIM ##################
Precision: 0.713567839196, Recall: 0.965986394558, F1: 0.820809248555

################# W2V 100-DIM ##################
Precision: 0.751479289941, Recall: 0.863945578231, F1: 0.803797468354

################# W2V 200-DIM ##################
Precision: 0.588, Recall: 1.0, F1: 0.740554156171

