In [33]:
# Author: Alex Kolchinski
# Code liberally inspired by and lifted from:
# https://github.com/NLPrinceton/SARC
# https://github.com/cgpotts/cs224u

In [34]:
import os
import csv
import json
from collections import Counter
from collections import defaultdict
from itertools import islice, chain

import nltk
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


In [35]:
# Directory holding the SARC 2.0 "pol" files, relative to this notebook.
pol_dir = '../SARC/2.0/pol'
# comments.json is loaded as the comment lookup; train-balanced.csv lists the training rows.
comments_file, train_file = (
    os.path.join(pol_dir, name) for name in ('comments.json', 'train-balanced.csv')
)

In [36]:
# Load the comment lookup used below as comments[comment_id]['text'].
# JSON text is UTF-8; pin the encoding so the load does not depend on the
# platform's locale default.
with open(comments_file, 'r', encoding='utf-8') as f:
    comments = json.load(f)

In [37]:
# Peek at the first two (comment id, comment record) entries.
for comment_id, record in islice(comments.items(), 2):
    print((comment_id, record), '\n')

('7uxqr', {'text': 'Nancyt Pelosi messes up.. 500 Million Jobs lost every month that the economic recovery plan is not passed.. LMAO', 'author': 'Fishbum', 'score': 0, 'ups': 2, 'downs': 4, 'date': '2009-02', 'created_utc': 1233788424, 'subreddit': 'politics'}) 

('7vewt', {'text': 'Netflix CEO: "Please raise my taxes"', 'author': 'jdl2003', 'score': 1733, 'ups': 1985, 'downs': 252, 'date': '2009-02', 'created_utc': 1233940024, 'subreddit': 'politics'}) 



In [38]:
# Parse the balanced training file. Each row is '|'-delimited with three
# space-separated fields: ancestor comment ids, response comment ids, labels.
# Ids are resolved to comment text through the `comments` lookup.
train_ancestors = []
train_responses = []
train_labels = []
lower = True  # lowercase all comment text when True
# newline='' is what the csv module asks for when handed a file object.
with open(train_file, 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='|')
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to parse.
            continue
        ancestors = [comments[r]['text'] for r in row[0].split(' ')]
        responses = [comments[r]['text'] for r in row[1].split(' ')]
        labels = row[2].split(' ')
        if lower:
            ancestors = [text.lower() for text in ancestors]
            responses = [text.lower() for text in responses]
        train_ancestors.append(ancestors)
        train_responses.append(responses)
        train_labels.append(labels)

In [39]:
# (number of ancestors, number of responses) for the first nine training rows.
[(len(anc), len(resp)) for anc, resp in islice(zip(train_ancestors, train_responses), 9)]

[(1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (2, 2)]

In [40]:
# Show ancestors, responses and labels for the training row at index 8.
tuple(column[8] for column in (train_ancestors, train_responses, train_labels))

(['new jersey governor chris christie thinks a family making $6,000 a year is too rich to qualify for medicaid',
  "if you have a kid and the best you can do is a minimum wage job then i think you've made some bad choices in the past"],
 ["yah, at that point they don't even count as people anymore!",
  'do you by any chance know what the term "structural unemployment" means?'],
 ['1', '0'])

In [41]:
# Count how often each token occurs across every training response.
# Counter tallies an iterable directly, so no defaultdict detour is needed.
train_vocab = Counter(
    w
    for pair in train_responses
    for comment in pair
    for w in nltk.word_tokenize(comment)
)
print(len(train_vocab))

13631


In [42]:
# Five most frequent tokens with their counts.
train_vocab.most_common(n=5)

[('.', 7226), ('the', 6553), (',', 5269), ('to', 4080), ('a', 3342)]

In [43]:
def unigrams_phi_c(comment):
    """Return a Counter mapping each token of `comment` to its frequency.

    Tokens come from nltk.word_tokenize.
    """
    token_counts = Counter()
    token_counts.update(nltk.word_tokenize(comment))
    return token_counts

In [44]:
def concat_phi_r(response_features_pair):
    """Join the feature arrays of a two-response pair end to end.

    `response_features_pair` must have length 2 (checked with an assert);
    the result is np.concatenate of element 0 followed by element 1.
    """
    assert len(response_features_pair) == 2
    first_feats = response_features_pair[0]
    second_feats = response_features_pair[1]
    return np.concatenate([first_feats, second_feats])
    

In [56]:
#phi_c turns comments into features
#phi_a combines ancestor features into summary
#phi_r combines response features into summary
#Note that this is for the "balanced" framing!
#TODO: Initially ignoring ancestors, include them as another vector later
def build_dataset(ancestors, responses, labels, phi_c, phi_a, phi_r, vectorizer=None, vectorize = True):
    """Featurize response pairs into a feature matrix and a label vector.

    Parameters
    ----------
    ancestors : sequence
        One entry per example. Not featurized here; only returned in
        'raw_examples' and used for the length check.
    responses : sequence
        One entry per example, each holding exactly two comments.
    labels : sequence
        One entry per example; only labels[i][0] is used, converted with int().
    phi_c : callable
        Maps a single comment to its features. When `vectorize` is true the
        result is fed to the vectorizer; otherwise it is passed to `phi_r` as is.
    phi_a : object
        Currently unused (see the TODO above).
    phi_r : callable
        Receives a 2-tuple (features of response 0, features of response 1)
        and returns the feature row for the example.
    vectorizer : object or None
        When `vectorize` is true and this is None, a new
        DictVectorizer(sparse=False) is fitted on all comment features;
        otherwise the given object's transform() is used.
    vectorize : bool
        Whether to run the per-comment features through a vectorizer.

    Returns
    -------
    dict with keys 'X' (np.array of phi_r outputs), 'y' (np.array of int
    labels), 'vectorizer' (the vectorizer used, or the argument unchanged
    when `vectorize` is false) and 'raw_examples' ((ancestors, responses)).
    """
    N = len(ancestors)
    assert N == len(responses) == len(labels)
    print(N)

    Y = []
    feat_dicts = [[],[]]
    for i in range(N):
        assert len(responses[i]) == 2
        feat_dicts[0].append(phi_c(responses[i][0]))
        feat_dicts[1].append(phi_c(responses[i][1]))

        #We only care about the first of the two labels since in the balanced setting
        #they're either 0 1 or 1 0
        Y.append(int(labels[i][0]))

    if vectorize:
        # Stack all first responses, then all second responses, so the
        # resulting matrix can be split back at row N.
        stacked = feat_dicts[0] + feat_dicts[1]
        # Identity check: `== None` would defer to the object's own __eq__.
        if vectorizer is None:
            # In training, we want a new vectorizer:
            vectorizer = DictVectorizer(sparse=False)
            feat_matrix = vectorizer.fit_transform(stacked)
        else:
            # In assessment, we featurize using the existing vectorizer:
            feat_matrix = vectorizer.transform(stacked)

        response_pair_feats = [feat_matrix[:N], feat_matrix[N:]]
    else:
        response_pair_feats = feat_dicts

    X = [phi_r((response_pair_feats[0][i], response_pair_feats[1][i])) for i in range(N)]

    return {'X': np.array(X),
            'y': np.array(Y),
            'vectorizer': vectorizer,
            'raw_examples': (ancestors, responses)}

        

In [46]:
# Stand-alone run of the per-response featurization step: unigram counts
# for the first and second response of every training pair.
responses = train_responses
phi_c = unigrams_phi_c
N = len(responses)
feat_dicts = [[], []]
for i, pair in enumerate(responses):
    assert len(pair) == 2
    for slot in (0, 1):
        feat_dicts[slot].append(phi_c(pair[slot]))

In [47]:
# Unigram-count features for each response, concatenated per pair.
unigram_dataset = build_dataset(
    ancestors=train_ancestors,
    responses=train_responses,
    labels=train_labels,
    phi_c=unigrams_phi_c,
    phi_a=None,
    phi_r=concat_phi_r,
)

unigram_dataset['X'].shape

6834


(6834, 27262)

In [48]:
def fit_maxent_classifier(X, y):
    """Fit a LogisticRegression (with intercept) on X, y and return the fitted model."""
    return LogisticRegression(fit_intercept=True).fit(X, y)

In [49]:
def fit_naive_bayes_classifier(X, y):
    """Fit a MultinomialNB model on X, y and return the fitted model."""
    return MultinomialNB().fit(X, y)

In [50]:
def xval_model(model_fit_fn, X, y, folds):
    """Cross-validate a model and print one classification report per fold.

    model_fit_fn: callable (X_train, y_train) -> fitted model with .predict
    X, y: arrays indexable by the index arrays that KFold.split yields
    folds: number of KFold splits
    """
    splitter = KFold(n_splits=folds)
    for train_idx, test_idx in splitter.split(X, y):
        fitted = model_fit_fn(X[train_idx], y[train_idx])
        predicted = fitted.predict(X[test_idx])
        report = classification_report(y[test_idx], predicted, digits=3)
        print(report)

In [75]:
# 10-fold cross-validation of logistic regression on the unigram features.
xval_model(
    model_fit_fn=fit_maxent_classifier,
    X=unigram_dataset['X'],
    y=unigram_dataset['y'],
    folds=10,
)

             precision    recall  f1-score   support

          0      0.738     0.727     0.732       352
          1      0.715     0.726     0.720       332

avg / total      0.727     0.727     0.727       684

             precision    recall  f1-score   support

          0      0.698     0.729     0.713       329
          1      0.738     0.707     0.722       355

avg / total      0.719     0.718     0.718       684

             precision    recall  f1-score   support

          0      0.748     0.757     0.752       345
          1      0.749     0.740     0.745       339

avg / total      0.749     0.749     0.749       684

             precision    recall  f1-score   support

          0      0.747     0.745     0.746       365
          1      0.709     0.712     0.710       319

avg / total      0.730     0.730     0.730       684

             precision    recall  f1-score   support

          0      0.730     0.723     0.726       347
          1      0.717     0.723 

In [None]:
# 3-fold cross-validation of multinomial naive Bayes on the unigram features.
xval_model(
    model_fit_fn=fit_naive_bayes_classifier,
    X=unigram_dataset['X'],
    y=unigram_dataset['y'],
    folds=3,
)

In [52]:
# Build token -> vector lookup from the fastText .vec file, keeping only
# tokens that occur in train_vocab.
i=0
fasttext_lookup = {}
with open('../../static/wiki-news-300d-1M-subword.vec', encoding='utf-8') as f:
    # Iterate the file directly: read/decode errors and Ctrl-C now propagate
    # instead of being swallowed and silently ending the load early.
    for line in f:
        fields = line.strip().split()
        if not fields:
            continue  # blank line
        idx = fields[0]
        if idx not in train_vocab: continue
        if idx in fasttext_lookup:
            print("Duplicate! ", idx)
        try:
            vec = np.array(fields[1:], dtype=np.float32)
        except ValueError:
            # Line whose remaining fields are not all numeric; skip it.
            continue
        fasttext_lookup[idx] = vec
        i += 1
        if i%500 == 0: print(i)


print(len(fasttext_lookup))
print(type(fasttext_lookup['the']), fasttext_lookup['the'].shape, sum(fasttext_lookup['the']))

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
11989
<class 'numpy.ndarray'> (300,) 0.03160001061769435


In [67]:
def embed_phi_c(comment, embeddings):
    """Sum the embedding vectors of the tokens in `comment`.

    Parameters
    ----------
    comment : str
        Text to tokenize with nltk.word_tokenize.
    embeddings : mapping of token -> np.ndarray
        Must be non-empty; the shape of its first value defines the shape of
        the zero vector used for tokens that are not in the mapping.

    Returns
    -------
    np.ndarray with the shape of one embedding vector. A comment that
    yields no tokens returns the zero vector (np.sum over an empty list
    would otherwise collapse to the scalar 0.0).

    Raises
    ------
    ValueError
        If `embeddings` is empty, since the vector shape cannot be determined.
    """
    if not embeddings:
        raise ValueError("embeddings must contain at least one vector")
    words = nltk.word_tokenize(comment)
    unk = np.zeros(next(iter(embeddings.values())).shape)
    if not words:
        return unk
    return np.sum([embeddings[w] if w in embeddings else unk for w in words], axis=0)

In [54]:
def fasttext_phi_c(comment):
    """Featurize `comment` with embed_phi_c using the fasttext_lookup vectors."""
    features = embed_phi_c(comment, fasttext_lookup)
    return features

In [57]:
# fastText features: phi_c already returns arrays, so skip the vectorizer.
fasttext_dataset = build_dataset(
    ancestors=train_ancestors,
    responses=train_responses,
    labels=train_labels,
    phi_c=fasttext_phi_c,
    phi_a=None,
    phi_r=concat_phi_r,
    vectorizer=None,
    vectorize=False,
)

fasttext_dataset['X'].shape

6834


(6834, 600)

In [61]:
# 3-fold cross-validation of logistic regression on the fastText features.
xval_model(
    model_fit_fn=fit_maxent_classifier,
    X=fasttext_dataset['X'],
    y=fasttext_dataset['y'],
    folds=3,
)

             precision    recall  f1-score   support

          0      0.704     0.711     0.707      1147
          1      0.704     0.698     0.701      1131

avg / total      0.704     0.704     0.704      2278

             precision    recall  f1-score   support

          0      0.716     0.702     0.709      1156
          1      0.699     0.713     0.706      1122

avg / total      0.707     0.707     0.707      2278

             precision    recall  f1-score   support

          0      0.698     0.707     0.702      1135
          1      0.705     0.696     0.701      1143

avg / total      0.702     0.701     0.701      2278



In [64]:
%ls ../../static/glove/

[0m[01;32mgen_dot_word2vecs.sh[0m*  [01;32mglove.6B.100d.txt[0m*  [01;32mglove.6B.50d.txt[0m*


In [71]:
# Build token -> vector lookup from a GloVe text file, keeping only tokens
# that occur in train_vocab.
i=0
glove_lookup = {}
# NOTE(review): the path previously read '../../static/', which is a directory
# and cannot be opened as a file. Restored the 300d file named in the
# commented-out line that accompanied it — confirm the file exists at this path.
with open('../../static/glove/glove.6B.300d.txt', encoding='utf-8') as f:
    # Iterate the file directly: read/decode errors and Ctrl-C now propagate
    # instead of being swallowed and silently ending the load early.
    for line in f:
        fields = line.strip().split()
        if not fields:
            continue  # blank line
        idx = fields[0]
        if idx not in train_vocab: continue
        if idx in glove_lookup:
            print("Duplicate! ", idx)
        try:
            vec = np.array(fields[1:], dtype=np.float32)
        except ValueError:
            # Line whose remaining fields are not all numeric; skip it.
            continue
        glove_lookup[idx] = vec
        i += 1
        if i%500 == 0: print(i)


print(len(glove_lookup))
print(type(glove_lookup['the']), glove_lookup['the'].shape, sum(glove_lookup['the']))

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
11821
<class 'numpy.ndarray'> (300,) 3.823568901862018


In [72]:
def glove_phi_c(comment):
    """Featurize `comment` with embed_phi_c using the glove_lookup vectors."""
    features = embed_phi_c(comment, glove_lookup)
    return features

# GloVe features: phi_c already returns arrays, so skip the vectorizer.
glove_dataset = build_dataset(
    train_ancestors, train_responses, train_labels, glove_phi_c, None, concat_phi_r, None, False)

# Show the shape of the dataset just built (this previously displayed
# fasttext_dataset's shape, a copy-paste slip).
glove_dataset['X'].shape

6834


(6834, 600)

In [76]:
# 5-fold cross-validation of logistic regression on the GloVe features.
xval_model(
    model_fit_fn=fit_maxent_classifier,
    X=glove_dataset['X'],
    y=glove_dataset['y'],
    folds=5,
)

             precision    recall  f1-score   support

          0      0.671     0.677     0.674       681
          1      0.676     0.671     0.673       686

avg / total      0.674     0.674     0.674      1367

             precision    recall  f1-score   support

          0      0.688     0.684     0.686       708
          1      0.663     0.668     0.665       659

avg / total      0.676     0.676     0.676      1367

             precision    recall  f1-score   support

          0      0.677     0.654     0.665       679
          1      0.669     0.692     0.680       688

avg / total      0.673     0.673     0.673      1367

             precision    recall  f1-score   support

          0      0.668     0.659     0.663       680
          1      0.667     0.675     0.671       687

avg / total      0.667     0.667     0.667      1367

             precision    recall  f1-score   support

          0      0.680     0.688     0.684       690
          1      0.678     0.669 