In [None]:
# Author: Alex Kolchinski
# Code liberally inspired by and lifted from:
# https://github.com/NLPrinceton/SARC
# https://github.com/cgpotts/cs224u

In [181]:
import os
import csv
import json
from itertools import islice, chain
import nltk
from collections import Counter
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


In [8]:
pol_dir = '../SARC/2.0/pol'
comments_file = os.path.join(pol_dir, 'comments.json')
train_file = os.path.join(pol_dir, 'train-balanced.csv')

In [10]:
with open(comments_file, 'r') as f:
    comments = json.load(f)

In [25]:
for x in islice(comments.items(), 2):
    print(x, '\n')

('7uxqr', {'text': 'Nancyt Pelosi messes up.. 500 Million Jobs lost every month that the economic recovery plan is not passed.. LMAO', 'author': 'Fishbum', 'score': 0, 'ups': 2, 'downs': 4, 'date': '2009-02', 'created_utc': 1233788424, 'subreddit': 'politics'}) 

('7vewt', {'text': 'Netflix CEO: "Please raise my taxes"', 'author': 'jdl2003', 'score': 1733, 'ups': 1985, 'downs': 252, 'date': '2009-02', 'created_utc': 1233940024, 'subreddit': 'politics'}) 



In [37]:
train_ancestors = []
train_responses = []
train_labels = []
lower = True
with open(train_file, 'r') as f:
    reader = csv.reader(f, delimiter='|')
    for row in reader:
        ancestors = row[0].split(' ')
        responses = row[1].split(' ')
        labels = row[2].split(' ')
        if lower:
            train_ancestors.append([comments[r]['text'].lower() for r in ancestors])
            train_responses.append([comments[r]['text'].lower() for r in responses])
        else:
            train_ancestors.append([comments[r]['text'] for r in ancestors])
            train_responses.append([comments[r]['text'] for r in responses])
        train_labels.append(labels)

In [66]:
[(len(x), len(y)) for x,y in zip(train_ancestors, train_responses)][:9]

[(1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (2, 2)]

In [77]:
train_ancestors[8], train_responses[8], train_labels[8]

(['new jersey governor chris christie thinks a family making $6,000 a year is too rich to qualify for medicaid',
  "if you have a kid and the best you can do is a minimum wage job then i think you've made some bad choices in the past"],
 ["yah, at that point they don't even count as people anymore!",
  'do you by any chance know what the term "structural unemployment" means?'],
 ['1', '0'])

In [272]:
from collections import defaultdict
train_vocab = defaultdict(int)
for pair in train_responses:
    for comment in pair:
        for w in nltk.word_tokenize(comment):
            train_vocab[w] += 1
len(train_vocab)

13631

In [273]:
train_vocab['oh']

214

In [120]:
def unigrams_phi_c(comment):
    return Counter(nltk.word_tokenize(comment))

In [309]:
def concat_phi_r(response_features_pair):
    print(response_features_pair.shape)
    assert len(response_features_pair) == 2
    #print(list(map(len, response_features_pair)))
    cat = np.concatenate((response_features_pair[0], response_features_pair[1]))
    return cat
    

In [313]:
#phi_c turns comments into features
#phi_a combines ancestor features into summary
#phi_r combines response features into summary
#Note that this is for the "balanced" framing!
#TODO: Initially ignoring ancestors, include them as another vector later
def build_dataset(ancestors, responses, labels, phi_c, phi_a, phi_r, vectorizer=None, vectorize = True):
    X = []
    Y = []
    feat_dicts = [[],[]]
    N = len(ancestors)
    assert N == len(responses) == len(labels)
    print(N)
    for i in range(N):
        assert len(responses[i]) == 2
        feat_dicts[0].append(phi_c(responses[i][0]))
        feat_dicts[1].append(phi_c(responses[i][1]))
    
        #We only care about the first of the two labels since in the balanced setting
        #they're either 0 1 or 1 0
        Y.append(int(labels[i][0]))
        
    print(feat_dicts[0][0])
    
    if vectorize:
        # In training, we want a new vectorizer:
        if vectorizer == None:
            vectorizer = DictVectorizer(sparse=False)
            #print(feat_dicts[0][:10], feat_dicts[1][:10])
            feat_matrix = vectorizer.fit_transform(feat_dicts[0] + feat_dicts[1])
        # In assessment, we featurize using the existing vectorizer:
        else:
            feat_matrix = vectorizer.transform(chain(feat_dicts[0], feat_dicts[1]))
        
        response_pair_feats = zip(feat_matrix[:N], feat_matrix[N:])
    else:
        #print(np.array(feat_dicts).shape)
        #response_pair_feats = list(zip(feat_dicts))
        #print(len(response_pair_feats))
        response_pair_feats = np.transpose(np.array(feat_dicts))
        print(response_pair_feats[0])

    #assert len(feat_matrix == 2*N) 
    #print((feat_matrix[0]), len(feat_matrix[1]))
    
    X = list(map(phi_r, response_pair_feats))
    
    return {'X': np.array(X),
            'y': np.array(Y),
            'vectorizer': vectorizer,
            'raw_examples': (ancestors, responses)}

        

In [None]:
dataset = build_dataset(train_ancestors, train_responses, train_labels, unigrams_phi_c, None, concat_phi_r)

dataset['X'].shape

In [182]:
def fit_maxent_classifier(X, y):  
    #print(X.shape, y.shape)
    mod = LogisticRegression(fit_intercept=True)
    mod.fit(X, y)
    return mod

In [186]:
def fit_naive_bayes_classifier(X, y):  
    #print(X.shape, y.shape)
    mod = MultinomialNB()
    mod.fit(X, y)
    return mod

In [179]:
def xval_model(model_fit_fn, X, y, folds):
    kf = KFold(folds)
    for train, test in kf.split(X, y):
        model = model_fit_fn(X[train], y[train])
        predictions = model.predict(X[test])
        print(classification_report(y[test], predictions, digits=3))

In [231]:
xval_model(fit_maxent_classifier, dataset['X'], dataset['y'], 3)

             precision    recall  f1-score   support

          0      0.723     0.736     0.729      1147
          1      0.727     0.714     0.720      1131

avg / total      0.725     0.725     0.725      2278

             precision    recall  f1-score   support

          0      0.712     0.711     0.711      1156
          1      0.703     0.703     0.703      1122

avg / total      0.707     0.707     0.707      2278

             precision    recall  f1-score   support

          0      0.712     0.718     0.715      1135
          1      0.718     0.711     0.714      1143

avg / total      0.715     0.715     0.715      2278



In [232]:
xval_model(fit_naive_bayes_classifier, dataset['X'], dataset['y'], 3)

             precision    recall  f1-score   support

          0      0.728     0.733     0.730      1147
          1      0.727     0.721     0.724      1131

avg / total      0.727     0.727     0.727      2278

             precision    recall  f1-score   support

          0      0.696     0.716     0.706      1156
          1      0.699     0.677     0.688      1122

avg / total      0.697     0.697     0.697      2278

             precision    recall  f1-score   support

          0      0.710     0.720     0.715      1135
          1      0.718     0.708     0.713      1143

avg / total      0.714     0.714     0.714      2278



In [274]:
fasttext_lookup = {}
with open('../../static/wiki-news-300d-1M-subword.vec') as f:
    while True:
        try:
            x = next(f)
        except:
            break
        try:
            fields = x.strip().split()
            idx = fields[0]
            if idx not in train_vocab: continue
            if i%100 == 0: print(i)
            vec = np.array(fields[1:], dtype=np.float32)
            fasttext_lookup[fields[0]] = vec
        except:
            pass


In [283]:
def embed_phi_c(comment, embeddings, vocab_index):
    words = nltk.word_tokenize(comment)
    return np.sum([embeddings[train_vocab[x]] for x in words])

In [275]:
fasttext_embeds = np.array([fasttext_lookup[x] if x in fasttext_lookup else np.zeros(300) for x in train_vocab])
fasttext_embeds.shape

(13631, 300)

In [282]:
vocab_index = {word : idx for idx,word in train_vocab.items()}
train_vocab['oh']

214

In [259]:
def fasttext_phi_c(comment):
    return embed_phi_c(comment, fasttext_embeds, vocab_index)

In [314]:
dataset = build_dataset(train_ancestors, train_responses, train_labels, fasttext_phi_c, None, concat_phi_r, None, False)

dataset['X'].shape

6834
0.0
[0. 0.]
(2,)


ValueError: zero-dimensional arrays cannot be concatenated

In [None]:
xval_model(fit_maxent_classifier, dataset['X'], dataset['y'], 3)