In [14]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import multiprocessing
# Number of CPU cores reported by the OS; passed to Doc2Vec below as its `workers` count.
cores = multiprocessing.cpu_count()
# Apply matplotlib's 'ggplot' style sheet to any figures drawn later in the notebook.
plt.style.use('ggplot')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline  

In [2]:
# Load the sarcasm dataset, discard the metadata columns that are not used as
# features, then remove every row that still contains a missing value.
unused_columns = ['author', 'ups', 'downs', 'date']
df = (
    pd.read_csv('../data/train-balanced-sarcasm.csv')
    .drop(columns=unused_columns)
    .dropna()
)

In [3]:
# Preview the first five rows to sanity-check the columns that survived the cleanup.
df.head(n=5)

Unnamed: 0,label,comment,subreddit,score,created_utc,parent_comment
0,0,NC and NH.,politics,2,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,nba,-4,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",nfl,3,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,MaddenUltimateTeam,6,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [4]:
# Re-index our dataframe after dropping nulls.
# reset_index derives the new 0..n-1 index from the frame itself, so this cell keeps
# working when the CSV (and therefore the surviving row count) changes; assigning a
# fixed-length range raises a length-mismatch error in that case.
df = df.reset_index(drop=True)
# Keep the shape as the cell's last expression so that it is actually displayed.
df.shape

In [12]:
# Hold out 30% of the rows for evaluation and keep 70% for training; the fixed seed
# makes the split reproducible. Tokenizing the comment and parent-comment columns
# happens in the statements that follow.
train, test = train_test_split(df, random_state=42, test_size=0.3)

def tokenize_text(text):
    """Return the lower-cased word tokens of `text`.

    The text is split into sentences and each sentence into words with NLTK;
    tokens shorter than two characters are skipped.
    """
    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if len(token) >= 2
    ]

def tag_documents(frame, text_column):
    """Turn one text column of `frame` into a Series of TaggedDocument objects.

    Each document holds the tokens of `row[text_column]` and is tagged with the
    row's `label`. The tag has to be the label (not the raw text) because the
    classifier cells read `doc.tags[0]` as the prediction target.
    """
    return frame.apply(
        lambda row: TaggedDocument(
            words=tokenize_text(row[text_column]),
            tags=[int(row['label'])],
        ),
        axis=1,
    )


# Documents built from the comment itself; these are what the cells below consume.
train_tagged = tag_documents(train, 'comment')
test_tagged = tag_documents(test, 'comment')

# Parent-comment documents get their own names so they no longer overwrite the
# comment documents above.
train_parent_tagged = tag_documents(train, 'parent_comment')
test_parent_tagged = tag_documents(test, 'parent_comment')

In [15]:
# Build our vocab
# dm=0 selects the distributed bag-of-words variant of Doc2Vec; one worker thread is
# requested per CPU core.
dbow_params = dict(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores)
model_dbow = Doc2Vec(**dbow_params)
model_dbow.build_vocab(list(tqdm(train_tagged.values)))

100%|██████████| 707541/707541 [00:00<00:00, 3606751.13it/s]


In [16]:
# Train our doc2vec model in gensim with 30 epochs
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 707541/707541 [00:00<00:00, 3421757.12it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3502196.27it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3643572.02it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3586413.07it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3759702.29it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3683555.86it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3330421.53it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3666752.39it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3749337.72it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3677011.01it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3430645.38it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3751432.61it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3583836.07it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3578313.44it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3404356.27it/s]
100%|██████████| 707541/707541 [00:00<00:00, 3464892.19it/s]
100%|██████████| 707541/

CPU times: user 34min 42s, sys: 2min 59s, total: 37min 42s
Wall time: 15min 37s


In [17]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Build classifier targets and feature vectors from tagged documents.

    Parameters
    ----------
    model : object exposing ``infer_vector(words, epochs=...)``
        The trained Doc2Vec model used to infer a vector for each document.
    tagged_docs : object with a ``.values`` sequence
        Documents exposing ``.words`` and ``.tags``; the first tag of each
        document is used as its target.
    epochs : int, optional
        Number of inference passes per document (default 20).

    Returns
    -------
    (targets, regressors) : tuple of tuples
        Parallel tuples, one entry per document. Both are empty when
        `tagged_docs` holds no documents.
    """
    docs = tagged_docs.values
    targets = tuple(doc.tags[0] for doc in docs)
    # `epochs` is the keyword accepted by current gensim releases; the older
    # `steps` spelling is rejected there.
    regressors = tuple(model.infer_vector(doc.words, epochs=epochs) for doc in docs)
    return targets, regressors

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Attempt a logistic regression on our data
# Infer one feature vector per document and pair it with the document's first tag.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

# Fit on the training vectors, then predict the held-out documents.
logreg = LogisticRegression(n_jobs=1, C=1e5).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Testing accuracy %s' % accuracy)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing F1 score: {}'.format(weighted_f1))

