In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import torch
import os
import json
import re
import random
from spacy.util import minibatch
from torch import nn
from pytorch_transformers import *
from sklearn.model_selection import train_test_split 
from datetime import datetime as dt
from dateutil import tz

In [2]:
# Ask spaCy to use the GPU when one is available; the call must come before
# the model is loaded below so the pipeline is created on the chosen device.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Keep torch's default tensor type on the GPU as well.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

# Load the RoBERTa-base spaCy pipeline and show its components.
nlp = spacy.load('en_pytt_robertabase_lg')
print(nlp.pipe_names) # ["sentencizer", "pytt_wordpiecer", "pytt_tok2vec"]

['sentencizer', 'pytt_wordpiecer', 'pytt_tok2vec']


In [None]:
# nltk is not imported in the notebook's import cell, so import it here;
# without this the cell raises NameError on a fresh kernel.
import nltk

# Fetch the Punkt sentence-tokenizer data, then load the English model.
nltk.download('punkt')
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Load the 'roberta.large' entry point from the pytorch/fairseq hub repository.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')

@inproceedings{ott2019fairseq,
  title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
  author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli},
  booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations},
  year = {2019},
}


In [None]:
# Both archives are pipe-separated with the submission id in the first column,
# so they are read with one shared set of options.
read_opts = dict(na_values=["[deleted]", "", 'N/A'], index_col=0, sep='|')

csv_path = 'dadjokes-subreddit-archive/data_dadjokes.csv'
jokes_path = 'dadjokes-subreddit-archive/data_jokes.csv'

jokes = pd.read_csv(jokes_path, **read_opts)
dj = pd.read_csv(csv_path, **read_opts)
dj.head()

In [None]:
# Preview the first rows of the /r/jokes frame.
jokes.head()

In [None]:
# Per-column missing-value counts: dadjokes first, then jokes.
for frame in (dj, jokes):
    print(frame.isna().sum())

In [None]:
def fill_na(df):
    """Replace missing ``title`` and ``selftext`` values with empty strings.

    The filled columns are assigned back onto ``df`` itself, so the caller's
    frame is updated and the same object is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``title`` and ``selftext`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with no missing values in the two text columns.
    """
    # Assign the result back instead of ``df.title.fillna('', inplace=True)``:
    # the in-place form operates on a temporary Series, which pandas flags as
    # chained assignment and which leaves ``df`` untouched under copy-on-write.
    for col in ('title', 'selftext'):
        df[col] = df[col].fillna('')
    return df

# Fill the text columns of both frames, then confirm the remaining NA counts.
dj, jokes = fill_na(dj), fill_na(jokes)
for frame in (dj, jokes):
    print(frame.isna().sum())

In [None]:
# One score-vs-comments scatter plot per subreddit, drawn with identical styling.
for frame, subreddit in ((dj, '/r/dadjokes'), (jokes, '/r/jokes')):
    plt.scatter(frame.score, frame.num_comments)
    plt.xlabel('Score')
    plt.ylabel('Total Comments')
    plt.title('Score vs. Number of Comments on\nSubmissions in ' + subreddit)
    plt.show()

In [None]:
# Log-log histograms of comment counts. The two subreddits use different bin
# edges (the /r/jokes plot has an extra edge at 50), so the edges travel with
# each frame.
comment_hists = (
    (dj, '/r/dadjokes', [0, 5, 20, 100, 500, 1500]),
    (jokes, '/r/jokes', [0, 5, 20, 50, 100, 500, 1500]),
)
for frame, subreddit, edges in comment_hists:
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Total Comments')
    plt.ylabel('Number of Submissions')
    plt.title('Submissions by Number of Comments\nin ' + subreddit)
    plt.hist(frame.num_comments, bins=edges)
    plt.show()

In [None]:
# Log-log histograms of submission scores; both subreddits share the same bins.
score_edges = [0, 5, 20, 100, 500, 40000]
for frame, subreddit in ((dj, '/r/dadjokes'), (jokes, '/r/jokes')):
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Score')
    plt.ylabel('Number of Submissions')
    plt.title('Submissions by Score (Upvotes) in ' + subreddit)
    plt.hist(frame.score, bins=score_edges)
    plt.show()

In [None]:
# Flag a submission as a crosspost when its parent creation time is present.
jokes['is_crosspost'] = jokes['parent_createdUTC'].notna()
# numeric_only=True restricts the averages to numeric columns; without it,
# pandas >= 2.0 raises TypeError on the text columns (title, selftext).
jokes.groupby(by='is_crosspost').mean(numeric_only=True)

In [None]:
# Flag a submission as a crosspost when its parent creation time is present.
dj['is_crosspost'] = dj['parent_createdUTC'].notna()
# numeric_only=True restricts the averages to numeric columns; without it,
# pandas >= 2.0 raises TypeError on the text columns (title, selftext).
dj.groupby(by='is_crosspost').mean(numeric_only=True)

##### Crossposts receive significantly lower scores and fewer comments (`num_comments`) than original posts in the dadjokes subreddit. This may be because viewers are easily redirected to the parent post to comment or upvote in other areas of Reddit. However, the contents of the crossposted jokes (title and selftext) are pulled from the parent posts. Therefore, crossposts will be analyzed separately when attempting to predict `score` and `num_comments`.

In [None]:
# Keep only original (non-crosspost) submissions and drop the two columns that
# were only needed to identify crossposts.
crosspost_cols = ['parent_createdUTC', 'is_crosspost']

main_dj = dj.loc[~dj.is_crosspost].drop(columns=crosspost_cols)
print(main_dj.shape)

main_jokes = jokes.loc[~jokes.is_crosspost].drop(columns=crosspost_cols)
print(main_jokes.shape)

In [None]:
# 80/20 split with a fixed seed, applied identically to both subreddits.
split_opts = dict(test_size=0.2, random_state=42)

train_dj, test_dj = train_test_split(main_dj, **split_opts)
print(f"Training sample size: {len(train_dj)}\nTesting sample size: {len(test_dj)}")

train_jokes, test_jokes = train_test_split(main_jokes, **split_opts)
print(f"Training sample size: {len(train_jokes)}\nTesting sample size: {len(test_jokes)}")

In [None]:
def cat_num(val):
    """Bucket a count into one of six ordinal categories (0-5).

    The inclusive upper bounds of categories 0-4 are 1, 5, 20, 100 and 500;
    anything that is not ``<=`` any bound (including NaN) lands in category 5.
    """
    upper_bounds = (1, 5, 20, 100, 500)
    for category, bound in enumerate(upper_bounds):
        if val <= bound:
            return category
    return len(upper_bounds)

def make_categories(data):
    """Add ``score_cat`` and ``comment_cat`` columns derived via ``cat_num``.

    ``score`` feeds ``score_cat`` and ``num_comments`` feeds ``comment_cat``.
    The columns are added to ``data`` itself, which is also returned.
    """
    for source_col, cat_col in (('score', 'score_cat'), ('num_comments', 'comment_cat')):
        data[cat_col] = data[source_col].apply(cat_num)
    return data


def prepare_text(data):
    """Clean ``title``/``selftext`` and build a RoBERTa-formatted ``joke`` column.

    Steps:
      1. Remove the HTML-escaped zero-width-space entity (``&amp;#x200B;``)
         from ``title`` and ``selftext``.
      2. Split both fields on runs of the literal two-character sequence
         ``\\n``, strip each piece and drop pieces that are empty or
         whitespace-only.
      3. Join the pieces with `` </s> </s> `` and wrap the result in
         ``<s> `` ... `` </s>``.

    The previous version did not parse (unbalanced parentheses) and referenced
    the undefined names ``new_sent`` and ``jokelist``.

    NOTE(review): splitting on the ``newlines`` pattern is the presumed intent
    (it was compiled but never used); confirm against the raw data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string ``title`` and ``selftext`` columns (no missing values).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with cleaned text columns and a new ``joke``
        column.
    """
    zero_width = '&amp;#x200B;'
    # Non-capturing group: with a capturing group, re.split would also return
    # the matched separators as pieces.
    newlines = re.compile(r'(?:\\n)+')
    empty = re.compile(r'\s*')

    data['title'] = data['title'].apply(lambda s: s.replace(zero_width, ''))
    data['selftext'] = data['selftext'].apply(lambda s: s.replace(zero_width, ''))

    def _format_joke(title, selftext):
        # Title pieces come first, then body pieces, in their original order.
        pieces = [
            piece.strip()
            for field in (title, selftext)
            for piece in newlines.split(field)
            if not empty.fullmatch(piece)
        ]
        return '<s> ' + ' </s> </s> '.join(pieces) + ' </s>'

    # A plain list (rather than DataFrame.apply) also works for an empty frame.
    data['joke'] = [_format_joke(t, s) for t, s in zip(data['title'], data['selftext'])]
    return data


In [None]:
# Add the category labels first, then build the formatted ``joke`` text.
train_dj = prepare_text(make_categories(train_dj))
train_dj.head()

In [None]:
# Dump the formatted training jokes to a text file, one per blank-line-separated
# paragraph. The handle is named ``out_file`` so it does not rebind ``jokes``
# (the /r/jokes DataFrame loaded earlier) to a closed file object, and UTF-8 is
# explicit so non-ASCII joke text is written the same way on every platform.
with open('dadjokes-subreddit-archive/dadjokes_train.txt', 'w', encoding='utf-8') as out_file:
    for joke in train_dj.joke:
        out_file.write(joke + '\n\n')

# Put the hub model into evaluation mode before encoding text with it.
roberta.eval()

def _rob_encode(df, col):
    """Store ``roberta.encode`` of every ``df[col]`` value in ``<col>_tokens``.

    The new column is added to ``df`` itself, which is also returned.
    """
    encoded = df[col].apply(roberta.encode)
    df[col + '_tokens'] = encoded
    return df

def roberta_encode(df):
    """Encode the ``title`` and ``selftext`` columns via ``_rob_encode``.

    Returns the frame with ``title_tokens`` and ``selftext_tokens`` added.
    """
    for text_col in ('title', 'selftext'):
        df = _rob_encode(df, text_col)
    return df

# Add the fairseq-RoBERTa token columns to both training frames, then preview
# the dadjokes one.
train_dj = roberta_encode(train_dj)
train_jokes = roberta_encode(train_jokes)
train_dj.head()

In [None]:
# Load the pretrained 'roberta-large' tokenizer and sequence-classification model
# (both names come from the `from pytorch_transformers import *` star import).
# NOTE(review): no label count is passed here, while cat_num produces six
# categories (0-5) — confirm the classification head size before training.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaForSequenceClassification.from_pretrained('roberta-large')

In [None]:
# Encode each formatted joke string with the tokenizer and keep the result per row.
train_dj['input_ids'] = train_dj.joke.apply(lambda x: tokenizer.encode(x))

In [None]:
# Encode the title and selftext separately, with special tokens added.
# NOTE(review): the `title_doc` / `selftext_doc` columns are not created in any
# visible cell of this notebook — confirm where they come from, otherwise this
# cell fails on a fresh kernel.
train_dj['title_encoded'] = train_dj.title_doc.apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
train_dj['selftext_encoded'] = train_dj.selftext_doc.apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [None]:
# Debug dump of a model forward pass.
# NOTE(review): none of these names is assigned in the visible notebook cells
# (presumably left over from a removed cell); unless they are defined
# elsewhere, this cell raises NameError on a fresh kernel.
print(input_ids)
print(labels)
print(outputs)
print(loss)
print(logits)

In [None]:
# Flag rows whose encoded joke has more than 512 token ids.
# NOTE(review): 512 is presumably the model's maximum sequence length — confirm.
train_dj['toolong'] = train_dj.input_ids.apply(lambda ids: len(ids)>512)

In [None]:
# Select the over-length rows, report how many there are, and display them.
long = train_dj[train_dj['toolong']]
print(len(long))
long

In [None]:
# Inspect one specific row by its index label and print its token ids.
# NOTE(review): 'cjpxky' is presumably a Reddit submission id; the lookup raises
# KeyError if that row is not in the training split.
obama = train_dj.loc['cjpxky']
print(obama.input_ids)

In [None]:
# Sentence-splitting check: run two sample strings through the spaCy pipeline
# and print every sentence it detects.
docs = nlp.pipe(["I like Mr. Water. Water is good? No? NO! Well... Whatever\nuhat's that", "this...is good"])
for doc in docs:
    for sent in doc.sents:
        print(sent)

In [None]:
# Same sample sentence through the NLTK Punkt tokenizer, for comparison with spaCy.
sent_detector.tokenize("I like Mr. Water. Water is good? No? NO! Well... Whatever\nuhat's that")


In [None]:
# NOTE(review): `onelist` is not defined in any visible cell — presumably
# leftover scratch state; this raises NameError on a fresh kernel unless it is
# defined elsewhere.
onelist