In [1]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
with open('jokes/joke-dataset/stupidstuff.json') as json_data:
    ss_df = pd.DataFrame(json.load(json_data,))

with open('jokes/joke-dataset/reddit_jokes.json') as json_data:
    reddit_df = pd.DataFrame(json.load(json_data,))

with open('jokes/joke-dataset/wocka.json') as json_data:
    wocka_df = pd.DataFrame(json.load(json_data,))

pd.set_option('display.max_colwidth', -1)
    
ss_df.drop(['category', 'id'], axis=1, inplace=True)
ss_df.rename(index=str, columns={'body': 'joke', 'rating': 'score'}, inplace=True)

reddit_df['joke'] = reddit_df['title'] + " " + reddit_df['body']
reddit_df.drop(['body', 'id', 'title'], axis=1, inplace=True)

wocka_df.drop(['category', 'id', 'title'], axis=1, inplace=True)
wocka_df.rename(index=str, columns={'body': 'joke'}, inplace=True)

In [3]:
reddit_df.loc[reddit_df['score'] == 0, ['score']] = 1
reddit_df.loc[(reddit_df['score'] > 0) & (reddit_df['score'] < 3), ['score']] = 2
reddit_df.loc[(reddit_df['score'] >= 3) & (reddit_df['score'] < 16), ['score']] = 3
reddit_df.loc[(reddit_df['score'] >= 16) & (reddit_df['score'] < 50), ['score']] = 4
reddit_df.loc[(reddit_df['score'] >= 50) & (reddit_df['score'] < 50000), ['score']] = 5

df = pd.concat([ss_df, reddit_df])
df.round({'score': 0})
df['score'] =  df['score'].apply(np.int64)
df = df.sample(frac=1).reset_index(drop=True)
#df['joke'].str.lower()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [4]:
num_words = [len(row['joke'].split()) for _, row in df.iterrows()]
print("median words: ", np.median(num_words))
print("average words: ", np.average(num_words))

median words:  18.0
average words:  47.74331151740064


In [5]:
def process_text(text):
    
    #remove punctuation
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #take out stopwords; clean up each text message
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

    return clean_words

In [None]:
df['joke'].apply(process_text)

In [20]:
joke_train, joke_test, score_train, score_test = train_test_split(df['joke'], df['score'], test_size=0.2)

In [21]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=process_text)), # converts strings to integer counts
    ('tfidf', TfidfTransformer()), # converts integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()) # train on TF-IDF vectors with Naive Bayes classifier
])

In [23]:
pipeline.fit(joke_train, score_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function process_text at 0x7fb99637eae8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=No...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [24]:
predictions = pipeline.predict(joke_test)

In [51]:
count = 0
score_test_vals = score_test.values
for i in range(len(predictions)):
    if (predictions[i] == score_test_vals[i]):
        count += 1

In [54]:
print("Amount correct: ", count)
print("Fraction: ", count/len(predictions)*100)

Amount correct:  23458
Fraction:  59.13880905561438
