In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import string
import unicodedata

import nltk

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from nltk import pos_tag
from nltk import RegexpParser

from sklearn.preprocessing import StandardScaler

In [6]:
! ls

[34mdataset-examples[m[m      spark capstone2.ipynb [34myelp_dataset[m[m
sarc_09-12.csv        viewing data.ipynb


## Load and convert data ##

In [7]:
# read in Yelp Reviews
  
import json

data = [json.loads(line) for line in open('yelp_dataset/yelp_academic_dataset_review.json', 'r')]

In [91]:
reviews_df = pd.DataFrame(data)

In [92]:
reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4.0,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5.0,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2.0,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4.0,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01


In [93]:
len(reviews_df)

8635403

In [95]:
# get 200k toy dataframe of balanced funny/not funny reviews

len(reviews_df[reviews_df.funny >= 3])

340864

In [97]:
funny_df = reviews_df[reviews_df.funny >= 3].sample(n=100000, random_state=1)
not_funny_df = reviews_df[reviews_df.funny == 0].sample(n=100000, random_state=1)

In [98]:
# save toy datasets to csv

funny_df.to_csv('funny.csv')
not_funny_df.to_csv('notfunny.csv')

In [99]:
# read in and concatenate

funny_df = pd.read_csv('funny.csv')
not_funny_df = pd.read_csv('notfunny.csv')

df = pd.concat([funny_df, not_funny_df]).drop_duplicates('review_id')
df.shape

(200000, 10)

In [100]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [101]:
df.reset_index(drop=True)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,DEECPhn5tNBKbuaOeIcd2w,CwBh8Fmr07jFYBKZ2p6Mqg,iNsMwBxFXcP7QvB3994mgA,5.0,3,4,4,A scavenger hunt was hosted by the Paramount r...,2009-07-31 02:33:55
1,Ja7ZM1i_gYFXmxrkJPEPlA,ncIHRw4Vx3Wh9mZhLZxB7g,dgprzdonZcF7PPt-hkUkag,5.0,15,3,6,The Pretty Kitty has 20 locations throughout t...,2015-08-04 00:01:50
2,9E8ZAppLWtL0oJYRbFmOFQ,mdP18-vCtV9nu8DT2Bk6lA,gYuTx4cB_5UutuWIGusmUQ,5.0,14,6,9,After a very productive business meeting this ...,2018-10-15 13:49:54
3,MHBTWs3E8bKR80SOpcelWw,fV4Y0rKuJaahlNfjCH3vfw,ZM3Y7pPuKu67Q_FOLqGx7A,4.0,1,4,0,I called in advance to be sure that they had t...,2012-08-25 01:39:58
4,EcvcH3hHuzlXqG79nIag0w,ys3M-lvO7wCYkhCbqSVCfA,p8ohzzGvGRCHnJKnyO7exA,4.0,0,4,0,Eastern Standard reminds me of that scene in S...,2014-11-08 23:22:35
...,...,...,...,...,...,...,...,...,...
199995,IT0R9rk2Z2CzV0bFQw8Owg,eVILQOWgDcJ-Fj-YbGKUnQ,YQGPI7PvX01FUgjlLbpMuw,5.0,0,0,0,"This restaurant is AMAZING! The food, the serv...",2021-01-19 01:33:29
199996,7LN9HbMsl50uB3idG0cq2A,8O6M8da54DRiXczteMNkUw,f08CbQMvYX2TMmYND7rMHA,5.0,0,0,0,Look. If you want a place that is decent with ...,2020-05-31 01:51:47
199997,nSBxAx1l6oCbjAR8dVnV7A,FaDFbRDYL3f4vLc7ALhX-g,drmZouioJ13G_vGcpdmOwg,4.0,0,0,0,Newsflash: A nearly once-in-a-lifetime event i...,2017-07-01 22:15:27
199998,lJwInWxgS8fcN61v06yjdg,00E-ktOYmjPdE8owupjC8A,4U2bhcleBtWAWNNt9jhjbA,5.0,1,0,1,Nick Dew deserves 5 stars! One of the best wai...,2019-01-05 00:56:06


In [69]:
## another way without using dataframe
## filtering for reviews where at least 3 funny votes

total = 0
num_funny = 0
funny_threshold = 3

with open('yelp_dataset/yelp_academic_dataset_review.json') as f:
    for line in f:
        total += 1
        reviews = json.loads(line)
        if total == 1:
            # Let's print the very first review, just to see what it looks like
            print(reviews)
        if reviews['funny'] >= funny_threshold:
            num_funny += 1
            
print('\nTotal reviews: %d\nFunny reviews: %d\nNormal reviews: %d' % (total, num_funny, total - num_funny))

{'review_id': 'lWC-xP3rd6obsecCYsGZRg', 'user_id': 'ak0TdVmGKo4pwqdJSTLwWw', 'business_id': 'buF9druCkbuXLX526sGELQ', 'stars': 4.0, 'useful': 3, 'funny': 1, 'cool': 1, 'text': "Apparently Prides Osteria had a rough summer as evidenced by the almost empty dining room at 6:30 on a Friday night. However new blood in the kitchen seems to have revitalized the food from other customers recent visits. Waitstaff was warm but unobtrusive. By 8 pm or so when we left the bar was full and the dining room was much more lively than it had been. Perhaps Beverly residents prefer a later seating. \n\nAfter reading the mixed reviews of late I was a little tentative over our choice but luckily there was nothing to worry about in the food department. We started with the fried dough, burrata and prosciutto which were all lovely. Then although they don't offer half portions of pasta we each ordered the entree size and split them. We chose the tagliatelle bolognese and a four cheese filled pasta in a creamy 

In [81]:
## change count to more if time

funny = []
not_funny = []

count = 100000
funny_threshold = 3
raw_data_path = 'yelp_dataset/yelp_academic_dataset_review.json'

with open(raw_data_path) as file:
    for line in file:
        # Break when we've created a balanced dataset with count reviews in each category
        if len(funny) == count and len(not_funny) == count:
            break
                    
        sample = {}
        reviews = json.loads(line)
        sample['text'] = reviews['text']
                    
        if reviews['funny'] >= funny_threshold and len(funny) < count:
            sample['label'] = 1
            funny.append(sample)
        elif reviews['funny'] < funny_threshold and len(not_funny) < count:
            sample['label'] = 0
            not_funny.append(sample)
                    

In [82]:
reviews = funny + not_funny

In [89]:
type(reviews)

list

## Text Processing Pipeline ##

In [61]:
# Defining Humor
# First pass we define a review as funny if it has at least 1 funny vote. May want to revisit this to see
# if reviews are actually funny or not

from nltk.tokenize import word_tokenize
import spacy


In [63]:
# new df

new_df = df[['text', 'funny']]
new_df['funny or not'] = (df.funny != 0).astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['funny or not'] = (df.funny != 0).astype(int)


In [64]:
new_df = new_df.drop(columns='funny')

In [None]:
# Tokenization (don't filter stopwords or punctuation as these may contribute to humor)

# remove accents and lowercase



tokens_lower = [[word.lower() for word in sent]
                 for sent in tokens]

In [None]:
# lemmatization



In [None]:
# normalize data


In [None]:
# bag of words, max features 500



In [None]:
# count vectorization, TF-IDF



In [None]:
# latent semantic analysis


## Exploratory Data Analysis ##

## Modeling ##

In [None]:
# Recurrent Neural Network

