In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
with open("yelp/yelp_academic_dataset_review.json") as data_file:   
    data = data_file.readlines()
data_json_str = "[" + ','.join(data) + "]"
reviews = pd.read_json(data_json_str)

In [4]:
with open("yelp/yelp_academic_dataset_business.json") as data_file:   
    data = data_file.readlines()
data_json_str = "[" + ','.join(data) + "]"

business = pd.read_json(data_json_str)
def get_rest(r):
    try:
        return "Restaurants" in r["categories"]
    except:
        return False
restaurants = business.loc[business.apply(get_rest,axis=1),:]

In [5]:
useless_cols = ["type","date","review_id","user_id"]
reviews.drop(useless_cols,axis=1,inplace=True)
useful_reviews = reviews.loc[reviews.useful>0]

In [6]:
restaurants = restaurants.loc[~pd.isnull(restaurants.business_id)]
useful_reviews = useful_reviews.loc[~pd.isnull(useful_reviews.business_id)]

In [7]:
useful_rest_reviews = pd.merge(useful_reviews,restaurants,how="inner",on = "business_id")
useful_rest_reviews = useful_rest_reviews[["cool","funny","stars_x","text"]]

In [8]:
bad_indx = useful_rest_reviews.stars_x==1
bad_indx.append(useful_rest_reviews.stars_x==2)
good_reviews = useful_rest_reviews.loc[useful_rest_reviews.stars_x==5]
bad_reviews = useful_rest_reviews.loc[bad_indx]

In [13]:
good_reviews_sample = good_reviews.sample(10000)
bad_reviews_sample = bad_reviews.sample(10000)

[u'cool', u'funny', u'stars_x', u'text']

In [15]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

def stem_words(com):
    # tokenize
    tokens = tokenizer.tokenize(com)
    
    stemmed = [p_stemmer.stem(t) for t in tokens]
    
    return ' '.join(stemmed)


stemmed_good_reviews = [stem_words(review) for review in good_reviews_sample.text]
stemmed_bad_reviews = [stem_words(review) for review in bad_reviews_sample.text]

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None, min_df=.008,max_df=.5,ngram_range=(2,2),
                             stop_words = "english") 

word_feature_list = vectorizer.fit_transform(good_reviews_sample.text)
word_features = pd.DataFrame(word_feature_list.toarray())
print(word_features.shape)

(10000, 126)


In [66]:
word_features.columns = vectorizer.get_feature_names()

In [69]:
word_counts = word_features.apply(sum,axis=0)
word_counts.head()
word_counts.columns = "Word Count"
word_counts.to_csv("good_reviews.csv",encoding="utf-8",index=True,header=True)

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None, min_df=.008,max_df=.5,ngram_range=(2,2),
                             stop_words = "english") 

word_feature_list = vectorizer.fit_transform(bad_reviews_sample.text)
word_features = pd.DataFrame(word_feature_list.toarray())
print(word_features.shape)

(10000, 161)


In [71]:
print vectorizer.get_feature_names()

[u'10 minutes', u'15 minutes', u'20 minutes', u'25 minutes', u'30 minutes', u'40 minutes', u'45 minutes', u'asked manager', u'asked wanted', u'avoid place', u'bad experience', u'bad food', u'bad service', u'better food', u'better service', u'big deal', u'came asked', u'came said', u'came table', u'chinese food', u'chips salsa', u'credit card', u'customer service', u'decided try', u'did come', u'didn bother', u'didn care', u'didn eat', u'didn know', u'didn like', u'didn want', u'dining experience', u'dining room', u'don care', u'don know', u'don like', u'don think', u'don want', u'don waste', u'drink order', u'excited try', u'extremely rude', u'far worst', u'fast food', u'feel like', u'felt like', u'finally came', u'finally got', u'food arrived', u'food bad', u'food came', u'food cold', u'food food', u'food good', u'food great', u'food horrible', u'food just', u'food mediocre', u'food ok', u'food order', u'food ordered', u'food place', u'food poisoning', u'food quality', u'food service'

In [72]:
word_features.columns = vectorizer.get_feature_names()
word_counts = word_features.apply(sum,axis=0)
word_counts.head()
word_counts.columns = "Word Count"
word_counts.to_csv("bad_reviews.csv",encoding="utf-8",index=True,header=True)