In [305]:
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet
import enchant
from nltk.metrics import edit_distance

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import LinearSVC

# Loading data, JSON to CVS

In [306]:
REVIEWS = 100000
data_list = list()
columns = ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']

with open('yelp_training_set_review.json') as reviews:
    import json
    for i, line in enumerate(reviews):

        if i == REVIEWS:
            break
        data = json.loads(line)
        data_list.append([data['review_id'],
                          data['user_id'],
                          data['business_id'],
                          data['stars'],
                          data['votes']['useful'],
                          data['votes']['funny'],
                          data['votes']['cool'],
                          data['text'],
                          data['date']])

reviews.close()

del reviews, i, line, data, REVIEWS
df = pd.DataFrame(data_list, columns=columns)

In [357]:
df.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,review_year,sentiment,cleaned_text
0,fWKvX83p0-ka4JS3dc6E5A,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5,5,0,2,My wife tok me here on my birthday for breakfa...,2011-01-26,2011,1,"[M, , , w, , f, e, , , , k, , , e, , h, e, r, ..."
1,IjZ33sJrzXqU-0X6U8NwyA,0a2KyEL0d3Yb1V6aivbIuQ,ZRJwVLyzEJq1VAihDhYiow,5,0,0,0,I have no idea why some people give bad review...,2011-07-27,2011,1,"[I, , h, , v, e, , n, , , , , e, , , w, h, , ,..."
2,IESLBzqUCLdSzSqm0eCSxQ,0hT2KtfLiobPvh6cDC8JQg,6oRAC4uyJCsJl1X0WZpVSA,4,1,0,0,love the gyro plate. Rice is so god and I also...,2012-06-14,2012,1,"[l, , v, e, , , h, e, , g, , r, , , p, l, , , ..."


In [308]:
df.shape

(100000, 9)

# Cleaning data

## Example

In [309]:
df.iloc[20].text

"DVAP....\n\nYou have to go at least once in your life. It really is a neat place with alot of history. \n\nThe service is great, it appears to be family run. \n\nThe food is good. Better then Dennys but not as good as Mimi's. \n\nI had the all u can eat of beef ribs, lasagna, meat loaf, cat fish, chicken, mashed and diced potatoes, stuffing, rice, homemade apple pie, etc and salad bar. I know I am missing a bunch of stuff they had but you get the drift. \n\nThey run specials on Prime rib and stuff so you might want to call to see what they are serving the night you go."

## Delete '\r\n', '\n\n', '\n', '\r'

In [310]:
# (Perkins) Python 3 Text Processing with NLTK 3 Cookbook
class Newline_Replacer(object):
    def replace(self, text):
        s = text
        s = s.replace('\r\n', ' ')
        s = s.replace('\n\n', ' ') 
        s = s.replace('\n', ' ')
        s = s.replace('\r', ' ') 
        return s

## Delete Extra spaces 

In [311]:
# (Perkins) Python 3 Text Processing with NLTK 3 Cookbook
class Extra_Spaces_Replacer(object):
    def replace(self, text):
        s = text
        s = re.sub('\s\s+', ' ', s)
        return s

## Delete Word reduction

In [312]:
#review with reduction
reduction_review = df['text'][16]
reduction_review

'We went here on a Saturday afternoon and this place was incredibly empty.  They had brunch specials going on, including $2 bloody mary\'s and mimosas, but we were more in the mood for lunch.  Except for the bloody mary, I had to try one.  It came out in a high-ball-sized glass.  Boo!  But it was really tasty. Yay!  The hubby remembered a sign outside the restaurant a few weeks back that said they had Arrogant Bastard, and he got a 22 oz bottle for $4.75.  Hey, that\'s not fair!!\n\nNext up: the wings.  We were a bit hesitant to order them when the waitress informed us that they are "seasoned" but not sauced, so they can\'t be ordered hot.  We did ask for them crispy though, and the waitress even asked the cooks to throw them back in for a few minutes when they came out not visibly crispy.  These non-traditional wings were actually pretty damn good.  The seasoning was a little spicy and salty with just a hint of sweet.  If I were in the mood for the tang and kick of Frank\'s Hot Sauce,

In [313]:
# (Perkins) Python 3 Text Processing with NLTK 3 Cookbook
rep_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'can not'),
    (r'i\'m', 'i am'),
    (r'I\'m', 'I am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would'),
]

In [314]:
# (Perkins) Python 3 Text Processing with NLTK 3 Cookbook
class Word_Reduction_Replacer(object):
    def __init__(self, patterns=rep_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    
    def replace(self, text):
        s = text

        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)

        return s

## Delete repeating letters

In [315]:
amazing_reviews = df[df.text.str.find('AAAMMMAZZING') != -1]
amazing_reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
1443,Vnr2wuBXEjbLtfQT_XuDSQ,UrXg7zOknA7WAHD60JnK9g,YCCDMLcb7UW8G-o_HsWiiA,5,0,0,0,Awesome food anyone??? Go check out Modern Ste...,2011-04-03


In [316]:
a_review = amazing_reviews['text'][1443]
a_review

'Awesome food anyone??? Go check out Modern Steak in Scottsdale, AZ.\nGorgeous dining room! Excellent service (with our server Gabe)! And the food was AAAMMMAZZING!\n\nIt\'s located at Fashion Mall, but it\'s NOT a "mall" restaurant. A MUST go!'

In [317]:
# (Perkins) Python 3 Text Processing with NLTK 3 Cookbook 
class Repeat_Replacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if wordnet.synsets(word):
            return word
        
        repl_word = self.repeat_regexp.sub(self.repl, word)

        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word

## Spelling Correction

In [318]:
# (Perkins) Python 3 Text Processing with NLTK 3 Cookbook

class Spelling_Replacer(object):
    def __init__(self, dict_name='en', max_dist=1):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist
    
    def replace(self, word):
        if self.spell_dict.check(word):
            return word
    
        suggestions = self.spell_dict.suggest(word)
    
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        else:
            return word

In [319]:
newline_replacer = Newline_Replacer()
extra_spaces_replacer = Extra_Spaces_Replacer()
word_reduction_replacer = Word_Reduction_Replacer()
repeat_replacer = Repeat_Replacer()
spell_replacer = Spelling_Replacer()

## Apply cleaning functions

In [320]:
df['text'] = df['text'].apply(newline_replacer.replace)

In [321]:
df['text'] = df['text'].apply(extra_spaces_replacer.replace)

In [322]:
df['text'] = df['text'].apply(word_reduction_replacer.replace)

In [323]:
df['text'] = df['text'].apply(repeat_replacer.replace)

In [325]:
#df['text'] = df['text'].apply(spell_replacer.replace)

In [326]:
df.head(4)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,fWKvX83p0-ka4JS3dc6E5A,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5,5,0,2,My wife tok me here on my birthday for breakfa...,2011-01-26
1,IjZ33sJrzXqU-0X6U8NwyA,0a2KyEL0d3Yb1V6aivbIuQ,ZRJwVLyzEJq1VAihDhYiow,5,0,0,0,I have no idea why some people give bad review...,2011-07-27
2,IESLBzqUCLdSzSqm0eCSxQ,0hT2KtfLiobPvh6cDC8JQg,6oRAC4uyJCsJl1X0WZpVSA,4,1,0,0,love the gyro plate. Rice is so god and I also...,2012-06-14
3,G-WvGaISbqqaMHlNnByodA,uZetl9T0NcROGOyFfughhg,_1QQZuf4zZOyFCvXc0o6Vg,5,2,0,1,"Rosie, Dakota, and I LOVE Chaparal Dog Park!!!...",2010-05-27


In [327]:
df['review_year'] = df['date'].apply(lambda x:int(x[:4]))

In [328]:
df.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,review_year
0,fWKvX83p0-ka4JS3dc6E5A,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5,5,0,2,My wife tok me here on my birthday for breakfa...,2011-01-26,2011
1,IjZ33sJrzXqU-0X6U8NwyA,0a2KyEL0d3Yb1V6aivbIuQ,ZRJwVLyzEJq1VAihDhYiow,5,0,0,0,I have no idea why some people give bad review...,2011-07-27,2011
2,IESLBzqUCLdSzSqm0eCSxQ,0hT2KtfLiobPvh6cDC8JQg,6oRAC4uyJCsJl1X0WZpVSA,4,1,0,0,love the gyro plate. Rice is so god and I also...,2012-06-14,2012


## Destribution of reviews by year

In [329]:
df.groupby(['review_year'])['review_id'].size().reset_index()

Unnamed: 0,review_year,review_id
0,2005,47
1,2006,577
2,2007,2878
3,2008,7421
4,2009,11634
5,2010,18378
6,2011,27674
7,2012,30767
8,2013,624


## Distribution of reviews by star

In [330]:
df.groupby(['stars'])['review_id'].size().reset_index()


Unnamed: 0,stars,review_id
0,1,7638
1,2,9078
2,3,15311
3,4,34643
4,5,33330


## Sentiment flag

If stars >= 4, sentiment = 1
If starts < 4, sentiment = 0

In [331]:
df['sentiment'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0)

In [332]:
df.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,review_year,sentiment
0,fWKvX83p0-ka4JS3dc6E5A,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5,5,0,2,My wife tok me here on my birthday for breakfa...,2011-01-26,2011,1
1,IjZ33sJrzXqU-0X6U8NwyA,0a2KyEL0d3Yb1V6aivbIuQ,ZRJwVLyzEJq1VAihDhYiow,5,0,0,0,I have no idea why some people give bad review...,2011-07-27,2011,1
2,IESLBzqUCLdSzSqm0eCSxQ,0hT2KtfLiobPvh6cDC8JQg,6oRAC4uyJCsJl1X0WZpVSA,4,1,0,0,love the gyro plate. Rice is so god and I also...,2012-06-14,2012,1


In [333]:
df = df.drop([])

## Removing stop words

In [338]:
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

df['cleaned_text'] = df['text'].apply(lambda x: remove_stop_words(x))

## Lemmatization

In [341]:
nltk.download('wordnet')

def get_lemmatized_text(corpus):
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

df['cleaned_text'] = df['cleaned_text'].apply(lambda x: get_lemmatized_text(x))

[nltk_data] Downloading package wordnet to /home/ilya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [342]:
df.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,review_year,sentiment,cleaned_text
0,fWKvX83p0-ka4JS3dc6E5A,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5,5,0,2,My wife tok me here on my birthday for breakfa...,2011-01-26,2011,1,"[M, , , w, , f, e, , , , k, , , e, , h, e, r, ..."
1,IjZ33sJrzXqU-0X6U8NwyA,0a2KyEL0d3Yb1V6aivbIuQ,ZRJwVLyzEJq1VAihDhYiow,5,0,0,0,I have no idea why some people give bad review...,2011-07-27,2011,1,"[I, , h, , v, e, , n, , , , , e, , , w, h, , ,..."
2,IESLBzqUCLdSzSqm0eCSxQ,0hT2KtfLiobPvh6cDC8JQg,6oRAC4uyJCsJl1X0WZpVSA,4,1,0,0,love the gyro plate. Rice is so god and I also...,2012-06-14,2012,1,"[l, , v, e, , , h, e, , g, , r, , , p, l, , , ..."


## Create Train/Validation/Test Split

In [343]:
X_train_set, X_nottrain, y_train, y_nottrain = train_test_split(df['text'], 
                                                    df['sentiment'], 
                                                    test_size=0.30, 
                                                    random_state=42)

X_test_set, X_val_set, y_test, y_val = train_test_split(X_nottrain, 
                                                    y_nottrain, 
                                                    test_size=0.5, 
                                                    random_state=42)

## CountVectorizer

In [347]:
cv = CountVectorizer(ngram_range=(1,2))
cv.fit(X_train_set)
X_train = cv.transform(X_train_set)
X_test = cv.transform(X_test_set)
X_val = cv.transform(X_val_set)

In [348]:
X_train.shape

(70000, 1527271)

In [358]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(solver = 'lbfgs', 
                            penalty = 'l2', 
                            C=c,
                           max_iter= 500)
    lr.fit(X_train, y_train)
    print ("For C = %s, Accuracy: %s, F1-score: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val)), f1_score(y_val, lr.predict(X_val))))

For C = 0.01, Accuracy: 0.8686666666666667, F1-score: 0.9072766638426057
For C = 0.05, Accuracy: 0.8742666666666666, F1-score: 0.9102161287251261




For C = 0.25, Accuracy: 0.8724, F1-score: 0.9081573896353168




For C = 0.5, Accuracy: 0.8712, F1-score: 0.9070707070707071
For C = 1, Accuracy: 0.8709333333333333, F1-score: 0.9067796610169492




In [354]:
final_model = LogisticRegression(solver = 'lbfgs', 
                            penalty = 'l2', 
                            C=0.05,
                           max_iter= 500)
final_model.fit(X_train, y_train)
print ("Final Accuracy: %s, F1-score: %s" 
           % (accuracy_score(y_test, final_model.predict(X_test)), f1_score(y_test, final_model.predict(X_test))))

Final Accuracy: 0.8708666666666667, F1-score: 0.9080639802553515




In [351]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_model.coef_[0]
    )
}
print("Best Positive")
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:15]:
    print (best_positive)

print("Best Negative")

for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:15]:
    print (best_negative)

Best Positive
('amazing', 1.298368034218033)
('awesome', 1.1993725960498127)
('be disapointed', 1.1848377465096565)
('excelent', 1.0995227169481077)
('fantastic', 1.0952414410875522)
('best', 1.0166890693436077)
('delicious', 1.0104628696811564)
('love this', 0.9940241156674842)
('wonderful', 0.9925348688857923)
('incredible', 0.9528211393786559)
('five stars', 0.9375527983160354)
('not disapointed', 0.9200679316137024)
('outstanding', 0.8971206036022086)
('so god', 0.8805296311496796)
('not only', 0.8644627862287049)
Best Negative
('worst', -1.6254305221227119)
('mediocre', -1.5615553237484752)
('not worth', -1.4783554211679928)
('meh', -1.3696830951518075)
('thre stars', -1.3242540881111635)
('overpriced', -1.3237119176294783)
('not great', -1.2687255471737127)
('terible', -1.256673888292559)
('awful', -1.2124375224771267)
('bland', -1.2089216347185894)
('rude', -1.1849028562582733)
('horible', -1.1711069667040277)
('sucks', -1.1594219862716861)
('at best', -1.1485669128888518)
('two

## Baseline SVM

In [352]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c, max_iter=500)
    svm.fit(X_train, y_train)
    print ("For C = %s, Accuracy: %s, F1-score: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val)), f1_score(y_val, svm.predict(X_val))))

For C = 0.01, Accuracy: 0.8718666666666667, F1-score: 0.9081437583636016
For C = 0.05, Accuracy: 0.8650666666666667, F1-score: 0.9023354564755839




For C = 0.25, Accuracy: 0.8608666666666667, F1-score: 0.898792493089569
For C = 0.5, Accuracy: 0.8574, F1-score: 0.8960893854748605
For C = 1, Accuracy: 0.8540666666666666, F1-score: 0.8936294280577286


In [353]:
final_model = LinearSVC(C=0.01, max_iter=500)

final_model.fit(X_train, y_train)
print ("Final Accuracy: %s, F1-score: %s" 
           % (accuracy_score(y_test, final_model.predict(X_test)), f1_score(y_test, final_model.predict(X_test))))

Final Accuracy: 0.8678, F1-score: 0.9055669317586552
