# Why do I present this project ?

There're several reasons:
1. This is a real-life problem. Data as raw as it could be
2. This will be an end-to-end project including: extract data, down size data, clean up, process language, and prediction model.

In [22]:
import os
import json
import pandas as pd
import codecs

# Extracting data

The first step of this project is to extract data from json format and export in pandas dataframe format.

In [16]:
review_only_filepath = os.path.join(data_directory,
                                    'yelp_academic_dataset_review.json')

data_directory = 'yelp_dataset_challenge_round9'
businesses_filepath = os.path.join(data_directory, 'yelp_academic_dataset_business.json')


In [17]:
restaurant_id = set()

with open(businesses_filepath) as f:
    for line in f:
        business = json.loads(line)
        #there's case the business is None
        if business['categories'] is not None and 'Restaurants' in business['categories']:
            restaurant_id.add(business['business_id'])
            

print '{}'.format(len(restaurant_id)), " total restaurants in the dataset."

48485  total restaurants in the dataset.


In [18]:
#getting into the habits that export file every modification make alonng the way
intermediate_directory = 'intermediate'
restaurant_review_text_filepath = os.path.join(intermediate_directory,'restaurant_review_name.txt')

In [31]:
%%time

if True:
    with codecs.open(restaurant_review_text_filepath,'w', encoding = 'utf_8') as res_review:
        with codecs.open(review_only_filepath,encoding = 'utf_8') as review_only_file:
            for line in review_only_file:
                review = json.loads(line)
                if review['business_id'] not in restaurant_id:
                    continue
                text = review['text'].encode('utf-8')
                res_review.write(str(review['business_id'])+ '\t' +review['text'].replace('\n', '\\n')+ 
                                 '\t'+ str(review['stars']) + '\n')
    print "Done grouping text"
else: 
    with open(restaurant_review_text_filepath) as review_txt_file:
        #means that there's a file already existed. 
        pass
        
    print "Already created"

Done grouping text
CPU times: user 1min 48s, sys: 4.46 s, total: 1min 52s
Wall time: 2min 4s


In [32]:
df = pd.read_csv(restaurant_review_text_filepath, sep = '\t',names = ['business', 'reviews', 'rating'])
df.head()

Unnamed: 0,business,reviews,rating
0,4P-vTvE6cncJyUyLh73pxw,This place is a area staple! Been around for y...,4.0
1,4P-vTvE6cncJyUyLh73pxw,Got my mojo back after having a few of their a...,4.0
2,4P-vTvE6cncJyUyLh73pxw,"Don't go here for the decor, but the staff is ...",4.0
3,4P-vTvE6cncJyUyLh73pxw,I believe in awarding stars bearing in mind th...,5.0
4,4P-vTvE6cncJyUyLh73pxw,"If you like fried food and laid back, then thi...",4.0


In [33]:
print "Number of row in data frame {}".format(df.shape[0])

Number of row in data frame 2578790


Because there're 2 million rows in the data set. It's too big for my computer, so I only used 10% of data to demonstrate the project. 

In [34]:
#while sampling the data set, it also randomized the dataframe. 
temp = df.sample(frac = 0.1)
temp.head()

Unnamed: 0,business,reviews,rating
2287541,Gaq3S9lmjXVcuDCZ8ulppw,I wasn't really impressed with this sushi join...,3.0
2087268,ZX9eujPNUxqWEWYdr4Ulqg,Wow! What a treat. Started off with a cup of t...,5.0
1369363,SZEFE5hL7aN5nM-A44iPwQ,Gorgeous and romantic interiors. Spectacular ...,5.0
1545381,7KkgMcbVaetryW1wwpzvvA,I had the Tuesday special - coconut shrimp ta...,3.0
1112117,Xra1TtWtf069Am5hHWs3Ug,Seriously good Italian food! Though the dining...,5.0


In [35]:
#export subset of the data
export_subset = os.path.join(intermediate_directory,'restaurant_review_name_subset.txt')
temp.to_csv(export_subset, header= ['business', 'reviews', 'rating'], index= False)

 This will end script #1, create another script to load the data and start processing

# Data pre-processing

In [36]:
df = pd.read_csv(export_subset)
df.head()

Unnamed: 0,business,reviews,rating
0,Gaq3S9lmjXVcuDCZ8ulppw,I wasn't really impressed with this sushi join...,3.0
1,ZX9eujPNUxqWEWYdr4Ulqg,Wow! What a treat. Started off with a cup of t...,5.0
2,SZEFE5hL7aN5nM-A44iPwQ,Gorgeous and romantic interiors. Spectacular ...,5.0
3,7KkgMcbVaetryW1wwpzvvA,I had the Tuesday special - coconut shrimp ta...,3.0
4,Xra1TtWtf069Am5hHWs3Ug,Seriously good Italian food! Though the dining...,5.0


Because reviews will have numbers, articles, emoji, and punctuation which have no value for the task, so we remove it and lowercase for all of the words

In [38]:
from string import ascii_lowercase
from nltk.corpus import stopwords
stop = stopwords.words('english')
#created a bigger stopwords library. 
stop.extend(['may','also','zero','one','two','three','four','five','six','seven','eight',
             'nine','ten','across','among','beside','however','yet','within']+list(ascii_lowercase))
stop = set(sorted(stop))

In [47]:
def clean(x):
    '''this function will remove all punctuations and numbers'''
    #import regular expression library to do string manipulations
    import re
    temp = str(x)
    temp = re.sub('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',' ',temp)
    temp = temp.lower().split(" ")
    return ' '.join([ i for i in temp if i not in stop])


In [48]:
#create another colummn for cleaned text
df['cleaned_text_data'] = df['reviews'].apply(clean)
df.head()

Unnamed: 0,business,reviews,rating,cleaned_text_data
0,Gaq3S9lmjXVcuDCZ8ulppw,I wasn't really impressed with this sushi join...,3.0,really impressed sushi joint typical ayce for...
1,ZX9eujPNUxqWEWYdr4Ulqg,Wow! What a treat. Started off with a cup of t...,5.0,wow treat started cup best coffee ever ask ...
2,SZEFE5hL7aN5nM-A44iPwQ,Gorgeous and romantic interiors. Spectacular ...,5.0,gorgeous romantic interiors spectacular food...
3,7KkgMcbVaetryW1wwpzvvA,I had the Tuesday special - coconut shrimp ta...,3.0,tuesday special coconut shrimp tacos rice b...
4,Xra1TtWtf069Am5hHWs3Ug,Seriously good Italian food! Though the dining...,5.0,seriously good italian food though dining roo...


In [50]:
#convert rating into a number
df['rating'] = df['rating'].apply(lambda x: float(x))

#sentiment for each review
df['sentiment'] = df['rating'].apply(lambda x: 'positive' if x >3 else 'negative')

df.head()

Unnamed: 0,business,reviews,rating,cleaned_text_data,sentiment
0,Gaq3S9lmjXVcuDCZ8ulppw,I wasn't really impressed with this sushi join...,3.0,really impressed sushi joint typical ayce for...,negative
1,ZX9eujPNUxqWEWYdr4Ulqg,Wow! What a treat. Started off with a cup of t...,5.0,wow treat started cup best coffee ever ask ...,positive
2,SZEFE5hL7aN5nM-A44iPwQ,Gorgeous and romantic interiors. Spectacular ...,5.0,gorgeous romantic interiors spectacular food...,positive
3,7KkgMcbVaetryW1wwpzvvA,I had the Tuesday special - coconut shrimp ta...,3.0,tuesday special coconut shrimp tacos rice b...,negative
4,Xra1TtWtf069Am5hHWs3Ug,Seriously good Italian food! Though the dining...,5.0,seriously good italian food though dining roo...,positive


In [69]:
#seperate positive and negative 
positive_text = df[df['sentiment'] ==  'positive']
negative_text = df[df['sentiment'] ==  'negative']

#keep positive : negative ration to be 1:1
pos_negative_ratio = len(negative_text)/float(len(positive_text))
temp = positive_text.sample(frac = pos_negative_ratio)
small_df = pd.concat([negative_text,temp])

#re-randomize data after concatenation. 
for _ in range(100):
    small_df = small_df.sample(frac = 1)
print "Checking numbers of positive and negative in dataframe"
print small_df.groupby('sentiment').count()

#split data into test set and train set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(small_df['cleaned_text_data'],small_df['sentiment'], test_size =0.3)

Checking numbers of positive and negative in dataframe
           business  reviews  rating  cleaned_text_data
sentiment                                              
negative      90589    90463   90403              90589
positive      90589    90589   90589              90589


# Choose machine learning model

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [76]:
%%time
text_clf = Pipeline([('vect', CountVectorizer(decode_error = 'ignore')),
                     ('clf', LogisticRegression(n_jobs = 7))
                    ])
text_clf.fit(x_train,y_train)
predicted = text_clf.predict(x_test)

print metrics.classification_report(y_test,predicted)


             precision    recall  f1-score   support

   negative       0.87      0.85      0.86     27158
   positive       0.85      0.87      0.86     27196

avg / total       0.86      0.86      0.86     54354

CPU times: user 2min 21s, sys: 2.67 s, total: 2min 24s
Wall time: 44.5 s


# Conclusion

The model only achieve 80% precision. It's better than randomly guessing. Need to refine hyperparameters and parameters. Need to run on the entire dataset. 