In [74]:
import pandas as pd
import numpy as np
import json 
import csv

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

from nltk.corpus import stopwords
stop = stopwords.words('english')
from textblob import Word, TextBlob
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from sklearn.decomposition import PCA
from tqdm import tqdm

%matplotlib inline 

In [75]:
raw_data = pd.read_csv("data.csv")

In [76]:
raw_data.head(5)

Unnamed: 0,business_id,cool_x,date,funny_x,review_id,stars,text,useful_x,user_id,average_stars,...,compliment_writer,cool_y,elite,fans,friends,funny_y,name,review_count,useful_y,yelping_since
0,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg,3.97,...,2,67,2009.0,3,"6IAFtw20e4y99D2cA3jj9g, 33O8Q0BdTl-PXh2lKceYgg...",36,Jackie,68,178,2008-08-26 20:56:20
1,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw,4.27,...,0,1,,1,"9YRFLVFVbHGAJ1weFx5zqw, hUgdQsMQPoNc3fa9VoMqWQ",1,Wilhelmina,10,13,2012-10-30 00:56:00
2,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,yi0R0Ugj_xUx_Nek0-_Qig,5.0,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg,4.0,...,0,0,,0,"zWEBQSaDqrB9l8AWGy06DQ, VDFV_x-IyCNkgtr2EB-VLw...",1,Simon,4,2,2016-09-26 21:56:30
3,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,11a8sVPMUFtaC7_ABRkmtw,1.0,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ,2.0,...,0,0,,0,"jrw1jRpoRgK6ZQ4Z5_hPng, GJgchmwqQghjuG14uWgIOA...",0,Reilly,2,7,2016-12-06 22:12:48
4,eU_713ec6fTGNO4BegRaww,0,2013-01-20 13:25:59,0,fdiNeiN_hoCxCMy2wTRW9g,4.0,I'll be the first to admit that I was not exci...,0,w31MKYsNFMrjhWxxAb5wIw,4.0,...,0,0,,0,,0,A,1,0,2013-01-20 13:25:51


In [77]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399999 entries, 0 to 399998
Data columns (total 30 columns):
business_id           399999 non-null object
cool_x                399999 non-null int64
date                  399999 non-null object
funny_x               399999 non-null int64
review_id             399999 non-null object
stars                 399999 non-null float64
text                  399999 non-null object
useful_x              399999 non-null int64
user_id               399999 non-null object
average_stars         399999 non-null float64
compliment_cool       399999 non-null int64
compliment_cute       399999 non-null int64
compliment_funny      399999 non-null int64
compliment_hot        399999 non-null int64
compliment_list       399999 non-null int64
compliment_more       399999 non-null int64
compliment_note       399999 non-null int64
compliment_photos     399999 non-null int64
compliment_plain      399999 non-null int64
compliment_profile    399999 non-null int64


pandas.core.series.Series

In [78]:
#metrics to explain the reviews
def review_metrics(df):
    #create count of number of words
    df["word_count"]=df["text"].apply(lambda x: len(str(x).split(" ")))
   
    #create count for number of characters
    df["char_count"]=df["text"].str.len()
    
    #average word length
    def avg_word(sentence):
        words = sentence.split()
        return (sum(len(word) for word in words)/len(words))
    
    df["avg_word_len"]=df["text"].apply(lambda x: avg_word(x))
    
    #number of stop words
    df["num_stopwords"]=df["text"].apply(lambda x:len([x for x in x.split() if x in stop]))
    
    #percentage of non_stop words
    df["non_stop_percent"]= round(1-(df["num_stopwords"]/df["word_count"]),3)
    
    #TextBlob polarity value
    df["polarity"]=df.apply(lambda row: TextBlob(row.text).sentiment.polarity, axis=1)
    
    #TextBlob subjectivity value
    df["subjectivity"]=df.apply(lambda row: TextBlob(row.text).sentiment.subjectivity, axis=1)
    
    return df

In [79]:
review_metrics(raw_data)

Unnamed: 0,business_id,cool_x,date,funny_x,review_id,stars,text,useful_x,user_id,average_stars,...,review_count,useful_y,yelping_since,word_count,char_count,avg_word_len,num_stopwords,non_stop_percent,polarity,subjectivity
0,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg,3.97,...,68,178,2008-08-26 20:56:20,288,1561,4.627737,101,0.649,0.315810,0.536668
1,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw,4.27,...,10,13,2012-10-30 00:56:00,113,615,4.614679,52,0.540,0.429125,0.584000
2,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,yi0R0Ugj_xUx_Nek0-_Qig,5.0,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg,4.00,...,4,2,2016-09-26 21:56:30,65,407,4.955882,23,0.646,0.832500,0.885000
3,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,11a8sVPMUFtaC7_ABRkmtw,1.0,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ,2.00,...,2,7,2016-12-06 22:12:48,673,3509,4.215453,308,0.542,0.049277,0.432407
4,eU_713ec6fTGNO4BegRaww,0,2013-01-20 13:25:59,0,fdiNeiN_hoCxCMy2wTRW9g,4.0,I'll be the first to admit that I was not exci...,0,w31MKYsNFMrjhWxxAb5wIw,4.00,...,1,0,2013-01-20 13:25:51,295,1547,4.213559,123,0.583,0.238157,0.619437
5,3fw2X5bZYeW9xCz_zGhOHg,5,2016-05-07 01:21:02,4,G7XHMxG0bx9oBJNECG4IFg,3.0,Tracy dessert had a big name in Hong Kong and ...,5,jlu4CztcSxrKx56ba1a5AQ,3.75,...,600,4346,2015-06-18 22:57:20,243,1274,4.196721,99,0.593,0.148851,0.545361
6,uF86ZhygpBEGr3CudNemYA,10,2015-11-06 13:39:39,4,PEkYyTrqRjXN5mOFBn5fzw,4.0,O'Noir was an awesome dining experience. I hav...,11,jlu4CztcSxrKx56ba1a5AQ,3.75,...,600,4346,2015-06-18 22:57:20,307,1688,4.391026,127,0.586,0.095464,0.579773
7,YXjD1_p0mvZkdI8S_RKdnA,10,2018-03-16 13:46:13,5,-li7zeP2cg4pocxH4iNTug,5.0,I grabbed a quick poke bowl today before my bu...,9,jlu4CztcSxrKx56ba1a5AQ,3.75,...,600,4346,2015-06-18 22:57:20,264,1346,4.064151,100,0.621,0.191616,0.563775
8,0MCXyrNQGaqdYBcAwh3anQ,8,2018-02-03 03:17:00,5,aujNf9GfrMExBNKDHDDbQg,4.0,"For the price of the sushi, it would be four s...",8,jlu4CztcSxrKx56ba1a5AQ,3.75,...,600,4346,2015-06-18 22:57:20,196,1038,4.227273,90,0.541,0.430655,0.515476
9,LL7EY-YARSY41n98P9509w,6,2016-10-28 03:01:48,3,yNgwJLQRJSgF-LIUHf6cfQ,4.0,Izakaya Tsuki is located on the second round a...,4,jlu4CztcSxrKx56ba1a5AQ,3.75,...,600,4346,2015-06-18 22:57:20,244,1298,4.256098,103,0.578,0.294764,0.499757


In [80]:
def clean_user_data(df):
    
    #create list of compliment features
    compliments=["compliment_cool",
             "compliment_cute",
             "compliment_funny",
             "compliment_hot",
             "compliment_list",
             "compliment_more",
             "compliment_note",
             "compliment_photos",
             "compliment_plain",
             "compliment_profile",
             "compliment_writer"]
    
    df["compliment_total"]=float(0.00)
    
    for comp in compliments:
        df["compliment_total"]+=df[comp]
        
    for comp in compliments:
        df[str("average_"+comp)]=float(0.00)
        
    for comp in compliments:
        df[str("average_"+comp)]=round(df[comp]/df["compliment_total"],2)
    
    #create list of review_rating features
    review_rating=["cool_y",
               "funny_y",
               "useful_y"]
    
    for rating in review_rating:
        df[str("average_"+rating)]=float(0.00)

    for rating in review_rating:
        df[str("average_"+rating)]=round(df[rating]/df["review_count"],2)
    
    
    
    df["friend_count"]=df.apply(lambda row: len(row.friends.split(",")), axis=1)
            
    #fill null values
    df.fillna(0,inplace=True)
    
    return df

In [81]:
clean_user_data(raw_data)

Unnamed: 0,business_id,cool_x,date,funny_x,review_id,stars,text,useful_x,user_id,average_stars,...,average_compliment_more,average_compliment_note,average_compliment_photos,average_compliment_plain,average_compliment_profile,average_compliment_writer,average_cool_y,average_funny_y,average_useful_y,friend_count
0,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg,3.97,...,0.04,0.12,0.02,0.21,0.02,0.04,0.99,0.53,2.62,52
1,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw,4.27,...,0.00,0.50,0.00,0.50,0.00,0.00,0.10,0.10,1.30,2
2,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,yi0R0Ugj_xUx_Nek0-_Qig,5.0,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg,4.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.25,0.50,23
3,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,11a8sVPMUFtaC7_ABRkmtw,1.0,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ,2.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,3.50,335
4,eU_713ec6fTGNO4BegRaww,0,2013-01-20 13:25:59,0,fdiNeiN_hoCxCMy2wTRW9g,4.0,I'll be the first to admit that I was not exci...,0,w31MKYsNFMrjhWxxAb5wIw,4.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1
5,3fw2X5bZYeW9xCz_zGhOHg,5,2016-05-07 01:21:02,4,G7XHMxG0bx9oBJNECG4IFg,3.0,Tracy dessert had a big name in Hong Kong and ...,5,jlu4CztcSxrKx56ba1a5AQ,3.75,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903
6,uF86ZhygpBEGr3CudNemYA,10,2015-11-06 13:39:39,4,PEkYyTrqRjXN5mOFBn5fzw,4.0,O'Noir was an awesome dining experience. I hav...,11,jlu4CztcSxrKx56ba1a5AQ,3.75,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903
7,YXjD1_p0mvZkdI8S_RKdnA,10,2018-03-16 13:46:13,5,-li7zeP2cg4pocxH4iNTug,5.0,I grabbed a quick poke bowl today before my bu...,9,jlu4CztcSxrKx56ba1a5AQ,3.75,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903
8,0MCXyrNQGaqdYBcAwh3anQ,8,2018-02-03 03:17:00,5,aujNf9GfrMExBNKDHDDbQg,4.0,"For the price of the sushi, it would be four s...",8,jlu4CztcSxrKx56ba1a5AQ,3.75,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903
9,LL7EY-YARSY41n98P9509w,6,2016-10-28 03:01:48,3,yNgwJLQRJSgF-LIUHf6cfQ,4.0,Izakaya Tsuki is located on the second round a...,4,jlu4CztcSxrKx56ba1a5AQ,3.75,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903


In [82]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399999 entries, 0 to 399998
Data columns (total 53 columns):
business_id                   399999 non-null object
cool_x                        399999 non-null int64
date                          399999 non-null object
funny_x                       399999 non-null int64
review_id                     399999 non-null object
stars                         399999 non-null float64
text                          399999 non-null object
useful_x                      399999 non-null int64
user_id                       399999 non-null object
average_stars                 399999 non-null float64
compliment_cool               399999 non-null int64
compliment_cute               399999 non-null int64
compliment_funny              399999 non-null int64
compliment_hot                399999 non-null int64
compliment_list               399999 non-null int64
compliment_more               399999 non-null int64
compliment_note               399999 non-null in

In [83]:
raw_data.drop(["cool_x","funny_x","useful_x"],axis=1,inplace=True)

In [84]:
#Cleaning data
def clean_review(df):
    
    df["text"] = df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))
    
    df["text"] = df["text"].str.replace("[^\w\s]","")
            
    return df

In [85]:
clean_review(raw_data)

Unnamed: 0,business_id,date,review_id,stars,text,user_id,average_stars,compliment_cool,compliment_cute,compliment_funny,...,average_compliment_more,average_compliment_note,average_compliment_photos,average_compliment_plain,average_compliment_profile,average_compliment_writer,average_cool_y,average_funny_y,average_useful_y,friend_count
0,NZnhc2sEQy3RmzKTZnqtwQ,2017-01-14 21:30:33,GJXCdrto3ASJOqKeVWPi6Q,5.0,i adore travis at the hard rocks new kelly car...,yXQM5uF2jS6es16SJzNHfg,3.97,10,1,10,...,0.04,0.12,0.02,0.21,0.02,0.04,0.99,0.53,2.62,52
1,WTqjgwHlXbSFevF32_DJVw,2016-11-09 20:09:03,2TzJjDVDEuAW6MR5Vuc1ug,5.0,i have to say that this office really has it t...,n6-Gk65cPZL6Uz8qRm3NYw,4.27,0,0,0,...,0.00,0.50,0.00,0.50,0.00,0.00,0.10,0.10,1.30,2
2,ikCg8xy5JIg_NGPx-MSIDA,2018-01-09 20:56:38,yi0R0Ugj_xUx_Nek0-_Qig,5.0,went in for a lunch steak sandwich was delicio...,dacAIZ6fTM6mqwW5uxkskg,4.00,0,0,0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.25,0.50,23
3,b1b1eb3uo-w561D0ZfCEiQ,2018-01-30 23:07:38,11a8sVPMUFtaC7_ABRkmtw,1.0,today was my second out of three sessions i ha...,ssoyf2_x0EQMed6fgHeMyQ,2.00,0,0,0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,3.50,335
4,eU_713ec6fTGNO4BegRaww,2013-01-20 13:25:59,fdiNeiN_hoCxCMy2wTRW9g,4.0,ill be the first to admit that i was not excit...,w31MKYsNFMrjhWxxAb5wIw,4.00,0,0,0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1
5,3fw2X5bZYeW9xCz_zGhOHg,2016-05-07 01:21:02,G7XHMxG0bx9oBJNECG4IFg,3.0,tracy dessert had a big name in hong kong and ...,jlu4CztcSxrKx56ba1a5AQ,3.75,280,5,280,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903
6,uF86ZhygpBEGr3CudNemYA,2015-11-06 13:39:39,PEkYyTrqRjXN5mOFBn5fzw,4.0,onoir was an awesome dining experience i have ...,jlu4CztcSxrKx56ba1a5AQ,3.75,280,5,280,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903
7,YXjD1_p0mvZkdI8S_RKdnA,2018-03-16 13:46:13,-li7zeP2cg4pocxH4iNTug,5.0,i grabbed a quick poke bowl today before my bu...,jlu4CztcSxrKx56ba1a5AQ,3.75,280,5,280,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903
8,0MCXyrNQGaqdYBcAwh3anQ,2018-02-03 03:17:00,aujNf9GfrMExBNKDHDDbQg,4.0,for the price of the sushi it would be four st...,jlu4CztcSxrKx56ba1a5AQ,3.75,280,5,280,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903
9,LL7EY-YARSY41n98P9509w,2016-10-28 03:01:48,yNgwJLQRJSgF-LIUHf6cfQ,4.0,izakaya tsuki is located on the second round a...,jlu4CztcSxrKx56ba1a5AQ,3.75,280,5,280,...,0.03,0.09,0.05,0.20,0.02,0.06,6.78,3.84,7.24,903


In [86]:
raw_data.drop(["average_cool_y","average_funny_y","average_useful_y"],axis=1,inplace=True)

In [87]:
raw_data.to_csv("cleaned_data.csv",index=False)