# 2.- Text Preprocessing using NLP and Calculate Sentiment Analysis 

In [2]:
import pandas as pd
import numpy as np

# Text processing
import spacy
import spacy_lookups_data
import re

# Sentiment Analysis
from textblob import TextBlob

# Scaler
from sklearn.preprocessing import MinMaxScaler

In [3]:
reviews = pd.read_csv('../data/yelp_reviews_restaurant.csv')
reviews.head()

Unnamed: 0,user_id,business_id,review_stars,text,date,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories
0,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11,Deagan's Kitchen & Bar,14810 Detroit Ave,Lakewood,OH,44107,41.485192,-81.800145,4.0,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo...",American
1,zFCuveEe6M-ijY1iy23IJg,HQl28KMwrEKHqhFrrDqVNQ,5,"We walked into Melt. ""Did you want to put your...",2011-08-25 04:24:23,Deagan's Kitchen & Bar,14810 Detroit Ave,Lakewood,OH,44107,41.485192,-81.800145,4.0,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo...",American
2,4V985R3RG-rv0B7WCPQzeQ,HQl28KMwrEKHqhFrrDqVNQ,1,I commented on how slow the service was last A...,2015-03-04 20:37:43,Deagan's Kitchen & Bar,14810 Detroit Ave,Lakewood,OH,44107,41.485192,-81.800145,4.0,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo...",American
3,nFGcoL6wuPQzxsNJVSfGrA,HQl28KMwrEKHqhFrrDqVNQ,4,We walked in off the streets on a September ni...,2014-09-10 01:38:55,Deagan's Kitchen & Bar,14810 Detroit Ave,Lakewood,OH,44107,41.485192,-81.800145,4.0,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo...",American
4,CJqgUQeWhdgbDyLAFy7xvQ,HQl28KMwrEKHqhFrrDqVNQ,4,Brunch on Saturday was excellent. The Bloody M...,2018-01-21 18:50:29,Deagan's Kitchen & Bar,14810 Detroit Ave,Lakewood,OH,44107,41.485192,-81.800145,4.0,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo...",American


In [4]:
# Remove users that have more than 50 reviews.
grouped_users = reviews.groupby('user_id')['text'].count().reset_index()
grouped_users.head()

Unnamed: 0,user_id,text
0,---1lKK3aKOuomHnwAkAow,27
1,---94vtJ_5o_nikEs6hUjg,1
2,---RfKzBwQ8t3wu-LXvx3w,1
3,---tGbMnMitD_7srW6Nfzg,1
4,---udAKDsn0yQXmzbWQNSw,1


In [5]:
result = grouped_users.query('text > 50')
result['text'].sum()

354596

In [6]:
reviews.shape[0]

3140427

In [7]:
reviews_filtered = reviews[['user_id', 'name', 'review_stars', 'text']][reviews['user_id'].isin(result['user_id'])]
reviews_filtered.drop_duplicates(inplace=True)
reviews_filtered.shape

(320579, 4)

## Text preprocessing

In [9]:
# Functions to preprocess text

# convert to lower cases
def to_lower(text):
    return text.lower()

# remove some special characters
def remove_special_chars(text):
    return ''.join(re.sub(r'[-/@"():;^_%&,.!?]', ' ', text))

# remove white spaces
def remove_spaces(text):
    return re.sub(r'\s+',' ', text)

In [10]:
%%time
def clean_text(df):
    df['cleaned_text'] = df['text'].map(to_lower).map(remove_special_chars).map(remove_spaces)
    return df

df = clean_text(reviews_filtered)
df.head()

Wall time: 20.8 s


Unnamed: 0,user_id,name,review_stars,text,cleaned_text
1,zFCuveEe6M-ijY1iy23IJg,Deagan's Kitchen & Bar,5,"We walked into Melt. ""Did you want to put your...",we walked into melt did you want to put your n...
4,CJqgUQeWhdgbDyLAFy7xvQ,Deagan's Kitchen & Bar,4,Brunch on Saturday was excellent. The Bloody M...,brunch on saturday was excellent the bloody ma...
21,alxetHC3mXR2PtG8CeCN6Q,Deagan's Kitchen & Bar,4,"Great food, great atmosphere, great service. s...",great food great atmosphere great service some...
27,kHGRDaZt4nHgNSJcjOTDeQ,Deagan's Kitchen & Bar,3,Had a Saturday evening dinner with friends. G...,had a saturday evening dinner with friends goi...
29,pg2soiiAWSvenWac41oG8Q,Deagan's Kitchen & Bar,2,I haven't been here for years. I'm not from ...,i haven't been here for years i'm not from thi...


## Sentiment Analysis

Sentiment analysis is a text analysis method that detects polarity (e.g. a positive or negative opinion) within text, whether a whole document, paragraph, sentence, or clause.

Understanding people’s emotions is essential for businesses since customers are able to express their thoughts and feelings more openly than ever before.

In [14]:
# Calculate sentiment for each review

def sentiment_calc(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None

df['sentiment'] = df['cleaned_text'].apply(sentiment_calc)

In [15]:
df.reset_index().drop('index', axis=1, inplace=True)

In [16]:
# Check if there are some null values
df.isnull().sum()

user_id         0
name            0
review_stars    0
text            0
cleaned_text    0
sentiment       0
dtype: int64

In [22]:
## Scale sentiment to be comparable with rating
scaler = MinMaxScaler(feature_range=(1,5))

df['sent_rating'] = scaler.fit_transform(df['sentiment'].values.reshape(-1,1))

df.drop('sentiment', axis=1, inplace=True)

In [23]:
df.head()

Unnamed: 0,user_id,name,review_stars,text,cleaned_text,sent_rating
1,zFCuveEe6M-ijY1iy23IJg,Deagan's Kitchen & Bar,5,"We walked into Melt. ""Did you want to put your...",we walked into melt did you want to put your n...,3.505159
4,CJqgUQeWhdgbDyLAFy7xvQ,Deagan's Kitchen & Bar,4,Brunch on Saturday was excellent. The Bloody M...,brunch on saturday was excellent the bloody ma...,3.8
21,alxetHC3mXR2PtG8CeCN6Q,Deagan's Kitchen & Bar,4,"Great food, great atmosphere, great service. s...",great food great atmosphere great service some...,3.5875
27,kHGRDaZt4nHgNSJcjOTDeQ,Deagan's Kitchen & Bar,3,Had a Saturday evening dinner with friends. G...,had a saturday evening dinner with friends goi...,3.436378
29,pg2soiiAWSvenWac41oG8Q,Deagan's Kitchen & Bar,2,I haven't been here for years. I'm not from ...,i haven't been here for years i'm not from thi...,2.981408


In [25]:
# Map user_id with index

df['user_id'].unique(), df['user_id'].nunique()

array(['zFCuveEe6M-ijY1iy23IJg', 'CJqgUQeWhdgbDyLAFy7xvQ',
       'alxetHC3mXR2PtG8CeCN6Q', ..., 'BIcfTJw4-76b9NuZUgpKIA',
       'ZYY9f01cj8dB9sMfuY3diA', 'SPI7O1GKZTvJhiXNFj81Eg'], dtype=object)

In [35]:
# Map user_id with index

for i, user_id in enumerate(df['user_id'].unique()):
    idx = df[df['user_id'] == user_id].index
    df.loc[idx,'uid'] = int(i)
    
df['uid'].nunique()

4006

In [40]:
df['uid'] = df['uid'].astype('int16')

In [42]:
df.drop(['user_id', 'text'], axis=1, inplace=True)

In [43]:
df.head()

Unnamed: 0,name,review_stars,cleaned_text,sent_rating,uid
1,Deagan's Kitchen & Bar,5,we walked into melt did you want to put your n...,3.505159,0
4,Deagan's Kitchen & Bar,4,brunch on saturday was excellent the bloody ma...,3.8,1
21,Deagan's Kitchen & Bar,4,great food great atmosphere great service some...,3.5875,2
27,Deagan's Kitchen & Bar,3,had a saturday evening dinner with friends goi...,3.436378,3
29,Deagan's Kitchen & Bar,2,i haven't been here for years i'm not from thi...,2.981408,4


In [44]:
df.to_csv('../data/reviews_filtered.csv', index=False)