In [1]:
#DATA CLEANING AND PREPROCESSING

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [3]:
# Load the restaurant CSV into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
csv_path = "TA_restaurants_curated.csv"
data = pd.read_csv(csv_path)

FileNotFoundError: File b'TA_restaurants_curated.csv' does not exist

In [None]:
# Preview the first rows to check that the columns loaded as expected.
data.head()

In [None]:
# Summary statistics of the frame's columns.
data.describe()

In [None]:
# Look at the raw 'Price Range' values before any encoding.
data["Price Range"].head()

In [None]:
# Spot-check a single city: keep only the rows whose City is 'Amsterdam'.
is_amsterdam = data['City'] == 'Amsterdam'
Amsterdam = data.loc[is_amsterdam]
Amsterdam.head()

In [None]:
# Inspect one raw 'Cuisine Style' value (row label 30) to see how it is stored.
data['Cuisine Style'].loc[30]

In [None]:
# Number of rows per city; value_counts() sorts the most frequent city first.
city_count = data['City'].value_counts()
city_count.head()

In [None]:
# Count the missing values in each column of the raw data.
data.isna().sum()

In [None]:
# Rename the unnamed first column ('Unnamed: 0') to 'Restaurant ID'.
# rename() returns a new frame, so `data` itself is left unchanged.
restaurant_data = data.rename(columns = {'Unnamed: 0':'Restaurant ID'})
restaurant_data.head()

In [None]:
# Fill only the missing Ranking values, using Restaurant ID + 1 as the estimate.
# Rankings that are already present are kept as-is rather than being overwritten.
restaurant_data["Ranking"] = restaurant_data["Ranking"].fillna(
    restaurant_data["Restaurant ID"] + 1
)

In [None]:
# Check the frame after the Ranking step.
restaurant_data.head()

In [None]:
# Missing values remaining in each column.
restaurant_data.isna().sum()

In [None]:
# Index labels of the rows with no 'Number of Reviews', and how many there are.
na_loc = restaurant_data.index[restaurant_data['Number of Reviews'].isna()]
num_nas = len(na_loc)

# Only the last expression of a cell is displayed, so show the count.
num_nas

In [None]:
# Fill missing 'Number of Reviews' with the column mean, rounded to a whole number.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: under pandas copy-on-write the in-place form modifies a temporary object and
# leaves the frame unchanged.
mean_reviews = restaurant_data["Number of Reviews"].mean()
restaurant_data["Number of Reviews"] = (
    restaurant_data["Number of Reviews"].fillna(mean_reviews).round(0)
)

In [None]:
# Confirm the filled and rounded 'Number of Reviews' values.
restaurant_data['Number of Reviews'].head()

In [None]:
# Check the frame after filling 'Number of Reviews'.
restaurant_data.head()

In [None]:
# Distribution of 'Price Range' values within each city (missing values are not counted).
restaurant_data.groupby('City')['Price Range'].value_counts()   

In [None]:
# Fill missing 'Price Range' with the most frequent value in the whole dataset.
# value_counts() sorts by frequency, so index[0] is the most common label.
# Assigned back rather than using fillna(inplace=True) on the selected column, which
# does not update the frame under pandas copy-on-write.
most_common_price = restaurant_data['Price Range'].value_counts().index[0]
restaurant_data['Price Range'] = restaurant_data['Price Range'].fillna(most_common_price)

In [None]:
# Missing values remaining after filling 'Price Range'.
restaurant_data.isna().sum()

In [None]:
# Encode the 'Price Range' strings as integer codes.
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns codes following the sorted order of the string labels.
number = LabelEncoder()
price_as_text = restaurant_data['Price Range'].astype('str')
restaurant_data['Price Range'] = number.fit_transform(price_as_text)

In [None]:
# Check the integer codes that replaced the price strings.
restaurant_data['Price Range'].head(3)

In [None]:
# One-hot encode the integer price codes; the resulting columns are named after the codes.
restaurant_price = pd.get_dummies(restaurant_data['Price Range'])

In [None]:
# Confirm what kind of object get_dummies returned.
type(restaurant_price)

In [None]:
# Append the one-hot price columns to the restaurant table (column-wise, aligned on the index).
restaurants = pd.concat([restaurant_data,restaurant_price], axis=1)

In [None]:
# Check the combined frame, including the new one-hot columns.
restaurants.head()

In [None]:
# Give the one-hot columns (currently labelled 0, 1, 2) readable names.
# NOTE(review): this assumes the encoder mapped 0='$', 1='$$ - $$$', 2='$$$$';
# confirm against number.classes_. The third name uses a lowercase 'range'.
price_rename = restaurants.rename(columns = {0:'Price Range_$',1:'Price Range_$$-$$$',2:'Price range_$$$$'})

In [None]:
# Check the renamed one-hot columns.
price_rename.head()

In [None]:
# The integer-coded 'Price Range' column is redundant once the one-hot columns exist.
restaurants1 = price_rename.drop(columns=['Price Range'])
restaurants1.head()

In [None]:
# Missing values remaining in each column.
restaurants1.isna().sum()

In [None]:
#drop missing review rows
#restaurants2 = restaurants1.dropna(subset=['Reviews'])

In [None]:
# Replace missing 'Reviews' entries with a placeholder written as a one-item list string.
placeholder_review = '["No Review"]'
restaurants1['Reviews'] = restaurants1['Reviews'].fillna(placeholder_review)

In [None]:
# Confirm that 'Reviews' no longer has missing values.
restaurants1.isna().sum()

In [None]:
# Look at the last few 'Reviews' values.
restaurants1['Reviews'].tail()

In [None]:
# Spot-check the 'Reviews' value at row label 3233 before the replacement below.
restaurants1['Reviews'][3233]

In [None]:
# Rows whose value is exactly the string '[[], []]' (empty lists) get a plain placeholder.
empty_review_map = {'[[], []]': 'No Review'}
restaurants1['Reviews'] = restaurants1['Reviews'].replace(empty_review_map)

In [None]:
# Display the 'Reviews' column after the placeholder replacements.
restaurants1['Reviews']

In [None]:
# Re-check row label 3233 now that empty reviews have been replaced.
restaurants1['Reviews'][3233]

In [None]:
# Number of rows in the frame.
len(restaurants1)

In [None]:
# Check whether any restaurant lists Vietnamese cuisine.
# `in` applied to a Series tests the index labels, not the values, so search the
# strings themselves; na=False treats missing entries as "no match".
restaurants1['Cuisine Style'].str.contains('Vietnamese', na=False).any()

In [None]:
# Check the Python type of one 'Cuisine Style' entry (row label 175).
type(restaurants1['Cuisine Style'][175])

In [None]:
# Replace missing 'Cuisine Style' entries with a placeholder written as a one-item list string.
placeholder_cuisine = '["Unknown"]'
restaurants1['Cuisine Style'] = restaurants1['Cuisine Style'].fillna(placeholder_cuisine)

In [None]:
# Check the last rows after filling 'Cuisine Style'.
restaurants1.tail()

In [None]:
# Spot-check the 'Cuisine Style' value at row label 175 after the fill.
restaurants1['Cuisine Style'][175]

In [None]:
# Spot-check the 'Cuisine Style' value at row label 3243.
restaurants1['Cuisine Style'][3243]

In [None]:
# Number of entries in the 'Cuisine Style' column.
len(restaurants1['Cuisine Style'])

In [None]:
# Confirm that 'Cuisine Style' no longer has missing values.
restaurants1.isna().sum()

In [None]:
# Spot-check the 'Cuisine Style' value at row label 125189.
# NOTE(review): raises KeyError if that label is not in the index; confirm the dataset size.
restaurants1['Cuisine Style'][125189]

In [None]:
# Parse every 'Cuisine Style' string into a list of cuisine names, ready for
# multi-label encoding.
def _parse_cuisines(raw):
    """Strip quotes, brackets and spaces from a list-like string, then split on commas.

    All spaces are removed, including those inside a multi-word name.
    """
    for junk in ("'", "[", "]", "\"", " "):
        raw = raw.replace(junk, "")
    return raw.split(',')


# Iterate over the values directly: looking rows up with .loc[i] for i in range(len(...))
# only works when the index is the default 0..n-1, and printing every row floods the output.
cuisine_style = [_parse_cuisines(raw) for raw in restaurants1['Cuisine Style']]

# Show a small preview rather than the whole list.
cuisine_style[:5]

In [None]:
# Number of parsed cuisine lists (one per processed row).
len(cuisine_style)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the cuisine lists: one 0/1 column per distinct cuisine name.
df = pd.DataFrame({'Cuisine Style': cuisine_style})
transform_df = df['Cuisine Style']
mlb = MultiLabelBinarizer()
cuisine_matrix = mlb.fit_transform(transform_df)
cuisines = pd.DataFrame(cuisine_matrix, index=df.index, columns=mlb.classes_)

In [None]:
# Display the multi-hot cuisine matrix.
cuisines

In [None]:
# Cuisine flags for the row with label 0.
cuisines.loc[0,:]

In [None]:
# Append the cuisine flag columns to the restaurant table (column-wise, aligned on the index).
restaurants2 = pd.concat([restaurants1,cuisines], axis=1)
restaurants2

In [None]:
# List the column names (iterating a DataFrame yields its column labels).
list(restaurants2)

In [None]:
# The raw 'Cuisine Style' text is no longer needed once the flag columns exist.
restaurants3 = restaurants2.drop(columns=['Cuisine Style'])
restaurants3

In [None]:
# Count missing values per column to see how many ratings are missing.
restaurants3.isna().sum()

In [None]:
# How often each rating value occurs (before filling and casting).
restaurants3.Rating.value_counts()

In [None]:
# Distinct rating values present (before filling and casting).
restaurants3.Rating.unique()

In [None]:
# Fill missing ratings with the mean rating.
# Assigned back rather than using fillna(inplace=True) on the selected column, which
# does not update the frame under pandas copy-on-write.
restaurants3["Rating"] = restaurants3["Rating"].fillna(restaurants3["Rating"].mean())

In [None]:
# Cast ratings to integers. astype(int) truncates toward zero, so 4.5 becomes 4.
restaurants3['Rating'] = restaurants3['Rating'].astype(int)

In [None]:
# Display the frame after the rating clean-up.
restaurants3

In [None]:
# Rating distribution after filling and casting to int.
restaurants3.Rating.value_counts()

In [None]:
# Distinct rating values after filling and casting to int.
restaurants3.Rating.unique()

In [None]:
#sentiments associated with ratings

#terrible = [0]

#bad = [1]

#average = [2]

#good = [3]

#very good = [4]

#excellent = [5]

In [None]:
#working with the text data in reviews - Sentiment Analysis

In [None]:
# Keep only the two columns used for the sentiment model: the rating and the review text.
sentiment_data = restaurants3[['Rating', 'Reviews']]
sentiment_data

In [None]:
# Convert the two-column frame to a NumPy array: column 0 is the rating, column 1 the reviews.
sdata = sentiment_data.to_numpy()
sdata

In [None]:
# Number of rows in the array.
len(sdata)

In [None]:
# Row count, plus an empty accumulator for the [review, rating] pairs built later.
count = len(sdata)
all_data = []

# Row positions at 25%, 50% and 75% of the data (batch boundaries).
stop1, stop2, stop3 = (int(fraction * count) for fraction in (0.25, 0.5, 0.75))

In [None]:
# Show the three batch boundaries and the total row count.
stop1, stop2, stop3, count    #batches of data points

In [None]:
# Explode each row's review string into one [review, rating] pair per review.
# The raw value holds the review texts and the review dates as two bracketed lists in a
# single string; only the part before the first '], [' (the review texts) is kept.
# The accumulator is reset here so that re-running this cell does not append duplicates.
all_data = []
for rating, raw_reviews in sdata:
    review_part = raw_reviews.split('], [')[0]
    for junk in ("[[", "'", '"'):
        review_part = review_part.replace(junk, "")
    # NOTE: splitting on ',' also splits a single review that itself contains a comma.
    all_data.extend([review, rating] for review in review_part.split(','))

In [None]:
# One row per individual review, paired with the rating taken from its source row.
sent_data = pd.DataFrame(all_data, columns=['Review', 'Rating'])

In [None]:
# Display the review/rating table.
sent_data

In [None]:
# Remove punctuation: drop every character that is neither a word character nor whitespace.
# regex=True is required because pandas >= 2.0 treats the pattern as a literal string by
# default, and the raw string keeps the \w and \s escapes intact.
sent_data['Review'] = sent_data['Review'].str.replace(r'[^\w\s]', '', regex=True)
sent_data.head(3)

In [None]:
# Lower-case each review and split it on whitespace into a list of words.
def _to_tokens(text):
    return text.lower().split()


tokenized_data = sent_data['Review'].apply(_to_tokens)
tokenized_data.head(5)

In [None]:
import nltk      # natural language toolkit
# Import the one name that is used instead of `import *`, which hides where names come from.
from nltk.stem.porter import PorterStemmer

In [None]:
# Reduce every token of every review to its Porter stem.
stemmer = PorterStemmer()


def _stem_tokens(tokens):
    return [stemmer.stem(token) for token in tokens]


stem_data = tokenized_data.apply(_stem_tokens)

In [None]:
# Display the stemmed token lists.
stem_data

In [None]:
# Join each review's stemmed tokens back into one space-separated string.
# Iterating the Series directly avoids the label lookup stem_data[i], which only works
# when the index is the default 0..n-1.
stemmed_data = [' '.join(tokens) for tokens in stem_data]

# Show a small preview rather than the whole list.
stemmed_data[:5]

In [None]:
# View the cleaned reviews as a single-column (n, 1) array.
np.array(stemmed_data).reshape(-1,1)

In [None]:
# Store the cleaned text as a new column. The flat list already has one string per row,
# so it is assigned directly instead of being reshaped into an (n, 1) array first.
sent_data['Cleaned_Review'] = stemmed_data

In [None]:
# Display the table with the new 'Cleaned_Review' column.
sent_data

In [None]:
# Select the model inputs (cleaned review text) and the targets (ratings).
# These are the two inputs passed to train_test_split further down.

textReviews = sent_data['Cleaned_Review']
Y = sent_data['Rating']

In [None]:
# Convert the labels to a NumPy array (np.array makes a copy by default).
Y = np.array(Y)

In [None]:
# Clamp negative ratings to 0. The boolean-mask assignment already covers the whole
# array, so it runs once instead of being repeated for every element.
Y[Y < 0] = 0

In [None]:
# Display the label array after clamping.
Y

In [None]:
# Split the reviews and labels 70/30 into train and test sets.
split_options = dict(
    train_size=0.7,    # 70% of the rows go to the training set
    random_state=500,  # fixed seed so the shuffle is reproducible
    stratify=Y,        # keep the class proportions the same in train and test
)
textReviews_train, textReviews_test, Y_train, Y_test = train_test_split(
    textReviews, Y, **split_options
)

In [None]:
#Convert the text data into numerical features so the classifier can use it.
#We use a bag-of-words model: CountVectorizer breaks each customer review into words (tokens) and counts them.

In [None]:
# Create the scikit-learn CountVectorizer that turns each review into a vector of token counts.
from sklearn.feature_extraction.text import CountVectorizer 
# max_df=0.90 ignores terms found in more than 90% of the documents, min_df=2 ignores terms
# found in fewer than 2 documents, max_features keeps at most the 10,000 most frequent terms,
# and stop_words='english' removes scikit-learn's built-in English stop words.
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=10000, stop_words='english') 

In [None]:
# Build the bag-of-words feature matrices.
# The vocabulary is learned from the training reviews only; the test reviews are then
# encoded with that same vocabulary via transform(). Re-fitting on the test set would
# give X_test a different set of columns from X_train, so a model trained on one could
# not be evaluated on the other.
X_train = bow_vectorizer.fit_transform(textReviews_train.values)
X_test = bow_vectorizer.transform(textReviews_test.values)

In [None]:
# Shape of the training matrix plus a dense preview of its first rows.
# Only a slice is densified: toarray() on the full sparse matrix would allocate
# rows x vocabulary cells and can exhaust memory.
print(X_train.shape)
print(X_train[:5].toarray())

In [None]:
# Shape of the test matrix plus a dense preview of its first rows.
# Only a slice is densified: toarray() on the full sparse matrix would allocate
# rows x vocabulary cells and can exhaust memory.
print(X_test.shape)
print(X_test[:5].toarray())

In [None]:
# vocabulary_ maps each term kept by the vectorizer to its column index.
vocabulary = bow_vectorizer.vocabulary_

# Show the vocabulary size and a few entries instead of dumping the whole mapping.
len(vocabulary), dict(list(vocabulary.items())[:10])

In [None]:
# Dimensionality reduction using truncated singular value decomposition (SVD).

from sklearn.decomposition import TruncatedSVD

# Fit the projection on the training matrix only and reuse it for the test matrix, so
# both sets land in the same 500-dimensional space. Re-fitting on the test set would
# produce components unrelated to the ones the model is trained on.
# random_state makes the randomized solver reproducible.
tsvd = TruncatedSVD(n_components=500, random_state=500)
x_train = tsvd.fit_transform(X_train)
x_test = tsvd.transform(X_test)

In [None]:
# Shape of the reduced training features.
x_train.shape

In [None]:
# Shape of the reduced test features.
x_test.shape

In [None]:
# One-hot encode the training labels: one column per distinct label value.
# NOTE(review): the fit cells pass Y_train (integer labels), not this array; confirm it is needed.
y_train = pd.get_dummies(Y_train).to_numpy()
y_train

In [None]:
# One-hot encode the test labels: one column per distinct label value found in the test set.
y_test = pd.get_dummies(Y_test).to_numpy()
y_test

In [None]:
#model training using keras

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Re-check the feature shape; its second dimension is the model's input size.
x_train.shape

In [None]:
# Model architecture: a fully connected network with three ReLU hidden layers and a
# 6-way softmax output (one unit per rating class).
# The input size is taken from the feature dimension of x_train rather than from the
# shape of one particular sample row.
model = Sequential([
    Dense(729, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(389, activation='relu'),
    Dense(153, activation='relu'),
    Dense(6, activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Adam optimizer with a learning rate of 0.01. The class is `Adam` (capitalised) and the
# argument is `learning_rate`; the lowercase `adam` alias and the `lr` keyword are not
# available in current Keras releases.
adam = keras.optimizers.Adam(learning_rate=0.01)

# sparse_categorical_crossentropy expects integer class labels rather than one-hot rows.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

In [None]:
# Train for 40 epochs on the reduced features with the integer labels,
# using the default batch size.
history = model.fit(
    x_train,
    Y_train,
    epochs=40,
    verbose=1,
)

In [None]:
# Train for another 40 epochs with a batch size of 1024.
# fit() continues from the model's current weights, and `history` is rebound to this run.
history = model.fit(
    x_train,
    Y_train,
    epochs=40,
    batch_size=1024,
    verbose=1,
)