In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('Restaurant reviews.csv')
df.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0,
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0,


# Data Cleaning

In [3]:
df = df.drop(["Restaurant", "Reviewer", "Metadata", "Pictures" ,"7514"], axis=1)

In [4]:
df["Time"] = list(map(lambda data: str(data).split()[0], df["Time"]))
df["Time"] = list(map(lambda data: str(data).split("/")[-1], df["Time"]))

In [5]:
df["Rating"].unique()

array(['5', '4', '1', '3', '2', '3.5', '4.5', '2.5', '1.5', 'Like', nan],
      dtype=object)

In [6]:
df["Rating"].isnull().sum()

38

In [7]:
df['Rating'].value_counts().idxmax()

'5'

In [8]:
df['Rating'] = np.where(df["Rating"] == "Like", df['Rating'].value_counts().idxmax(), df['Rating'])

In [9]:
df["Rating"].unique()

array(['5', '4', '1', '3', '2', '3.5', '4.5', '2.5', '1.5', nan],
      dtype=object)

In [10]:
df["Rating"] = list(map(lambda data: float(data) >= 3, df["Rating"]))

In [11]:
df["Rating"].unique()

array([ True, False])

In [12]:
df["Time"].unique()

array(['2019', '2018', '2017', '2016', 'nan'], dtype=object)

In [13]:
df['Time'] = np.where(df["Time"] == "nan", df['Time'].value_counts().idxmax(), df['Time'])

In [14]:
df["Time"].unique()

array(['2019', '2018', '2017', '2016'], dtype=object)

In [15]:
mmsTime = MinMaxScaler()

mmsTime.fit(df[["Time"]])
df["Time"] = mmsTime.transform(df[["Time"]])

In [16]:
df["Time"].unique()

array([1.        , 0.66666667, 0.33333333, 0.        ])

In [17]:
df

Unnamed: 0,Review,Rating,Time
0,"The ambience was good, food was quite good . h...",True,1.0
1,Ambience is too good for a pleasant evening. S...,True,1.0
2,A must try.. great food great ambience. Thnx f...,True,1.0
3,Soumen das and Arun was a great guy. Only beca...,True,1.0
4,Food is good.we ordered Kodi drumsticks and ba...,True,1.0
...,...,...,...
9995,Madhumathi Mahajan Well to start with nice cou...,True,0.0
9996,This place has never disappointed us.. The foo...,True,0.0
9997,"Bad rating is mainly because of ""Chicken Bone ...",False,0.0
9998,I personally love and prefer Chinese Food. Had...,True,0.0


In [18]:
df["Review"] = df["Review"].fillna("Nothing")

# NLP

In [21]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

ps = PorterStemmer()

In [27]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Donation\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Donation\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [28]:
reviewList = list(map(lambda data: word_tokenize(data), df["Review"]))

In [29]:
for index, word_list in enumerate(reviewList):
    reviewList[index] = " ".join([ps.stem(str(word)) for word in word_list if not word in stopwords.words("english") and word not in [".", ",", "?", "@", "$", "/"] and not word.isspace()])

In [30]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF" 
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)

regexPuct = r"[\s\w\d]"

In [31]:
for index, word_list in enumerate(reviewList):
    reviewList[index] = re.sub(emoji_pattern, r'', word_list)
    reviewList[index] = "".join(re.findall(regexPuct, reviewList[index], re.MULTILINE))
    reviewList[index] = " ".join(reviewList[index].split())

In [32]:
df["Review"] = reviewList

In [33]:
df.head()

Unnamed: 0,Review,Rating,Time
0,the ambienc good food quit good saturday lunch...,True,1.0
1,ambienc good pleasant even servic prompt food ...,True,1.0
2,a must tri great food great ambienc thnx servi...,True,1.0
3,soumen da arun great guy onli behavior sincere...,True,1.0
4,food goodw order kodi drumstick basket mutton ...,True,1.0


# Separating Data

In [34]:
X = df["Review"]
y = df["Rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
X_train.iloc[0]

'went dinner buffet famili gather food averag price paymost item avail buffet menu serv tabl lot delay'

# Models

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,3))),
    ('classifier', SVC()) 
])

pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.9145

# Testing Model

In [40]:
positive_sentence = "I absolutely love this product! It has exceeded all my expectations. It's well-designed, durable, and incredibly useful. I can't imagine my life without it now. I would highly recommend it to anyone looking for a high-quality, reliable product"
negative_sentence = "I bought this product, and it was a complete waste of money. It stopped working after just a week, and the customer service was unhelpful and rude. I will never purchase anything from this company again."

In [41]:
pipeline.predict([positive_sentence, negative_sentence])

array([ True, False])

In [44]:
def preprocess(text):
    text = word_tokenize(text)
    text = " ".join([ps.stem(str(word)) for word in text if not word in stopwords.words("english") and word not in [".", ",", "?", "@", "$", "/"] and not word.isspace()])
    text = re.sub(emoji_pattern, r'', text)
    text = "".join(re.findall(regexPuct, text, re.MULTILINE))
    text = " ".join(text.split())

    return text

In [45]:
positive_sentence = preprocess(positive_sentence)
negative_sentence = preprocess(negative_sentence)

In [46]:
positive_sentence

'i absolut love product it exceed expect it s welldesign durabl incred use i ca nt imagin life without i would highli recommend anyon look highqual reliabl product'

In [47]:
negative_sentence

'i bought product complet wast money it stop work week custom servic unhelp rude i never purchas anyth compani'

In [48]:
pipeline.predict([positive_sentence, negative_sentence])

array([ True, False])