In [1]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
df = pd.read_csv(filepath_or_buffer='Zomato Bangalore Data.csv',encoding='latin1')

In [4]:
df.shape

(19147, 14)

In [22]:
reviews=df[['reviews_list','rate']]

In [23]:
reviews=reviews[reviews.reviews_list != '[]']# removed all rows with blank comment

In [24]:
reviews= reviews.drop_duplicates()

In [25]:
reviews.shape

(10995, 2)

In [26]:
# separating 4.2/5/0 to 4.2 and 5.0 an accepting only first split 
reviews['rate']=reviews['rate'].str.split("/",n=1,expand=True)[0]

In [27]:
reviews.shape

(10995, 2)

In [28]:
import string
import re
import emoji

In [29]:
pattern1=r"(Rated\s+\d.\d.\W)|(RATED)|(\\n)|(\d/\d)"
punc = string.punctuation
punctuation = string.punctuation
# punc (result : '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
#all the puncation which i dont require as of now ( if required )
punc = punc.replace(",", "")
punc = punc.replace("'", "")
punc = punc.replace(".", "")
remove_punc=r'[{}]'.format(re.escape(punc))
ph = r"['\x83ã\x83ã\x82ã\x83ã\x83ã\x82ã\x82ã\x83ã\x83ã\x83ã\x82ã\x82ã\x83ã\x82ã\x82ã¢ã\x83ã\x83ã\x82ã\x83ã\x83ã\x82ã\x82ã\x82ã\x83ã\x83ã\x82ã\x82ã\x83ã\x82ã\x82ã\x80ã\x83ã\x83ã\x82ã\x83ã\x83ã\x82ã\x82ã\x82ã\x83ã\x83ã\x82ã\x82ã\x83ã\x82ã\x82ã']|[â]"
remove_digit = r'[{}]'.format(string.digits)
pattern2 =r"(f\s\W\s\W)+"
pattern3 =r"(\s°\sf\s\w)|(\s¥\s\¯\s)" 
pattern4 =r"(\s\sf\sf\s¼)|(°\sf\s§)|(\s\s°)"

In [18]:
 text_reviews = []
def clean_reviews(sentences):
    text= (re.sub(pattern1,"",sentences))  #removing rated 3.0 or rated n
    text = text.lower()
    text= re.sub(remove_punc," ",text)
    text= emoji.replace_emoji(text,"")
    text=re.sub(r"\s{2}","",text)
    text= re.sub(ph,"  ",text)
    text=re.sub(remove_digit," ",text)
    text= re.sub(r'x\s{1,3}',"",text)
    text=re.sub(pattern2," ",text)
    text=re.sub(pattern3," ",text)
    text=re.sub(pattern4," ",text)
    #punctuation=string.punctuation
    text = re.sub(r"[',.]"," ",text)
    text=re.sub(r'\s{2,}'," ",text)
    return (text)
    ### Task -3  : Change all words into lower case
    #text_reviews.append(text)

In [19]:
reviews['clean_text'] = pd.Series()

  reviews['clean_text'] = pd.Series()


In [30]:
reviews['clean_text']= reviews['reviews_list'].apply(lambda x:clean_reviews(x))#clean_reviews(reviews.reviews_list)

In [34]:
reviews.drop_duplicates(subset=('clean_text'),inplace=True)

In [41]:
def rate_extractor2(rate):
    patterns= r"Rated\s\d"
    text = re.findall(patterns,rate)
    if len(text)==0:
        text['0.0']
    else:
        text= [phrase.split(' ')[1] for phrase in text]
        return round(np.mean(list(map(int, text))),2)

In [42]:
reviews['ExtractedRate']=reviews['reviews_list'].apply(lambda x:rate_extractor2(x))

In [46]:
reviews['rate_cat'] = pd.cut(reviews['ExtractedRate'], bins=[0,3,4.0,5],
                    labels=['Below Average', 'Average','Highly Recommended'
                            ]).astype(str)

In [48]:
reviews.rate_cat.value_counts()

Average               4891
Highly Recommended    2770
Below Average         1467
Name: rate_cat, dtype: int64

In [49]:
spacy_stopwords= nlp.Defaults.stop_words

In [56]:
reviews['text_no_stopwords'] = reviews['clean_text'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in spacy_stopwords))

In [53]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [54]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [59]:
lemmatizer =WordNetLemmatizer()

In [60]:
#lemmitization_output=[]
def lemmitizer(text):
    tokens_words = nltk.word_tokenize(text)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in tokens_words])
    return (lemmatized_output)

In [62]:
reviews['lemmatizer']=reviews['text_no_stopwords'].apply(lambda x : lemmitizer(x))

In [64]:
#creating features

In [65]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer

In [75]:
# create object
tfidf = TfidfVectorizer(ngram_range=(2,2),min_df=0.1,max_df=0.95)
# get tf-df values
result = tfidf.fit_transform(reviews['lemmatizer'])

In [76]:
matrix = pd.DataFrame(result.toarray(),columns=tfidf.get_feature_names())



In [77]:
matrix.head(5)

Unnamed: 0,amazing food,ambience food,ambience good,ambience place,ambience service,best place,chicken biryani,coming food,decent place,didn like,...,taste bud,taste food,taste good,tasted good,try place,value money,visit place,visited place,worth money,zomato gold
0,0.0,0.0,0.447159,0.0,0.0,0.226634,0.0,0.0,0.0,0.0,...,0.0,0.0,0.206071,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.094161,0.027652,0.046458,0.0,0.0,0.0,0.0,0.029082,0.0,0.0,...,0.031346,0.030238,0.25692,0.0,0.082997,0.021542,0.0,0.0,0.031408,0.03034
2,0.201414,0.044361,0.074532,0.0,0.0,0.0,0.0,0.046656,0.0,0.0,...,0.050287,0.048511,0.171738,0.0,0.177534,0.034559,0.0,0.0,0.050387,0.048673
3,0.0,0.205101,0.258444,0.0,0.0,0.130987,0.0,0.161784,0.055404,0.0,...,0.174375,0.336429,0.0,0.0,0.0,0.0,0.0,0.042568,0.05824,0.056259
4,0.110371,0.097236,0.163367,0.0,0.0,0.0,0.0,0.102266,0.0,0.0,...,0.110226,0.212662,0.075287,0.0,0.097285,0.0,0.0,0.0,0.110444,0.106688


In [78]:
from sklearn.model_selection import train_test_split

In [79]:
y=reviews['rate_cat']

In [130]:
y.shape

(9128,)

In [131]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42)
X_train_new, y_train_new = sm.fit_sample(matrix, y)

ModuleNotFoundError: No module named 'imblearn'

In [81]:
print(f"msg_train\t{len(X_train)}\nmsg_test\t{len(X_test)}\nlabel_train\t{len(y_train)}\nlabel_test\t{len(y_test)}")

msg_train	5476
msg_test	3652
label_train	5476
label_test	3652


In [83]:
#Modeling

In [84]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

In [125]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [127]:
classifier1 = RandomForestClassifier(criterion='gini',random_state=42)

In [107]:
classifier2 = SVC(kernel="poly").fit(X_train,y_train)

In [103]:
classifier3 = MultinomialNB().fit(X_train,y_train)

In [120]:
predictions1 = classifier1.predict(X_test)

In [108]:
predictions2 = classifier2.predict(X_test)

In [104]:
predictions3 = classifier3.predict(X_test)

In [109]:
#metrics
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix

In [121]:
# Print the overall accuracy
print(f"The project accuracy is {round(metrics.accuracy_score(y_test,predictions1),2)*100} %")

The project accuracy is 68.0 %


In [122]:
print(classification_report(y_test,predictions))

                    precision    recall  f1-score   support

           Average       0.65      0.87      0.75      1951
     Below Average       0.63      0.29      0.40       603
Highly Recommended       0.76      0.53      0.62      1098

          accuracy                           0.67      3652
         macro avg       0.68      0.57      0.59      3652
      weighted avg       0.68      0.67      0.65      3652



In [63]:
reviews.head(5)

Unnamed: 0,reviews_list,rate,clean_text,ExtractedRate,rate_cat,text_no_stopwords,lemmatizer
0,"[('Rated 4.0', 'RATED\n Excellent food I had ...",4.0,excellent food i had a spicy corn soup and sh...,4.17,Highly Recommended,excellent food spicy corn soup sholey ke panee...,excellent food spicy corn soup sholey ke panee...
2,"[('Rated 5.0', ""RATED\n After roaming around ...",4.1,after roaming around neighborhood we landed h...,3.39,Average,roaming neighborhood landed lunch spacious res...,roaming neighborhood landed lunch spacious res...
3,"[('Rated 5.0', ""RATED\n After roaming around ...",4.1,after roaming around neighborhood we landed h...,3.54,Average,roaming neighborhood landed lunch spacious res...,roaming neighborhood landed lunch spacious res...
4,"[('Rated 4.0', ""RATED\n Been here twice now a...",4.1,been here twice now and i love this place the...,3.88,Average,twice love place ambiance good food tastes gre...,twice love place ambiance good food taste grea...
5,"[('Rated 4.0', 'RATED\n This is a pure vegeta...",4.1,this is a pure vegetarian restaurant located ...,3.55,Average,pure vegetarian restaurant located th floor pr...,pure vegetarian restaurant located th floor pr...


In [None]:
final_ratings = df['reviews_list'].apply(lambda x:rate_extractor(x))

In [None]:
df2=pd.DataFrame(final_ratings)
df2.shape

In [None]:
data_merge1 = pd.merge(df1,reviews['rate'],how="left",left_index = True,right_index = True) # left join based on index                     

In [None]:
data_merge1 = pd.merge(data_merge1,df2,how="left",left_index = True,right_index = True)

In [None]:
data_merge1 = pd.merge(data_merge1,df3,how="left",left_index = True,right_index = True)

In [None]:
data_merge1.rename(columns={0:'CleanReviews',"reviews_list":"ExtractedRate"},inplace=True)

In [None]:
data_merge1

In [None]:
#plt.figure(figsize=(12,6))
#sns.distplot(data_merge1['ExtractedRate'],color='blue')

In [None]:
 #udne consideration
#sns.heatmap(data_merge1.isnull())

In [None]:
data_merge1['rate_cat'] = pd.cut(x=data_merge1['ExtractedRate'], bins=[0,3,4.0,5],
                    labels=['Below Average', 'Average','Highly Recommended'
                            ]).astype(str)

In [None]:
rate_cat.value_counts()

In [None]:
data_merge1[data_merge1['rate_cat']=='nan']

In [None]:
data_merge1.describe()

In [None]:
#leave te last code its just for my reference 

In [None]:
#function for extracting rate from reviews 
def rate_extractor(review):
    patterns=r"Rated (\d)"
    text=re.findall(patterns,review)
    if len(text)==0:
        text=['0.0']
    return round(np.mean(list(map(float,text))),2)

In [None]:
rating_from_reviews =[]
def rate_extractor2(rate):
    for i, in :
        patterns= r"Rated\s\d"
        text = re.findall(patterns,sentences)
        if len(text)==0:
            rating_from_reviews.append(round((sum(text)/(len(text)+1)),1))
        else:
            text= [phrase.split(' ')[1] for phrase in text]
            text = list(map(int, text))
            rating_from_reviews.append(round((sum(text)/len(text)),1))