# Libraries

In [69]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
from nltk import NaiveBayesClassifier

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from numpy.random import random_sample
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import category_encoders as ce

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from collections import Counter, defaultdict


In [2]:
ps = PorterStemmer()

In [3]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1123)>


False

# Open CSV

In [4]:
df = pd.read_csv('/Users/lilianvalin/Library/Mobile Documents/com~apple~CloudDocs/iCloud Drive/Cours/UTC/S5 - A24/Voice Recognition & NLP/Lab/Lab 2/European_Restaurant_Reviews.csv')

In [5]:
df.head()

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review
0,France,The Frog at Bercy Village,Negative,Rude manager,May 2024 •,The manager became agressive when I said the c...
1,France,The Frog at Bercy Village,Negative,A big disappointment,Feb 2024 •,"I ordered a beef fillet ask to be done medium,..."
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,Nov 2023 •,"This is an attractive venue with welcoming, al..."
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,Mar 2023 •,Sadly I used the high TripAdvisor rating too ...
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,Nov 2022 •,From the start this meal was bad- especially g...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1502 entries, 0 to 1501
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country          1502 non-null   object
 1   Restaurant Name  1502 non-null   object
 2   Sentiment        1502 non-null   object
 3   Review Title     1502 non-null   object
 4   Review Date      1502 non-null   object
 5   Review           1502 non-null   object
dtypes: object(6)
memory usage: 70.5+ KB


# Cleaning text

## Remove punctuation + number

In [7]:
review = re.sub('[^a-zA-Z]', ' ', df['Review'][0])

In [8]:
review

'The manager became agressive when I said the carbonara was not good  She was rude  It was  a very bad experience  The worst in Rome in many years '

## Transform to lowercase

In [9]:
review = review.lower()

In [10]:
review

'the manager became agressive when i said the carbonara was not good  she was rude  it was  a very bad experience  the worst in rome in many years '

## Stopwords

In [11]:
review = review.split()

In [12]:
review

['the',
 'manager',
 'became',
 'agressive',
 'when',
 'i',
 'said',
 'the',
 'carbonara',
 'was',
 'not',
 'good',
 'she',
 'was',
 'rude',
 'it',
 'was',
 'a',
 'very',
 'bad',
 'experience',
 'the',
 'worst',
 'in',
 'rome',
 'in',
 'many',
 'years']

# Remove Stopwords + Stem

In [13]:
review = [ps.stem(word) for word in review if not word in stopwords.words ('english')]

In [14]:
review

['manag',
 'becam',
 'agress',
 'said',
 'carbonara',
 'good',
 'rude',
 'bad',
 'experi',
 'worst',
 'rome',
 'mani',
 'year']

## Join

In [15]:
review = ' '.join(review)

In [16]:
review

'manag becam agress said carbonara good rude bad experi worst rome mani year'

In [17]:
corpus = []
for i in range(0, 1502):
    review = re.sub('[^a-zA-Z]', ' ', df['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words ('english')]
    review = ' '.join(review)
    corpus.append(review)

In [18]:
corpus

['manag becam agress said carbonara good rude bad experi worst rome mani year',
 'order beef fillet ask done medium got well done cook dri told took steak minut brought anoth steak complet rare left steak ask charg fool price',
 'attract venu welcom albeit somewhat slow servic offer pleasingli present everyth tast though ingredi assembl without season love restaur except food',
 'sadli use high tripadvisor rate liter fortun eat number fine restaur know amaz buzz mouth fine freshli cook food great chef never eaten tast menu never ad hoc great ambienc attent friendli servic pair wine except mayb hit night chef everi singl cours food certainli freshli cook pass kitchen way bathroom notic plate precook food readi go tast menu plate odd dish disappoint would accept might palat like everi plate food hot bland lack spark probabl sit around l serv staff great wine fail finish plate offer tast menu look good valu simpl bowl hot freshli cook pasta would done better justic wine ask admit great en

# X & y

In [19]:
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:,2].values

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [21]:
print(X)
print(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['Negative' 'Negative' 'Negative' ... 'Negative' 'Negative' 'Negative']


# Naive Bayes

In [22]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [23]:
accuracy_score(y_test, y_pred)

0.8137472283813747

## One Hot Encoder

In [24]:
df['Sentiment'].describe()

count         1502
unique           2
top       Positive
freq          1237
Name: Sentiment, dtype: object

In [25]:
df['Sentiment'].value_counts()

Sentiment
Positive    1237
Negative     265
Name: count, dtype: int64

In [26]:
encoder = ce.OneHotEncoder(cols=["Sentiment"], use_cat_names=True)

In [27]:
train, Test = train_test_split(df, random_state=42)

In [28]:
train_encoded = encoder.fit_transform(train)
test_encoded = encoder.transform(Test)

In [29]:
test_encoded

Unnamed: 0,Country,Restaurant Name,Sentiment_Positive,Sentiment_Negative,Review Title,Review Date,Review
1118,Russia,Pelmenya,1,0,Very good mono choice,Aug 2015 •,"Relaxing, cozy. A waitress reserved a table fo..."
643,Italy,Ad Hoc Ristorante (Piazza del Popolo),1,0,Lovely dinner wonderful atmosphere!,Aug 2019 •,We had a wonderful dinner. They gave us a deli...
422,France,The Frog at Bercy Village,1,0,Great location,Feb 2016 •,The Frog in Bercy village is a reasonably pric...
413,France,The Frog at Bercy Village,1,0,Outstanding,Aug 2016 •,It was my second time in this restaurant and I...
451,France,The Frog at Bercy Village,1,0,The beer place,Jul 2015 •,Home brewed beer don't disappoint. They are al...
...,...,...,...,...,...,...,...
1304,Morroco,The LOFT,1,0,"Informal, fabulous food",Apr 2019 •,Spotted The Loft on TripAdvisor so decided to ...
1246,Morroco,The LOFT,1,0,Best meal in Essaouira!,Nov 2019 •,We ate here twice in a week and the food was d...
1464,Cuba,Old Square (Plaza Vieja),1,0,Beautiful City Square Surrounded by Classic Ar...,May 2014 •,This square is inside the pedestrian zone so n...
1058,Russia,Pelmenya,1,0,Leisure,Jun 2016 •,Tasty vareniky and pelmeni! Good location to F...


# TF-IDF

In [30]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)

tfidf_transformer.fit(X)

df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(), columns=["idf_weight"])

df_idf.sort_values(by=["idf_weight"])

Unnamed: 0,idf_weight
food,1.650809
servic,1.909990
good,2.122856
restaur,2.145608
great,2.151904
...,...
essauira,7.622071
porter,7.622071
essaourai,7.622071
popadom,7.622071


In [48]:
X = df.iloc[:, 1].values
y = df.iloc[:, 2].values

In [49]:
td = TfidfVectorizer(max_features=4500)
X = td.fit_transform(X).toarray()

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [54]:
mnb = MultinomialNB()
y_pred_mnb = mnb.fit(X_train, y_train).predict(X_test)

In [55]:
accuracy_score(y_test, y_pred_mnb)

0.8248337028824834

In [65]:
rdf = RandomForestClassifier(max_depth=2, random_state=10)
y_pred_rdf = rdf.fit(X_train, y_train).predict(X_test)

In [66]:
accuracy_score(y_test, y_pred_rdf)

0.8248337028824834

In [67]:
nbc = GaussianNB()
y_pred_rdf = nbc.fit(X_train, y_train).predict(X_test)

In [68]:
accuracy_score(y_test, y_pred_rdf)

0.4124168514412417

## BPE (Byte Pair Encoding)

In [94]:
def get_vocab(text):
    vocab=Counter(text.split())
    
    return {' '.join(word): freq for word, freq in vocab.items()}

In [71]:
def get_stats(vocab):
    pairs=defaultdict(int)
    for word, freq in vocab.items():
        symbols=word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i], symbols[i+1]]+=freq
    return pairs

In [73]:
def merge_vocab(pair, vocab):
    new_vocab={}
    bigram=' '.join(pair)
    replacement=''.join(pair)
    for word in vocab:
        new_word=word.replace(bigram, replacement)
        new_vocab[new_word]=vocab[word]
    return new_vocab

In [74]:
text = 'low lower newest widest'

In [79]:
vocab = get_vocab(text)
vocab = {' '.join(word): freq for word, freq in vocab.items()}

In [81]:
vocab

{'l o w': 1, 'l o w e r': 1, 'n e w e s t': 1, 'w i d e s t': 1}

In [82]:
num_merge = 10

In [125]:
def merge(vocab, num_merge = 2):
    print(vocab)
    for i in range(num_merge):
        pairs = get_stats(vocab)
        if not pairs:
            break
        best_pairs=max(pairs, key=pairs.get)
        vocab=merge_vocab(best_pairs, vocab)
        #print(f'After it {i+1}, Best pairs: {best_pairs}')
        #print("Update vocab: ", vocab )

In [126]:
df['Review']

0       The manager became agressive when I said the c...
1       I ordered a beef fillet ask to be done medium,...
2       This is an attractive venue with welcoming, al...
3       Sadly I  used the high TripAdvisor rating too ...
4       From the start this meal was bad- especially g...
                              ...                        
1497    Despite the other reviews saying that this is ...
1498    beer is good.  food is awfull  The only decent...
1499    for terrible service of a truly comedic level,...
1500    We visited the Havana's Club Museum which is l...
1501    Food and service was awful. Very pretty stop. ...
Name: Review, Length: 1502, dtype: object

In [127]:
df['vocab_freq'] = df['Review'].apply(get_vocab)

In [128]:
df['vocab_freq']

0       {'T h e': 2, 'm a n a g e r': 1, 'b e c a m e'...
1       {'I': 4, 'o r d e r e d': 1, 'a': 1, 'b e e f'...
2       {'T h i s': 1, 'i s': 1, 'a n': 1, 'a t t r a ...
3       {'S a d l y': 1, 'I': 6, 'u s e d': 1, 't h e'...
4       {'F r o m': 1, 't h e': 11, 's t a r t': 1, 't...
                              ...                        
1497    {'D e s p i t e': 2, 't h e': 7, 'o t h e r': ...
1498    {'b e e r': 1, 'i s': 2, 'g o o d .': 1, 'f o ...
1499    {'f o r': 1, 't e r r i b l e': 1, 's e r v i ...
1500    {'W e': 1, 'v i s i t e d': 1, 't h e': 2, 'H ...
1501    {'F o o d': 1, 'a n d': 1, 's e r v i c e': 1,...
Name: vocab_freq, Length: 1502, dtype: object

In [129]:
df['vocab_freq_m'] = df['vocab_freq'].apply(merge)

{'T h e': 2, 'm a n a g e r': 1, 'b e c a m e': 1, 'a g r e s s i v e': 1, 'w h e n': 1, 'I': 1, 's a i d': 1, 't h e': 1, 'c a r b o n a r a': 1, 'w a s': 3, 'n o t': 1, 'g o o d .': 1, 'S h e': 1, 'r u d e .': 1, 'I t': 1, 'a': 1, 'v e r y': 1, 'b a d': 1, 'e x p e r i e n c e .': 1, 'w o r s t': 1, 'i n': 2, 'R o m e': 1, 'm a n y': 1, 'y e a r s .': 1}
{'I': 4, 'o r d e r e d': 1, 'a': 1, 'b e e f': 1, 'f i l l e t': 1, 'a s k': 1, 't o': 1, 'b e': 1, 'd o n e': 2, 'm e d i u m ,': 1, 'g o t': 1, 'i t': 2, 'w e l l': 1, 'c o o k e d': 1, 'a n d': 3, 'v e r y': 1, 'd r y ,': 1, 'w h e n': 1, 't o l d': 1, 't h e m': 1, 't h e y': 3, 't o o k': 1, 't h e': 3, 's t e a k': 3, 'a f t e r': 1, '2 0': 1, 'm i n u t e s': 1, 'b r o u g h t': 1, 'm e': 3, 'a n o t h e r': 1, 't h a t': 1, 'w a s': 1, 'c o m p l e t e l y': 1, 'r a r e ,': 1, 'l e f t': 1, 'a l l': 1, ',': 1, 'a s k e d': 1, 'a b o u t': 1, 'b u t': 1, 'c h a r g e d': 1, 'f o o l': 1, 'p r i c e .': 1}
{'T h i s': 1, 'i s'

In [130]:
df.head()

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review,vocab_freq,merged_vocab,vocab_freq_m
0,France,The Frog at Bercy Village,Negative,Rude manager,May 2024 •,The manager became agressive when I said the c...,"{'T h e': 2, 'm a n a g e r': 1, 'b e c a m e'...","{'The': 2, 'manager': 1, 'became': 1, 'agressi...",
1,France,The Frog at Bercy Village,Negative,A big disappointment,Feb 2024 •,"I ordered a beef fillet ask to be done medium,...","{'I': 4, 'o r d e r e d': 1, 'a': 1, 'b e e f'...","{'I': 4, 'ordered': 1, 'a': 1, 'beef': 1, 'fil...",
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,Nov 2023 •,"This is an attractive venue with welcoming, al...","{'T h i s': 1, 'i s': 1, 'a n': 1, 'a t t r a ...","{'This': 1, 'is': 1, 'an': 1, 'attractive': 1,...",
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,Mar 2023 •,Sadly I used the high TripAdvisor rating too ...,"{'S a d l y': 1, 'I': 6, 'u s e d': 1, 't h e'...","{'Sadly': 1, 'I': 6, 'used': 1, 'the': 13, 'hi...",
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,Nov 2022 •,From the start this meal was bad- especially g...,"{'F r o m': 1, 't h e': 11, 's t a r t': 1, 't...","{'From': 1, 'the': 11, 'start': 1, 'this': 2, ...",
