In [40]:
import pandas as pd

In [41]:
df = pd.read_csv('reviews.csv') 
df.head()

Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25T21:44:00Z,2000-01-25T21:44:00Z
1,7,4384,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",2001-10-17T16:49:59Z,2001-10-17T16:49:59Z
2,9,4523,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...,2000-02-25T09:00:00Z,2000-02-25T09:00:00Z
3,13,7435,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...,2000-03-13T21:15:00Z,2000-03-13T21:15:00Z
4,14,44,2085,Tony Small,5,An excellent dish.,2000-03-28T12:51:00Z,2000-03-28T12:51:00Z


In [42]:
df.isnull().sum()

ReviewId           0
RecipeId           0
AuthorId           0
AuthorName         0
Rating             0
Review           214
DateSubmitted      0
DateModified       0
dtype: int64

In [43]:
df = df.dropna(subset=['Review'])
df.isnull().sum()

ReviewId         0
RecipeId         0
AuthorId         0
AuthorName       0
Rating           0
Review           0
DateSubmitted    0
DateModified     0
dtype: int64

In [44]:
df['cleaned_reviews'] = df['Review'].str.lower()

In [45]:
df['cleaned_reviews'].head()

0         better than any you can get at a restaurant!
1    i cut back on the mayo, and made up the differ...
2    i think i did something wrong because i could ...
3    easily the best i have ever had.  juicy flavor...
4                                   an excellent dish.
Name: cleaned_reviews, dtype: object

In [46]:
df['cleaned_reviews'] = df['cleaned_reviews'].str.replace('[^\w\s]','') 
df['cleaned_reviews'].head() 


  df['cleaned_reviews'] = df['cleaned_reviews'].str.replace('[^\w\s]','')


0          better than any you can get at a restaurant
1    i cut back on the mayo and made up the differe...
2    i think i did something wrong because i could ...
3    easily the best i have ever had  juicy flavorf...
4                                    an excellent dish
Name: cleaned_reviews, dtype: object

In [47]:
import nltk
from nltk.corpus import stopwords 

In [48]:
# nltk.download('stopwords')

In [49]:
stop_words = stopwords.words('english')

In [50]:
def remove_sw(text):
    filtered_words = []
    for x in text: 
        if x not in stop_words:
            filtered_words.append(x)
    
    return filtered_words 

In [51]:
df['cleaned_reviews'] = df['cleaned_reviews'].apply(lambda x: x.split(" "))
df['cleaned_reviews'] = df['cleaned_reviews'].apply(lambda x: remove_sw(x))

In [52]:
df['cleaned_reviews'].head()

0                            [better, get, restaurant]
1    [cut, back, mayo, made, difference, sour, crea...
2    [think, something, wrong, could, taste, cornst...
3    [easily, best, ever, , juicy, flavorful, dry, ...
4                                    [excellent, dish]
Name: cleaned_reviews, dtype: object

In [53]:
import numpy as np 

df['is_positive'] = np.where(df['Rating']<3, 0, 1) 

df[['cleaned_reviews','is_positive']]

Unnamed: 0,cleaned_reviews,is_positive
0,"[better, get, restaurant]",1
1,"[cut, back, mayo, made, difference, sour, crea...",1
2,"[think, something, wrong, could, taste, cornst...",0
3,"[easily, best, ever, , juicy, flavorful, dry, ...",1
4,"[excellent, dish]",1
...,...,...
1401977,"[disappointed, couldnt, wait, make, husband, b...",0
1401978,"[nothing, drain, dont, heat, liquids, put, mil...",1
1401979,"[good, base, recipe, someone, start, quadruple...",1
1401980,"[thank, much, amazing, recipe, lived, kenai, s...",1


In [54]:
df['cleaned_reviews'] = df['cleaned_reviews'].apply(lambda x: list(filter(None,x)))
        

max_word = max(df.cleaned_reviews, key=len) 
len(max_word)

# import urllib.request 

# url = 'https://nlp.stanford.edu/data/glove.6B.zip'
# filename = 'glove.6b.zip'
# urllib.request.urlretrieve(url,filename)

661

In [55]:
df = df[df.cleaned_reviews.str.len() < 100]

max_word = max(df.cleaned_reviews, key=len) 
len(max_word)

# import zipfile 

# with zipfile.ZipFile('glove.6b.zip', 'r') as zip: 
#     zip.extractall()

99

In [56]:
word_vectors = dict()

def add_wordvector(dict, filename):
    with open(filename, 'r', encoding='utf8') as f:
        for line in f.readlines():
            line = line.split(' ')

            try:
                if line[0] not in stop_words:
                    dict[line[0]] = np.array(line[1:], dtype=float)
            except:
                continue 


add_wordvector(word_vectors, 'glove.6B.50d.txt')
len(stop_words)

179

In [57]:
len(word_vectors)

399851

In [58]:
from nltk.stem import WordNetLemmatizer

# nltk.download('wordnet')
# nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

In [59]:
def lemmatize_token(text):
    lemmatized_token = [lemmatizer.lemmatize(x) for x in text] 
    useful_token = [x for x in lemmatized_token if x in word_vectors]

    return useful_token 

word_token = df['cleaned_reviews'][0] 

lemmatize_token(word_token)


['better', 'get', 'restaurant']

In [60]:
def text_to_vector(text, token_dict=word_vectors):
    processed_token = lemmatize_token(text)

    vectors = []

    for x in processed_token:
        if x not in word_vectors:
            continue 
        
        vectors.append(token_dict[x]) 

    return np.array(vectors, dtype=float) 
    

In [61]:
def vectorize_review(df):
    label = df['is_positive'].to_numpy().astype(int)

    len_wordvector = []

    for text in df['cleaned_reviews']:
        text_as_vector = text_to_vector(text) 

        if text_as_vector.shape[0] == 0:
            text_as_vector = np.zeros(shape=(1,50)) 

        len_wordvector.append(text_as_vector)

    return len_wordvector, label

In [62]:
train_df = df.sample(frac=.36, random_state=1)
train_df.reset_index(drop=True, inplace=True) 

split_index_1 = int(len(train_df) * 0.7)
split_index_2 = int(len(train_df) * 0.85)

train_df, val_df, test_df = train_df[:split_index_1], train_df[split_index_1:split_index_2], train_df[split_index_2:]

len(train_df), len(val_df), len(test_df)

(350481, 75103, 75104)

In [63]:
x_train, y_train = vectorize_review(train_df) 

len(x_train), len(x_train[0]), len(x_train[1])

(350481, 18, 14)

In [64]:
token_len = []

for i in range (len(x_train)):
    token_len.append(len(x_train[i])) 

pd.Series(token_len).describe()


count    350481.000000
mean         25.146704
std          16.205611
min           1.000000
25%          13.000000
50%          22.000000
75%          33.000000
max          99.000000
dtype: float64

In [65]:
# from copy import deepcopy 

# def zero_padding(x, max_token_len=605):
#     x_copy = deepcopy(x)

#     for i, j in enumerate(x):
#         x_token_len = j.shape[0]
#         token_len_diff = max_token_len - x_token_len 

#         pad = np.zeros(shape=(token_len_diff, 50))

#         x_copy[i] = np.concatenate([j, pad]) 

#     return np.array(x_copy).astype(float)

In [66]:
# x_train = zero_padding(x_train)

# x_train.shape

In [67]:
import tensorflow as tf

In [68]:
x_train = tf.keras.utils.pad_sequences(x_train, maxlen=100, dtype='float32', padding='post') 

x_train.shape


# tf.keras.utils.pad_sequences([[1, 2, 3], [3, 4, 5, 6], [7, 8]], dtype='float32', padding='post')

(350481, 100, 50)

In [69]:
len(x_train[0]), len(x_train[1])

(100, 100)

In [70]:
y_train.shape

(350481,)

In [71]:
x_val, y_val = vectorize_review(val_df)
x_val = tf.keras.utils.pad_sequences(x_val, maxlen=100, dtype='float32', padding='post')

x_val.shape, y_val.shape

((75103, 100, 50), (75103,))

In [72]:
x_test, y_test = vectorize_review(test_df)
x_test = tf.keras.utils.pad_sequences(x_test, maxlen=100, dtype='float32', padding='post')

x_test.shape, y_test.shape

((75104, 100, 50), (75104,))

In [73]:
from tensorflow.keras import layers 
from tensorflow.keras.models import Sequential 

model = Sequential([])

model.add(layers.Input(shape=(100,50))) 
model.add(layers.LSTM(460, return_sequences=True))
model.add(layers.Dropout(0.2))
# model.add(layers.LSTM(64, return_sequences=True))
# model.add(layers.Dropout(0.2))
# model.add(layers.LSTM(64, return_sequences=True))
# model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid'))

In [74]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 100, 460)          940240    
                                                                 
 dropout_3 (Dropout)         (None, 100, 460)          0         
                                                                 
 flatten_1 (Flatten)         (None, 46000)             0         
                                                                 
 dense_1 (Dense)             (None, 1)                 46001     
                                                                 
Total params: 986,241
Trainable params: 986,241
Non-trainable params: 0
_________________________________________________________________


In [75]:
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.metrics import AUC 
from tensorflow.keras.callbacks import ModelCheckpoint 

model_cp = ModelCheckpoint('model/', save_best_only=True)

model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(), metrics=['accuracy', AUC(name='auc')])


In [76]:
class_freq = pd.value_counts(train_df['is_positive']) 
class_freq

1    322925
0     27556
Name: is_positive, dtype: int64

In [77]:
weights = {0: class_freq.sum() / (class_freq[0] * 2), 1: class_freq.sum() / (class_freq[1] * 2)} 

weights 

{0: 6.359431702714472, 1: 0.5426662537740962}

In [78]:
model.fit(x_train, y_train, validation_data=(x_val,y_val), epochs=15, callbacks=[model_cp], class_weight=weights)

Epoch 1/15



INFO:tensorflow:Assets written to: model/assets


INFO:tensorflow:Assets written to: model/assets


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15

KeyboardInterrupt: 

In [79]:
from tensorflow.keras.models import load_model 

best_model = load_model('model/')

In [81]:
test_predictions = (best_model.predict(x_test) > 0.5).astype(int)

from sklearn.metrics import classification_report

print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.28      0.56      0.37      5871
           1       0.96      0.88      0.92     69233

    accuracy                           0.85     75104
   macro avg       0.62      0.72      0.65     75104
weighted avg       0.91      0.85      0.87     75104

