In [85]:
import pandas as pd
import numpy as np
import string
from unidecode import unidecode
import re
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import models
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
import string
import os
import cloudpickle
import sys
os.chdir("..")
sys.path.insert(0,os.path.abspath(os.path.join(".")))
from preprocessing.text_preprocessing import TextPreprocessor

In [2]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
def get_train_data():
    # amazon_train = pd.read_csv(os.path.join("..", "data", "amazon_train.csv"), header=None, names=["rating", 'st', "text"])
    # amazon_train.drop(columns=['st'], inplace=True)
    # amazon_train.dropna(inplace=True)
    # x1, x2 = train_test_split(amazon_train, test_size=0.05, stratify=amazon_train['rating'], random_state=42)
    # yelp_train = pd.read_csv(os.path.join("..", "data", "yelp_train.csv"), header=None, names=["rating", "text"])
    # train = pd.concat([yelp_train, x2], axis=0, ignore_index=True)
    # train.to_parquet(os.path.join("..", "data", "yelp_amazon_train.parquet"))
    train = pd.read_parquet(os.path.join(".", "data", "yelp_amazon_train.parquet"))
    return train

In [4]:
data = get_train_data()

In [5]:
data.head()

Unnamed: 0,rating,text
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [6]:
data.tail()

Unnamed: 0,rating,text
739995,1,I bought a similar portable DVD player from Po...
739996,2,This is a great book for people wanting to get...
739997,2,"Here I Am Dionne Warwick, Scepter Label origin..."
739998,2,the book arrived in the condition described by...
739999,1,"The product seems to be fine, but I ordered th..."


In [7]:
data.shape

(740000, 2)

In [8]:
data.dtypes

rating     int64
text      object
dtype: object

In [9]:
x = data['text']
y = data['rating']

<h2>Text Preprocessing

In [None]:
# tp = TextPreprocessor()

In [None]:
# x_preprocessed = tp.preprocess(x)

In [None]:
# save preprocessed data for future use
# pd.DataFrame({'rating': y, 'text': x_preprocessed}).to_parquet(os.path.join(".", "data", "x_train_preprocessed_no_lemmatize.parquet"))

In [11]:
train_preprocessed = pd.read_parquet("./data/x_train_preprocessed_no_lemmatize.parquet")

In [12]:
train_preprocessed

Unnamed: 0,rating,text
0,1,unfortunately frustration dr goldberg patient ...
1,2,going dr goldberg over years think one st pati...
2,1,not know dr goldberg like moving arizona but l...
3,1,writing review give heads up see doctor office...
4,2,food great but best thing wings wings simply f...
...,...,...
739995,1,bought similar portable dvd player polaroid am...
739996,2,great book people wanting get programming wind...
739997,2,dionne warwick scepter label originally releas...
739998,2,book arrived condition described dealer deligh...


In [13]:
[*data['text'][:5]]

["Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.",
 "Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doe

In [14]:
[*train_preprocessed['text'][:5]]

['unfortunately frustration dr goldberg patient repeat experience many doctors nyc good doctor terrible staff seems staff simply never answers phone usually takes hours repeated calling get answer time wants deal run problem many doctors not get office workers patients medical needs not anyone answering phone incomprehensible not work aggravation regret feel give dr goldberg stars',
 'going dr goldberg over years think one st patients started mhmg great over years really big picture not former gyn dr markoff found fibroids explores options very patient understanding not judge asks right questions very thorough wants kept loop every aspect medical health life',
 'not know dr goldberg like moving arizona but let tell stay away doctor office going dr johnson left goldberg took over johnson left not caring doctor interested co pay come medication refills every month not give refills could less patients financial situations trying get days mail away pharmacy prescriptions guy joke make matt

In [15]:
x_train_preprocessed = train_preprocessed['text'].copy()
y_train_preprocessed = train_preprocessed['rating'].copy()
x_train_preprocessed.shape, y_train_preprocessed.shape

((740000,), (740000,))

In [16]:
del(data)
del(train_preprocessed)

<h2> Analyze no. of words/tokens in texts

In [17]:
n_words_x_train = pd.Series([len(t.split(" ")) for t in x_train_preprocessed])

In [18]:
n_words_x_train.describe()

count    740000.000000
mean         63.651431
std          58.782984
min           1.000000
25%          26.000000
50%          47.000000
75%          80.000000
max         669.000000
dtype: float64

In [19]:
n_words_x_train.quantile(0.05)

11.0

In [20]:
n_words_x_train.quantile(0.9)

132.0

In [21]:
y_train_preprocessed.value_counts()

1    370000
2    370000
Name: rating, dtype: int64

In [22]:
x_train_preprocessed[:10]

0    unfortunately frustration dr goldberg patient ...
1    going dr goldberg over years think one st pati...
2    not know dr goldberg like moving arizona but l...
3    writing review give heads up see doctor office...
4    food great but best thing wings wings simply f...
5    wing sauce like water pretty much lot butter s...
6    owning driving range inside city limits like l...
7    place absolute garbage half tees not available...
8    finally made over range heard thing most peopl...
9    drove yesterday get sneak peak re opens july t...
Name: text, dtype: object

In [23]:
y_train_preprocessed[:10]

0    1
1    2
2    1
3    1
4    2
5    1
6    1
7    1
8    2
9    2
Name: rating, dtype: int64

In [24]:
tokenizer = Tokenizer()

In [25]:
tokenizer.fit_on_texts(x_train_preprocessed)

In [26]:
word_index = pd.DataFrame(tokenizer.word_index, index=[0]).T.reset_index()
word_index.columns = ['word', 'index']

In [27]:
word_count = pd.DataFrame(tokenizer.word_counts, index=[0]).T.reset_index()
word_count.columns = ['word', 'count']

In [28]:
word_data = word_index.merge(right=word_count, on="word", how="inner")

In [29]:
word_data.sort_values(by="count", ascending=False, inplace=True)

In [30]:
word_data.loc[word_data['count'] > 3000]

Unnamed: 0,word,index,count
0,not,1,1181685
1,but,2,680024
2,good,3,345376
3,food,4,337372
4,place,5,327489
...,...,...,...
2127,mon,2128,3018
2128,fight,2129,3013
2129,hollywood,2130,3009
2130,chunks,2131,3007


In [31]:
considered_words = word_data.loc[word_data['count'] > 3000, 'word'] # considering words with count > 3000 ~ 0.5% of doc count
considered_words = [*considered_words]
considered_words[:15]

['not',
 'but',
 'good',
 'food',
 'place',
 'would',
 'like',
 'one',
 'very',
 'get',
 'great',
 'up',
 'time',
 'no',
 'service']

In [32]:
x_train_preprocessed

0         unfortunately frustration dr goldberg patient ...
1         going dr goldberg over years think one st pati...
2         not know dr goldberg like moving arizona but l...
3         writing review give heads up see doctor office...
4         food great but best thing wings wings simply f...
                                ...                        
739995    bought similar portable dvd player polaroid am...
739996    great book people wanting get programming wind...
739997    dionne warwick scepter label originally releas...
739998    book arrived condition described dealer deligh...
739999    product seems fine but ordered wrong thing wan...
Name: text, Length: 740000, dtype: object

<h2>Finding frequency of considered words in each doc

In [33]:
# num_considered_words = []
# for i in tqdm(range(len(x_train_preprocessed))):
#     cnt = 0
#     for j in x_train_preprocessed[i].split():
#         if j in considered_words:
#             cnt += 1
#     num_considered_words.append(cnt)

100%|████████████████████████████████████████████████████████████████████████| 740000/740000 [09:18<00:00, 1324.72it/s]


In [34]:
# num_considered_words = np.array(num_considered_words)

In [36]:
# print(len(np.where(num_considered_words==0)[0])) # 254 has 0 words
# print(len(np.where(num_considered_words<=10)[0])) # 58988 has 10 or less words

234
58988


In [37]:
# idx_having_considered_words = np.where(num_considered_words>0)[0]

In [38]:
# len(idx_having_considered_words) # no of docs having atleast 1 considered word

739766

In [39]:
# idx_not_having_considered_words = np.where(num_considered_words == 0)[0]
# len(idx_not_having_considered_words) # no of docs without atleast 1 consdered word

234

In [40]:
# [*x_train_preprocessed[idx_not_having_considered_words]] # docs without atleast 1 consdered word

['hoofah',
 'ewwww',
 'iga',
 'consumer alert rats',
 'exquisite',
 'booooo',
 'cluster',
 'eeeeeeewwwwwwwwwwwwwwwwww',
 'eh',
 'tor ti llas delicioso andale mi amgos',
 'trashy rag besmirches heroes crossed ad revenue plummets',
 'ihana paikka hieman phoenixin keskustan ulkopuolella farm tarjoaa freesej herkullisia salaatteja sek maukkaita sandwicheja ja tietenkin laajan valikoiman erilaisia lkiruokaherkkuja ainekset ovat tuoreita suoraan omalta tilalta tuotuja palvelu yst llist ja tunnelma todella viihtyis ruokailun lkeen voi viett hetken vellen pekaanipuulehdossa paikassa voi rjest vaikka juhlia pime ll paikkaa valaisevat pekaanipuiden leille riputetut sadat hehkulamput',
 'stunning foliage bitey',
 'eh',
 'deliciousness',
 'meth',
 'yumm',
 'fond memories',
 'bomb',
 'deleted',
 'wal mart',
 'df ea ef fd ea',
 'ew',
 'grosss',
 'bros',
 'nieces blast sever',
 'quoth theyerg nevermore',
 'vvvvvvvvvveeeeeeeerrrrrryyyyyyyyyyyyy sssssssssssslllllllllllllllllooooooooooooooooowwwwwwwwwww

In [41]:
# saving considered words array for future use
# with open("idx_considered_words.npy", "wb") as f:
#     cloudpickle.dump(idx_having_considered_words, f)

with open("idx_considered_words.npy", "rb") as f:
    idx_having_considered_words = cloudpickle.load(f)    

In [42]:
idx_having_considered_words

array([     0,      1,      2, ..., 739997, 739998, 739999], dtype=int64)

In [43]:
x_train_preprocessed = x_train_preprocessed[idx_having_considered_words].copy() # removing docs with no considered words
len(x_train_preprocessed)

739766

In [44]:
y_train_preprocessed = y_train_preprocessed[idx_having_considered_words].copy()
len(y_train_preprocessed)

739766

In [45]:
y_train_preprocessed

0         1
1         2
2         1
3         1
4         2
         ..
739995    1
739996    2
739997    2
739998    2
739999    1
Name: rating, Length: 739766, dtype: int64

In [46]:
y_train_preprocessed[y_train_preprocessed == 1] = 0
y_train_preprocessed[y_train_preprocessed == 2] = 1

In [47]:
y_train_preprocessed

0         0
1         1
2         0
3         0
4         1
         ..
739995    0
739996    1
739997    1
739998    1
739999    0
Name: rating, Length: 739766, dtype: int64

In [48]:
y_train_preprocessed.value_counts()

1    369891
0    369875
Name: rating, dtype: int64

In [49]:
x_train_preprocessed

0         unfortunately frustration dr goldberg patient ...
1         going dr goldberg over years think one st pati...
2         not know dr goldberg like moving arizona but l...
3         writing review give heads up see doctor office...
4         food great but best thing wings wings simply f...
                                ...                        
739995    bought similar portable dvd player polaroid am...
739996    great book people wanting get programming wind...
739997    dionne warwick scepter label originally releas...
739998    book arrived condition described dealer deligh...
739999    product seems fine but ordered wrong thing wan...
Name: text, Length: 739766, dtype: object

<h2>Tokenizing, text to seq and padding

In [50]:
tokenizer = Tokenizer(num_words=2300) # considering only top 2,300 freq words based on above analysis

In [51]:
tokenizer.fit_on_texts(x_train_preprocessed)

In [52]:
vocab_size = 2300 + 1
vocab_size

2301

In [53]:
x_train_seq = tokenizer.texts_to_sequences(x_train_preprocessed)

In [54]:
x_train_seq[0]

[501,
 779,
 1592,
 1976,
 71,
 88,
 3,
 1229,
 346,
 67,
 345,
 67,
 576,
 28,
 335,
 295,
 613,
 254,
 1349,
 10,
 1102,
 13,
 1263,
 320,
 443,
 284,
 88,
 1,
 10,
 539,
 1321,
 486,
 1,
 269,
 335,
 1,
 98,
 1921,
 149,
 80,
 779,
 151]

In [55]:
maxlen = 130 # constraining token/word lenght to 130 as 90% of sents have token count <= 130 
maxlen

130

In [56]:
PADDING="post"
TRUNCATING="post"
x_train_seq = pad_sequences(x_train_seq, maxlen=maxlen, padding=PADDING, truncating=TRUNCATING)

In [57]:
x_train_seq[0]

array([ 501,  779, 1592, 1976,   71,   88,    3, 1229,  346,   67,  345,
         67,  576,   28,  335,  295,  613,  254, 1349,   10, 1102,   13,
       1263,  320,  443,  284,   88,    1,   10,  539, 1321,  486,    1,
        269,  335,    1,   98, 1921,  149,   80,  779,  151,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

In [58]:
np.max([len(s) for s in x_train_seq]) # shud be = maxlen

130

In [59]:
x_train, x_val, y_train, y_val = train_test_split(x_train_seq, y_train_preprocessed, test_size=0.15, random_state=42, stratify=y_train_preprocessed)

In [60]:
del(x_train_seq, y, y_train_preprocessed)

In [61]:
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((628801, 130), (628801,), (110965, 130), (110965,))

In [62]:
y_train.dtypes, y_val.dtypes

(dtype('int64'), dtype('int64'))

In [63]:
y_train

241589    0
541337    0
3767      0
129678    1
668189    0
         ..
110932    1
321118    0
366425    0
711287    1
123475    0
Name: rating, Length: 628801, dtype: int64

In [64]:
y_train.value_counts()

1    314407
0    314394
Name: rating, dtype: int64

In [65]:
y_train = to_categorical(y_train)

In [66]:
y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [67]:
y_train.shape

(628801, 2)

In [68]:
y_val

461872    0
128920    0
287387    1
337729    0
127361    0
         ..
303977    1
482958    1
702863    1
92446     1
166541    1
Name: rating, Length: 110965, dtype: int64

In [69]:
y_val.value_counts()

1    55484
0    55481
Name: rating, dtype: int64

In [70]:
y_val = to_categorical(y_val)
y_val.shape

(110965, 2)

<h2>Model Training

In [72]:
early_stop = EarlyStopping(monitor="val_loss", patience=5)

In [73]:
# model with best hyperparams
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=80, input_length=maxlen))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 80)           21541280  
                                                                 
 lstm (LSTM)                 (None, 50)                26200     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 100)               5100      
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 21,572,782
Trainable params: 21,572,782
No

In [74]:
model.fit(x_train, y_train, epochs=200, verbose=2, validation_data=(x_val, y_val), callbacks=[early_stop], batch_size=256)

Epoch 1/200
2457/2457 - 128s - loss: 0.5805 - binary_accuracy: 0.7171 - val_loss: 0.5272 - val_binary_accuracy: 0.7367 - 128s/epoch - 52ms/step
Epoch 2/200
2457/2457 - 102s - loss: 0.5122 - binary_accuracy: 0.7763 - val_loss: 0.6798 - val_binary_accuracy: 0.5018 - 102s/epoch - 42ms/step
Epoch 3/200
2457/2457 - 105s - loss: 0.3386 - binary_accuracy: 0.8641 - val_loss: 0.2472 - val_binary_accuracy: 0.8953 - 105s/epoch - 43ms/step
Epoch 4/200
2457/2457 - 108s - loss: 0.2280 - binary_accuracy: 0.9074 - val_loss: 0.2097 - val_binary_accuracy: 0.9142 - 108s/epoch - 44ms/step
Epoch 5/200
2457/2457 - 109s - loss: 0.2007 - binary_accuracy: 0.9186 - val_loss: 0.1959 - val_binary_accuracy: 0.9196 - 109s/epoch - 44ms/step
Epoch 6/200
2457/2457 - 139s - loss: 0.1856 - binary_accuracy: 0.9252 - val_loss: 0.1859 - val_binary_accuracy: 0.9233 - 139s/epoch - 56ms/step
Epoch 7/200
2457/2457 - 154s - loss: 0.1751 - binary_accuracy: 0.9298 - val_loss: 0.1820 - val_binary_accuracy: 0.9251 - 154s/epoch - 62

<keras.callbacks.History at 0x27d5fa24b88>

In [95]:
model.save(os.path.join("models", "review_sentiment_model.h5"), include_optimizer=False)



In [96]:
with open(os.path.join("models", "preprocessors.bin"), "wb") as f:
    cloudpickle.dump((tokenizer, maxlen, PADDING, TRUNCATING), f)

In [114]:
model = models.load_model(os.path.join("models", "review_sentiment_model.h5"), compile=False)
with open(os.path.join("models", "preprocessors.bin"), "rb") as f:
    tokenizer, maxlen, padding, truncating = cloudpickle.load(f)
    
def model_inference(input_str: str):
    """
    Returns model prediction
    """
    text_ser = pd.Series(input_str)
    tp = TextPreprocessor()
    preprocessed_str = tp.preprocess(text_ser, dataset="test")
    txt_seq = tokenizer.texts_to_sequences(preprocessed_str)
    txt_seq = pad_sequences(txt_seq, maxlen=maxlen, padding=padding, truncating=truncating)
    pred = model.predict(txt_seq)[0].astype("str")
    response = {"result": ["negative", "positive"], "confidence": [*pred]}
    return response

In [115]:
model_inference("doesn't register a click sometimes which doesnt help in an fps game")



{'result': ['negative', 'positive'],
 'confidence': ['0.95128065', '0.048719294']}

In [116]:
model_inference("Very nice working ps4 100% working thanks amozon so much")



{'result': ['negative', 'positive'],
 'confidence': ['0.023427645', '0.97657233']}

In [117]:
model_inference("It works well")



{'result': ['negative', 'positive'], 'confidence': ['0.040667', '0.95933294']}

In [118]:
model_inference("It's been only 7days and I am already facing the problems with this mouse pad.")



{'result': ['negative', 'positive'], 'confidence': ['0.812869', '0.18713103']}