# Sentiment Analysis with Recurrent Neural Networks (RNN) 


###Terms:

* Sentiment: 📑 Reading a body of text > determine if its positive or negative (1 positive - 0 negative) ; 0.5 factual 
* NLP : Natural Language Processing 


### Import Data and Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import torch


In [2]:
df = pd.read_csv('cleaned_apparel_dataset.csv')
df_reviews = pd.DataFrame()
df_reviews = df[['review_body','star_rating']]
df_reviews.rename(columns = {'review_body':'feature','star_rating':'label'},inplace = True)
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49974 entries, 0 to 49973
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   feature  49974 non-null  object
 1   label    49974 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.0+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [3]:
# Standarize label to 0>1 instead of 1>5
df_reviews_normalized = df_reviews.copy()
df_reviews_normalized['label'] = (df_reviews_normalized['label'] - df_reviews_normalized['label'].min()) / (df_reviews_normalized['label'].max() - df_reviews_normalized['label'].min())    
print(df_reviews_normalized['label'].unique())
# print(df_reviews)

[0.75 1.   0.5  0.25 0.  ]


In [4]:
# Extract from 'reviewsData' the train and test datasets | they are cleaned 
# trainData = reviewsData['train']
# testData = reviewsData['test']

trainData_f, testData_f, trainData_l, testData_l = train_test_split( df_reviews_normalized['feature'], df_reviews_normalized['label'], test_size=0.4, random_state=42)


In [5]:
trainData = pd.DataFrame()
trainData = pd.concat([trainData_f, trainData_l], axis=1)
# trainData
testData = pd.concat([testData_f, testData_l], axis=1)
# testData

In [6]:
# See a glance of the tensor dataset
# for feature, label in trainData.take(5):
#   print('feature (text):',feature.numpy()) 
#   print('label (binary):',label.numpy())

In [7]:
# bufferSize the maximum number of tokens (words) in any text feature
bufferSize = 1000

# batchSize is the number of samples to propogate through the network
batchSize = 64


In [8]:
# DF to Tensor Dataset
trainDataTs = tf.data.Dataset.from_tensor_slices((trainData['feature'],trainData['label']))
testDataTs = tf.data.Dataset.from_tensor_slices((testData['feature'],testData['label']))
# testDataTs = tf.data.Dataset.from_tensor_slices((testData))
# trainDataTs = tf.convert_to_tensor(trainData)

# creating tensor from targets_df 
# trainDataTs = torch.tensor(trainData)

# printing out result
print(trainDataTs)

<TensorSliceDataset shapes: ((), ()), types: (tf.string, tf.float64)>


In [9]:
# trainDataTs = trainData['feature']
for feature, label in trainDataTs.take(10):
  print('feature (text):',feature) 
  print('label (binary):',label)



feature (text): tf.Tensor(b'five stars funny', shape=(), dtype=string)
label (binary): tf.Tensor(1.0, shape=(), dtype=float64)
feature (text): tf.Tensor(b'love love love my hat absolutely love love the hat looks exactly like the picture and very good quality im very happy with my purchase', shape=(), dtype=string)
label (binary): tf.Tensor(1.0, shape=(), dtype=float64)
feature (text): tf.Tensor(b'sexy beautiful waist cincher and comfortable i ordered a xl  it fit me perfectly and i have noticed that after  weeks of use i have to now move the hooks to the second row first of all i love the pink color on the waist trainer with the black lace over its very sexy in my opinioni have used many different waist trainers so i can honestly say that this one is a pretty good waist trainer i love that the hooks and eyelets are sewn in very well in the past i have had issues with flimsy waist trainers where the hooks fall off this is not the case with this waist trainer it is very comfortable to we

In [10]:
# .shuffle() will randomize the dataset
# .batch() set the batch size of sampling
# .prefetch() will cashe the data while an existing batch is being propagates
trainDataTs = trainDataTs.shuffle( buffer_size= bufferSize ).batch(batchSize).prefetch(tf.data.AUTOTUNE)

testDataTs = testDataTs.batch (batchSize).prefetch(tf.data.AUTOTUNE)

### Text Preprocessing

In [11]:
# Vocabulary is the total number of unique words in the collection
# each review > document
# entire reviews > collection

vocabSize = 10000 #if we really dum down the texts we can go down to 1000

# vector is a list for mathmaticians
# Tokenization: splits the words into a vector of tokens ~ words
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=vocabSize)

# Text encoding
# Use the .map() method extract only the features (reviews) and exclude the label (sentiment)
featureTokens = trainDataTs.map(lambda feature,label: feature)
trainDataTs
encoder.adapt(featureTokens)

In [12]:
# Preview some of the terms before encoding
feature.numpy()[0:50]

b'footsie _ i love this pinksee ladies footies very '

In [13]:
# Preview some of the terms after encoding
encoder(feature)[0:50].numpy()

array([   1,    3,   21,    9, 3918,  847, 3353,   16,  220,    8,   43,
          2,   55,   36,  377,    7,  661,  265,   75,   14,  509,   55,
          9,    7,   44,   24,   13,   41,  261,    7,    9, 3918,  847,
       3353,    7,  459,   22,   97,   65,   11, 1995,   29,  233,   24,
       2052,    8,   50,  122,   45,    8])

In [14]:
# Extract the vocabulay as Numpy array
# and preview of the vocabulary words
vocab = np.array(encoder.get_vocabulary()) # get distinct vocab used in the whole collection
vocab[0:100]


array(['', '[UNK]', 'the', 'i', 'and', 'a', 'it', 'is', 'to', 'this',
       'for', 'my', 'in', 'but', 'of', 'was', 'very', 'not', 'stars',
       'great', 'on', 'love', 'so', 'fit', 'size', 'with', 'that', 'like',
       'as', 'are', 'dress', 'you', 'its', 'small', 'five', 'have',
       'they', 'be', 'too', 'shirt', 'nice', 'good', 'just', 'wear',
       'one', 'fits', 'quality', 'would', 'at', 'perfect', 'me', 'these',
       'well', 'cute', 'really', 'material', 'im', 'or', 'ordered',
       'will', 'little', 'than', 'up', 'all', 'comfortable', 'if', 'them',
       'am', 'more', 'top', 'product', 'br', 'looks', 'an', 'large',
       'made', 'from', 'color', 'can', 'when', 'got', 'out', 'price',
       'what', 'because', 'look', 'get', 'had', 'way', 'expected',
       'fabric', 'beautiful', 'received', 'no', 'buy', 'your', 'soft',
       'even', 'review', 'dont'], dtype='<U67')

### Building the Deep Learning Model

In [15]:
modelRNN = tf.keras.Sequential()

# Add Encoder Layer
modelRNN.add( encoder )

# Add Embedding Layer
modelRNN.add(
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True
    )   
)

# Add Bidirectional Layer
modelRNN.add(
  tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(64)
  )
)

# Add Dense Layer
modelRNN.add(
  tf.keras.layers.Dense(64, activation='relu')   
)

# Add Output Layer
modelRNN.add(
  tf.keras.layers.Dense(1)   
)

In [16]:
# Compile the model
modelRNN.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy']
)

In [17]:
# Fit the model (i.e, train the model)
modelRNN.fit(trainDataTs,epochs = 1)




<keras.callbacks.History at 0x7f6017ba8d10>

In [20]:
loss_test,accuracy_test = modelRNN.evaluate(testDataTs)

print('Loss',loss_test)
print('Accuracy',accuracy_test)



Loss 0.40110692381858826
Accuracy 0.6278139352798462


In [34]:
sampleProductReviews = [
  "five stars",
  "amazing",
  "material bad",
  "terrible material",
  'nice',
  'small not fit',
  'bad fit',
  'quality is cheap',
  'elegant color',
  'great color but bad fit',
  ''
]
predictedSentiments = modelRNN.predict(np.array(sampleProductReviews))

print(predictedSentiments)

[[0.7943953 ]
 [0.576152  ]
 [0.16299278]
 [0.02766614]
 [0.29601392]
 [0.05155784]
 [0.17299405]
 [0.2531648 ]
 [0.590955  ]
 [0.38287416]]
