Data Preparation

In [1]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [2]:
import pandas as pd
import string
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Changed the 'train_data' into 'raw_data' and changed 'test_data' into 'answer_data'

raw_data = pd.read_json('train.json')
answer_data = pd.read_json('test.json')

print(raw_data.head())
print(answer_data.head())

raw_labels = raw_data['sentiments']

                                             reviews  sentiments
0  I bought this belt for my daughter in-law for ...           1
1  The size was perfect and so was the color.  It...           1
2  Fits and feels good, esp. for doing a swim rac...           1
3  These socks are absolutely the best. I take pi...           1
4  Thank you so much for the speedy delivery they...           1
                                             reviews
0  I bought 2 sleepers.  sleeper had holes in the...
1  I dare say these are just about the sexiest th...
2  everything about the transaction (price, deliv...
3  Not bad for just a shirt.  Very durable, and m...
4  These are truly wrinkle free and longer than t...


In [4]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [5]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stop words
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Remove extra whitespaces
    text = ' '.join(text.split())
    return text

In [6]:
raw_data['cleaned_reviews'] = raw_data['reviews'].apply(clean_text)
answer_data['cleaned_reviews'] = answer_data['reviews'].apply(clean_text)

In [7]:
print(raw_data.head())

                                             reviews  sentiments  \
0  I bought this belt for my daughter in-law for ...           1   
1  The size was perfect and so was the color.  It...           1   
2  Fits and feels good, esp. for doing a swim rac...           1   
3  These socks are absolutely the best. I take pi...           1   
4  Thank you so much for the speedy delivery they...           1   

                                     cleaned_reviews  
0         bought belt daughter inlaw christmas loved  
1            size perfect color looked like web page  
2  fits feels good esp swim race highly recommend...  
3  socks absolutely best take pilates classes hot...  
4  thank much speedy delivery came time rehearsal...  


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization: Basically divides the sentences into segments
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_data['cleaned_reviews']) #Indexes each token (each word is given a number). Index = 0 is for padding

# Convert text to sequences
raw_sequences = tokenizer.texts_to_sequences(raw_data['cleaned_reviews'])
answer_sequences = tokenizer.texts_to_sequences(answer_data['cleaned_reviews'])

# Padding sequences
max_length = max(max(len(seq) for seq in raw_sequences), max(len(seq) for seq in answer_sequences)) #Finding the maximum sequence length from both the raw_data and answer_data
raw_final = pad_sequences(raw_sequences, maxlen=max_length, padding='post') #standardize the array
answer_final = pad_sequences(answer_sequences, maxlen=max_length, padding='post')

# Display the shape of padded sequences
print(f'Train padded shape: {raw_final.shape}')
print(f'Test padded shape: {answer_final.shape}')

Train padded shape: (7401, 518)
Test padded shape: (1851, 518)


In [9]:
raw_final

array([[  12,  288,  214, ...,    0,    0,    0],
       [   5,   39,   41, ...,    0,    0,    0],
       [  86,  323,    7, ...,    0,    0,    0],
       ...,
       [1309,  401,  217, ...,    0,    0,    0],
       [ 143,  957,  380, ...,    0,    0,    0],
       [ 230,  906,   97, ...,    0,    0,    0]])

In [10]:
answer_final

array([[  12,  116, 7406, ...,    0,    0,    0],
       [3212,  123, 5234, ...,    0,    0,    0],
       [ 172, 1543,   18, ...,    0,    0,    0],
       ...,
       [  15,  108,  211, ...,    0,    0,    0],
       [ 214,  132,   53, ...,    0,    0,    0],
       [3179,  917,  153, ...,    0,    0,    0]])

In [11]:
import numpy as np

# Load GloVe word embeddings (download and extract glove.6B.100d.txt)
embedding_dim = 100 #as vectors of n real numbers
glove_file = 'C:/Users/User/Downloads/Telegram Desktop/glove.6B/glove.6B.100d.txt' #change this to your glove path
embeddings_index = {}

with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Loaded {len(embeddings_index)} word vectors from GloVe.')


Loaded 400000 word vectors from GloVe.


In [12]:
# Prepare the embedding matrix
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Map the words in the tokenizer's vocabulary to GloVe vectors
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector  # Words found in GloVe will use pretrained vectors

# Check the embedding matrix shape
print(f'Embedding matrix shape: {embedding_matrix.shape}')

Embedding matrix shape: (16366, 100)


In [13]:
max_length

518

In [14]:
#train test split the raw data
from sklearn.model_selection import train_test_split
raw_train, raw_test, label_train, label_test = train_test_split(
    raw_final, raw_labels, test_size=0.2, random_state=42)

In [18]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional


# Define your model
model = Sequential()
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=True))
#CuDNN has very strict requirements to be able to use GPU, im putting it here to show the requirements, some of them are already default settings though
model.add(Bidirectional(LSTM(128,
                             activation='tanh',  # Default settings, to show
                             recurrent_activation='sigmoid',  # Default
                             return_sequences=False,  # Set appropriately, not Default
                             recurrent_dropout=0,  # Must be 0 for cuDNN
                             unroll=False,  # Must be False for cuDNN
                             use_bias=True)))  # Default is True
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Print the model summary
model.summary()

snn_model_history = model.fit(raw_train, label_train, batch_size=256, epochs=20, verbose=1, validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 518, 100)          1636600   
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              234496    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 1)                 257       
                                                                 
Total params: 1,871,353
Trainable params: 1,871,353
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [17]:
loss, acc = model.evaluate(raw_test, label_test)



