TF-IDF

In [2]:
#ACTIVATE AN INTERACTIVE SESSION FOR GPU UTILITY, ONLY DO THIS ONCE WHEN KERNEL I ACTIVATED TO AVOID OUT-OF-MEMORY ERRORS
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [3]:
#ALL IMPORTS AND DESIGNATED FUNCTIONS

#1. Pandas Dataframe: Read Data, Data Manipulation
import pandas as pd

#2. clean_text function
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


#3. Tokenizer and Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#4. Visualisation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

#5. Train test split
from sklearn.model_selection import train_test_split

#6. Keras RNN model
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout, SimpleRNN, GRU
from keras.callbacks import EarlyStopping


DATA PREPARATION

In [4]:
labelled_data = pd.read_json('train.json')
answer_data = pd.read_json('test.json')

print(labelled_data.head())
print(answer_data.head())

sentiments = labelled_data['sentiments']

                                             reviews  sentiments
0  I bought this belt for my daughter in-law for ...           1
1  The size was perfect and so was the color.  It...           1
2  Fits and feels good, esp. for doing a swim rac...           1
3  These socks are absolutely the best. I take pi...           1
4  Thank you so much for the speedy delivery they...           1
                                             reviews
0  I bought 2 sleepers.  sleeper had holes in the...
1  I dare say these are just about the sexiest th...
2  everything about the transaction (price, deliv...
3  Not bad for just a shirt.  Very durable, and m...
4  These are truly wrinkle free and longer than t...


In [5]:
#VARIABLES USED FOR clean_text() function

nltk.download('stopwords')
nltk.download('wordnet')    

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

abbreviation_map = {
    "u": "you",
    "btw": "by the way",
    "omg": "oh my god",
    "idk": "I don't know",
    "lol": "laughing out loud",
    "pls": "please",
    "thx": "thanks",
    "im": "I am",
    "dont": "do not",
    "cant": "cannot",
    "wont": "will not",
}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
#clean_text function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Expand abbreviations
    text = ' '.join([abbreviation_map.get(word, word) for word in text.split()])
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords and lemmatize
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

In [7]:
labelled_data['cleaned_reviews'] = labelled_data['reviews'].apply(clean_text)
answer_data['cleaned_reviews'] = answer_data['reviews'].apply(clean_text)

print(f'Labelled data:\n{labelled_data.head()}\n')
print(f'Answer data:\n {answer_data.head()}')

Labelled data:
                                             reviews  sentiments  \
0  I bought this belt for my daughter in-law for ...           1   
1  The size was perfect and so was the color.  It...           1   
2  Fits and feels good, esp. for doing a swim rac...           1   
3  These socks are absolutely the best. I take pi...           1   
4  Thank you so much for the speedy delivery they...           1   

                                     cleaned_reviews  
0         bought belt daughter inlaw christmas loved  
1            size perfect color looked like web page  
2  fit feel good esp swim race highly recommend c...  
3  sock absolutely best take pilate class hot foo...  
4  thank much speedy delivery came time rehearsal...  

Answer data:
                                              reviews  \
0  I bought 2 sleepers.  sleeper had holes in the...   
1  I dare say these are just about the sexiest th...   
2  everything about the transaction (price, deliv...   
3  Not 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)


labelled_tfidf = tfidf_vectorizer.fit_transform(labelled_data['cleaned_reviews']).toarray()
answer_tfidf = tfidf_vectorizer.transform(answer_data['cleaned_reviews']).toarray()

In [9]:
max_len = labelled_tfidf.shape[1]  # The length of your TF-IDF vector
labelled_tfidf_final = pad_sequences(labelled_tfidf, maxlen=max_len)
answer_tfidf_final= pad_sequences(answer_tfidf, maxlen=max_len)

RNN MODEL TRAINING AND TESTING

In [10]:
#TRAIN TEST SPLIT THE RAW DATA

labelled_train, labelled_test, label_train, label_test = train_test_split(
    labelled_tfidf_final, sentiments, test_size=0.2, random_state=42)

In [None]:
n_iter = 0
test_accuracy_list = []
test_loss_list = []

while n_iter < 3:
    #This is a Bidirectional RNN Model
    model = Sequential()
    model.add(Embedding(input_dim=labelled_tfidf_final.shape[1], output_dim=128, input_length=max_len))
    #CuDNN has very strict requirements to be able to use GPU, im putting it here to show the requirements, some of them are already default settings though
    model.add(Bidirectional(GRU(64,
                             activation='tanh',  # Default settings, to show
                             recurrent_activation='sigmoid',  # Default
                             return_sequences=True,  # Must be set to true for the first layer
                             recurrent_dropout=0,  # Must be 0 for cuDNN
                             unroll=False,  # Must be False for cuDNN
                             use_bias=True)))  # Default is True
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    # Print the model summary
    model.summary()

    snn_model_history = model.fit(labelled_train, label_train, batch_size=32, epochs=10, verbose=1, validation_split=0.2)

    print('Test Accuracy and Loss')
    loss, acc = model.evaluate(labelled_test, label_test)

    test_accuracy_list.append(round(acc, 4))
    test_loss_list.append(round(loss, 4))
    n_iter += 1

print(f'Test accuracy from the three iterations : {test_accuracy_list}')
print(f'Test loss list: {test_loss_list}')


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 1000, 128)         128000    
                                                                 
 bidirectional_6 (Bidirectio  (None, 1000, 128)        74496     
 nal)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 1000, 128)         0         
                                                                 
 dense_6 (Dense)             (None, 1000, 1)           129       
                                                                 
Total params: 202,625
Trainable params: 202,625
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 