# Task B
***

In [1]:
# Importing library

import numpy as np
import pandas as pd
import keras
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
import keras_tuner as kt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, log_loss
from sklearn.metrics import plot_confusion_matrix

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from collections import Counter
import string
import re

# 1. Load the dataset

In [2]:
# Load the rated training reviews and the held-out reviews that are scored at the end of the notebook.
data, data_realtest = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Shared state for the review-cleaning code in this cell.
lemmatizer = WordNetLemmatizer()
my_stopwords = set(stopwords.words('english'))  # a set keeps the per-token membership test cheap
vocabulary = dict()
review, review_size = [], []

def tokenize(text):
    """Normalise a raw review string and split it into word tokens.

    Steps, in order:
      1. every run of non-ASCII characters is replaced by a single space;
      2. the text is lower-cased;
      3. each punctuation character, digit, carriage return, tab and newline
         is replaced by a space;
      4. the result is split with NLTK's ``word_tokenize``.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    The token sequence produced by ``nltk.tokenize.word_tokenize``.
    """
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", text)
    # Character class of everything that should become whitespace.
    strip_pattern = '[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]' # remove punctuation and numbers
    cleaned = re.sub(strip_pattern, " ", ascii_only.lower())
    return nltk.tokenize.word_tokenize(cleaned)


def _clean_reviews(frame):
    """Turn each row's ``reviewText`` + ``summary`` into a list of cleaned tokens.

    The two text columns are joined with a single space, tokenised with
    ``tokenize``, filtered (stop words and one-character tokens are dropped)
    and lemmatised.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the string columns ``reviewText`` and ``summary``.

    Returns
    -------
    token_lists : list of list of str
        One cleaned token list per row, in row order.
    token_counts : collections.Counter
        Frequency of every kept (lemmatised) token across all rows.
    """
    # Column-wise concatenation walks the rows in order regardless of the index labels.
    # A missing field becomes '' instead of raising TypeError on ``str + NaN``.
    combined_text = frame['reviewText'].fillna('') + ' ' + frame['summary'].fillna('')

    token_lists = []
    token_counts = Counter()
    for text in combined_text:
        kept = [
            lemmatizer.lemmatize(token)
            for token in tokenize(text)
            if token not in my_stopwords and len(token) > 1
        ]
        token_lists.append(kept)
        token_counts.update(kept)
    return token_lists, token_counts


# Training reviews. Their statistics get their own names so the second pass below
# no longer discards them.
train_reviews, train_counts = _clean_reviews(data)
train_review_size = [len(tokens) for tokens in train_reviews]
data['review'] = train_reviews

# Held-out reviews. `review`, `counts` and `review_size` end up describing this
# split, exactly as they did before the refactor.
review, counts = _clean_reviews(data_realtest)
review_size = [len(tokens) for tokens in review]
data_realtest['review'] = review

# 2. Data Cleaning

In [4]:
# One-time NLTK resource downloads, disabled by wrapping them in a string literal
# (the cell only evaluates to that string; nothing is downloaded).
# NOTE(review): the preprocessing cell earlier in the notebook already uses the stop-word
# list, the WordNet lemmatizer and word_tokenize, so on a fresh environment these
# downloads presumably have to run before that cell — confirm.
'''
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
'''

"\nnltk.download('stopwords')\nnltk.download('wordnet')\nnltk.download('punkt')\n"

In [5]:
# Disabled earlier draft of the cleaning step, kept as an inert string literal.
# It removed stop words and lemmatised `reviewText` and `summary` in place using
# whitespace tokenisation; nothing in it is executed.
'''
stop = stopwords.words('english')

data['reviewText'] = data['reviewText'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
data['summary'] = data['summary'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
data['reviewText'] = data.reviewText.apply(lemmatize_text)
data['summary'] = data.summary.apply(lemmatize_text)

data.head()
'''

'\nstop = stopwords.words(\'english\')\n\ndata[\'reviewText\'] = data[\'reviewText\'].apply(lambda x: \' \'.join([word for word in x.split() if word not in (stop)]))\ndata[\'summary\'] = data[\'summary\'].apply(lambda x: \' \'.join([word for word in x.split() if word not in (stop)]))\n\nw_tokenizer = nltk.tokenize.WhitespaceTokenizer()\nlemmatizer = nltk.stem.WordNetLemmatizer()\ndef lemmatize_text(text):\n    st = ""\n    for w in w_tokenizer.tokenize(text):\n        st = st + lemmatizer.lemmatize(w) + " "\n    return st\ndata[\'reviewText\'] = data.reviewText.apply(lemmatize_text)\ndata[\'summary\'] = data.summary.apply(lemmatize_text)\n\ndata.head()\n'

In [6]:
# Count missing values per column of the training frame.
data.isna().sum()

rating        0
reviewText    0
summary       0
review        0
dtype: int64

In [7]:
# Number of training reviews per rating value.
data['rating'].value_counts()

4    2400
5    2200
1    1700
2    1500
3    1200
Name: rating, dtype: int64

In [8]:
# Disabled bag-of-words experiment, kept as an inert string literal; nothing in it is executed.
'''
vectorizer = CountVectorizer(stop_words = 'english',min_df=2)
reviews = vectorizer.fit_transform(data['reviewText'])
summary = vectorizer.fit_transform(data['summary'])
'''

"\nvectorizer = CountVectorizer(stop_words = 'english',min_df=2)\nreviews = vectorizer.fit_transform(data['reviewText'])\nsummary = vectorizer.fit_transform(data['summary'])\n"

# 3. Preprocessing

1. Preprocess the data to remove conjunctions, stop words, and "junk" from the review text.<br>
2. Lemmatize the remaining tokens.<br>
3. Analyze the words with the highest frequencies. Do these words convey sentiment? Could they be removed during preprocessing? When `num_words` is set, the tokenizer keeps only the most frequent words, so these popular words are the ones most likely to end up in the dictionary.

In [9]:
# Features are the cleaned token lists; the target is the rating.
X = data['review']
y = data['rating']
X_realtest = data_realtest['review']

# 60:20:20 train/validation/test split: hold out 20% for testing, then take a
# quarter of the remaining 80% as the validation set.
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_vali, y_train, y_vali = train_test_split(X_tv, y_tv, test_size=1/4, random_state=0)

# Plain NumPy copies of the integer labels, taken before the labels are one-hot encoded.
y_train_array, y_vali_array, y_tv_array, y_test_array = (
    np.array(labels) for labels in (y_train, y_vali, y_tv, y_test)
)

X_train.shape, X_test.shape

((5400,), (1800,))

In [10]:
from sklearn.preprocessing import LabelEncoder

# One-hot encode the rating labels.
# The encoder is fitted ONCE, on the full label column, so every split shares the same
# rating -> class-index mapping. Re-fitting per split would silently shift the mapping
# if a split happened to miss a rating.
encoder = LabelEncoder()
encoder.fit(y)
num_classes = len(encoder.classes_)

# Encode from the untouched `*_array` copies rather than overwriting the inputs, so this
# cell can be re-run safely. `num_classes` pins the one-hot width for every split.
y_train = to_categorical(encoder.transform(y_train_array), num_classes=num_classes)
y_vali = to_categorical(encoder.transform(y_vali_array), num_classes=num_classes)
y_tv = to_categorical(encoder.transform(y_tv_array), num_classes=num_classes)
y_test = to_categorical(encoder.transform(y_test_array), num_classes=num_classes)

In [11]:
# Build the word index from the training split only. No num_words cap is passed, so every
# training token gets an index; lower=False because the text was lower-cased during cleaning.
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(X_train)

# Map every split's token lists to integer sequences with the same fitted tokenizer.
sequences_train, sequences_vali, sequences_tv, sequences_test, sequences_realtest = (
    tokenizer.texts_to_sequences(split)
    for split in (X_train, X_vali, X_tv, X_test, X_realtest)
)

# +1 because index 0 is not assigned to any word; it is the value used for padding.
vocab_size = len(tokenizer.word_index) + 1

# Pad / truncate every sequence to 600 positions (a larger maxlen takes longer to run).
(padded_sequence_train,
 padded_sequence_vali,
 padded_sequence_tv,
 padded_sequence_test,
 padded_sequence_realtest) = (
    pad_sequences(seqs, maxlen=600)
    for seqs in (sequences_train, sequences_vali, sequences_tv, sequences_test, sequences_realtest)
)

print('Number of Tokens:', vocab_size - 1)
print("Max Token Index:", padded_sequence_train.max(), "\n")

first_padded = padded_sequence_train[0]
print('Sample Before Processing:', X_train.values[0])
print('Sample After Processing:', tokenizer.sequences_to_texts([first_padded]), '\n')

print('What the model will interpret:', first_padded.tolist())

Number of Tokens: 18931
Max Token Index: 18931 

Sample Before Processing: ['book', 'short', 'even', 'qualify', 'novella', 'guess', 'get', 'pay', 'free', 'seriously']
Sample After Processing: ['book short even qualify novella guess get pay free seriously'] 

What the model will interpret: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [12]:
# Sanity check: (number of test reviews, padded sequence length).
np.shape(padded_sequence_test)

(1800, 600)

# 4. Hyperparameter Tuning

In [13]:
# https://keras.io/keras_tuner/
# https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html
# https://datascience.stackexchange.com/questions/73605/opinions-on-an-lstm-hyper-parameter-tuning-process-i-am-using
# https://www.youtube.com/watch?v=vvC15l4CY1Q

def build_model(hp):
    """Build and compile one candidate LSTM classifier for KerasTuner.

    Architecture: Embedding(128) -> LSTM -> 1 to 3 x (Dense/ReLU + Dropout) -> Dense(5, softmax).
    The input length of 600 matches the ``maxlen`` used when padding the sequences.

    Tuned hyperparameters
    ---------------------
    units_LSTM    : LSTM width, 16..256 in steps of 16.
    num_layers    : number of Dense+Dropout pairs, 1..3.
    units_<i>     : width of dense layer i, 16..256 in steps of 16.
    rate_<i>      : dropout rate after dense layer i, 0.0..0.5 in steps of 0.1.
    learning_rate : Adam learning rate, 1e-2 or 1e-3.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Search-space handle supplied by the tuner.

    Returns
    -------
    A compiled Keras ``Sequential`` model (categorical cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, output_dim = 128, input_length=600))
    model.add(LSTM(units=hp.Int('units_LSTM',min_value=16,max_value=256,step=16)))
    # Tune the number of dense layers
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(units=hp.Int('units_'+str(i), min_value=16, max_value=256, step=16), activation="relu"))
        model.add(Dropout(hp.Float('rate_'+str(i), min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(5, activation='softmax'))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3])
    # Pass the sampled learning rate to the optimizer. With optimizer='adam' the value was
    # sampled but ignored, so every trial trained with Adam's default learning rate.
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'],
                  optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate))

    return model

# Random search over the space declared in build_model: 5 sampled configurations, each
# trained twice (executions_per_trial=2), ranked by validation accuracy.
# seed=0 makes the sampled configurations repeatable, mirroring random_state=0 in the data
# split. Weight initialisation is still unseeded, so individual scores can vary between runs.
tuner = kt.RandomSearch(
    build_model,
    objective="val_accuracy",
    max_trials=5,
    executions_per_trial=2,
    overwrite=True,
    seed=0,
)

tuner.search(
    padded_sequence_train, y_train,
    validation_data=(padded_sequence_vali, y_vali),
    epochs=3,
    batch_size=64,
)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]


Trial 5 Complete [00h 11m 17s]
val_accuracy: 0.4794444441795349

Best val_accuracy So Far: 0.5163888931274414
Total elapsed time: 08h 40m 34s
INFO:tensorflow:Oracle triggered exit


In [20]:
# Print the winning hyperparameters, one value per line, in the order:
# LSTM units, number of dense layers, (units, dropout rate) per dense layer, learning rate.
# The per-layer entries follow `num_layers` instead of being hard-coded for two layers, so
# layers that are not part of the best model are not reported and a third layer is not omitted.
dense_layer_count = best_hps.get('num_layers')
reported_names = ['units_LSTM', 'num_layers']
for layer_idx in range(dense_layer_count):
    reported_names += [f'units_{layer_idx}', f'rate_{layer_idx}']
reported_names.append('learning_rate')

print("\n" + "\n".join(str(best_hps.get(name)) for name in reported_names) + "\n")



240
2
128
0.4
16
0.0
0.01



In [15]:
# Disabled fragment from an earlier version of the tuning model (optional dropout plus one
# more tuned dense layer), kept as an inert string literal; nothing in it is executed.
# NOTE(review): it refers to `layers.Dropout`, but no `layers` name is imported in the
# visible import cell — it would need adjusting before being reused.
'''    if hp.Boolean("dropout"):
        model.add(layers.Dropout(rate=0.25))
    model.add(Dense(units=hp.Int('units_2',min_value=32, max_value=256, step=16), activation='relu'))
'''

'    if hp.Boolean("dropout"):\n        model.add(layers.Dropout(rate=0.25))\n    model.add(Dense(units=hp.Int(\'units_2\',min_value=32, max_value=256, step=16), activation=\'relu\'))\n'

# Baseline

In [16]:
#https://www.kaggle.com/code/mkowoods/deep-learning-lstm-for-tweet-classification/notebook#Winning-architecture-for-Quora-Challenge 

# Naive Bayes baseline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Decode the padded integer sequences back to text so the baseline sees the same
# (length-capped, training-vocabulary) input as the neural model.
baseline_train_texts = tokenizer.sequences_to_texts(padded_sequence_tv)
baseline_test_texts = tokenizer.sequences_to_texts(padded_sequence_test)

# Fit on train+validation with the integer rating labels, then score the held-out test split.
text_clf.fit(baseline_train_texts, y_tv_array)
predictions = text_clf.predict(baseline_test_texts)

baseline_accuracy = np.mean(predictions == y_test_array)
print('Baseline Accuracy Using Naive Bayes: ', baseline_accuracy)
print('F1 Score:', f1_score(y_test_array, predictions, average='weighted'))

Baseline Accuracy Using Naive Bayes:  0.43166666666666664
F1 Score: 0.376478265267993


# 5. Model

In [17]:
#https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/
#https://keras.io/api/losses/
# https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/categorical-crossentropy
# Final classifier: embedding -> LSTM -> dense/ReLU -> dropout -> 5-way softmax.
# The input length of 600 matches the maxlen used when padding the sequences.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=600))
model.add(LSTM(128))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(5, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# summary() prints the table itself and returns None; wrapping it in print() added a
# stray "None" line after the table.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 600, 128)          2423296   
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 5)                 645       
                                                                 
Total params: 2,572,037
Trainable params: 2,572,037
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Train the final model on the combined train+validation split. The test split is passed
# as validation_data, so its loss and accuracy are reported after each epoch.
history = model.fit(
    x=padded_sequence_tv,
    y=y_tv,
    batch_size=64,
    epochs=3,
    validation_data=(padded_sequence_test, y_test),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Forecast
https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17

In [21]:
# https://keras.io/api/utils/python_utils/
# Class probabilities for the held-out reviews, one column per output unit.
y_pred = model.predict(padded_sequence_realtest)

# Label the five probability columns 1..5 and keep, per row, the label of the largest one.
# NOTE(review): this assumes output unit k corresponds to rating k + 1 — confirm against the label encoder.
df_pred = pd.DataFrame(y_pred, columns=[1, 2, 3, 4, 5]).idxmax(axis=1)

df_pred

0       4
1       2
2       4
3       5
4       4
       ..
2995    3
2996    4
2997    1
2998    5
2999    4
Length: 3000, dtype: int64