In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available datasets are visible in the cell output
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **<span style="color:#6daa9f;">IMPORTING LIBRARIES</span>**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import re

print("Tensorflow Version :", tf.__version__)


# **<span style="color:#6daa9f;">LOADING DATA</span>**

In [None]:
# Load the Sentiment140 CSV; header=None because the file has no header row,
# so the columns get integer names until they are renamed
df = pd.read_csv(
    '../input/sentiment140/training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin',
)

# Preview the first rows
df.head()

# **<span style="color:#6daa9f;">DATA PREPROCESSING</span>**

The columns do not have proper names. Rename them for convenience.

In [None]:
# Assign descriptive names to the six columns, in file order
df.columns = [
    'sentiment',
    'id',
    'date',
    'query',
    'user_id',
    'text',
]

# Preview the frame with its new column names
df.head()

We are going to train our model only on the text, so we can remove the rest of the columns from the dataset.

In [None]:
# Keep only the label and the tweet text; the metadata columns are not used for training
df = df.drop(columns=['id', 'date', 'query', 'user_id'])

In [None]:
# Label the sentiment column
# Raw numeric codes used in the CSV and the human-readable class they stand for
lab_to_sentiment = {0:"Negative", 4:"Positive"}
def label_decoder(label):
    """Translate a raw sentiment code into its class name.

    Parameters
    ----------
    label : a key of ``lab_to_sentiment`` (0 or 4) or an already decoded name.

    Returns
    -------
    str
        "Negative" or "Positive".

    Raises
    ------
    KeyError
        If ``label`` is neither a known code nor a known class name.
    """
    decoded = lab_to_sentiment.get(label)
    if decoded is not None:
        return decoded
    # Already-decoded names pass through unchanged so the decoding cell can be
    # re-run without raising KeyError on its own output
    if label in lab_to_sentiment.values():
        return label
    raise KeyError(label)
# Replace the numeric codes in the sentiment column with their class names
df['sentiment'] = df['sentiment'].apply(label_decoder)
df.head()

Check the sentiment data distribution

In [None]:
# Number of tweets per sentiment class
val_count = df['sentiment'].value_counts()

# Bar chart of the class counts
plt.figure(figsize=(4, 4))
plt.bar(x=val_count.index, height=val_count.values)
plt.title('Sentiment Data Distribution')

Observe that the data is balanced

# **<span style="color:#6daa9f;">TEXT PREPROCESSING</span>**

In [None]:
# A set gives constant-time membership tests; preprocess() checks every token
# of every tweet against this collection
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
# Raw string so that \S reaches the regex engine instead of being parsed as an
# (invalid) Python string escape. Alternatives, in order: @mentions,
# http/https links, "htt:"/"http:" followed by a single character, and any run
# of characters that are not ASCII letters or digits.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [None]:
def preprocess(text, stem=False):
    """Clean one piece of text and return it as a space-separated string.

    The input is converted to ``str``, lower-cased, every match of
    ``text_cleaning_re`` is replaced by a space, and tokens found in
    ``stop_words`` are dropped. When ``stem`` is true, each remaining token is
    passed through ``stemmer.stem``.
    """
    cleaned = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [None]:
# Clean every tweet (no stemming: preprocess is called with its default stem=False)
df['text'] = df['text'].apply(preprocess)

In [None]:
# Preview the frame after text cleaning
df.head()

The data is now **cleaned**

# **<span style="color:#6daa9f;">TRAIN AND TEST SPLIT</span>**

In [None]:
# Define some constants
train_size = 0.8  # fraction of rows used for training; the split passes 1 - train_size as test_size
max_nb_words = 100000  # NOTE(review): not referenced by any other cell visible here — confirm whether the Tokenizer was meant to be capped with it
max_sequence_length = 30  # passed as maxlen to pad_sequences and as the model's input length

In [None]:
# Hold out (1 - train_size) of the rows for testing; the fixed random_state makes the split repeatable
train_data, test_data = train_test_split(
    df,
    test_size=1 - train_size,
    random_state=7,
)

In [None]:
# Check the training and testing data size (number of rows in each split)
print("Train data size : ", len(train_data))
print("Test data size : ", len(test_data))

In [None]:
# Check the training data: show the first ten rows of the training split
train_data.head(10)

# **<span style="color:#6daa9f;">TOKENIZATION</span>**

In [None]:
# Learn the word -> integer index mapping from the training tweets only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.text)

# Word -> index lookup produced by the fitted tokenizer
word_index = tokenizer.word_index

# Number of indexed words plus one extra slot
# (presumably for index 0, which is used for padding — confirm against the Tokenizer docs)
vocab_size = 1 + len(word_index)

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Convert each tweet to a list of word indices and bring every list to max_sequence_length
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_data.text),
    maxlen=max_sequence_length,
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_data.text),
    maxlen=max_sequence_length,
)

In [None]:
# Report the shapes of the padded index matrices for both splits
print("Traning X shape : ", X_train.shape)
print("Testing X shape : ", X_test.shape)

In [None]:
# Distinct sentiment labels present in the training split, as a plain Python list
labels = train_data.sentiment.unique().tolist()

## **<span style="color:#6daa9f;">LABEL ENCODING</span>**

In [None]:
# Fit the encoder on the training labels only
encoder = LabelEncoder()
encoder.fit(train_data.sentiment.to_list())

# Encode both splits and reshape each result into a single-column 2-D array
y_train = encoder.transform(train_data.sentiment.to_list()).reshape(-1, 1)
y_test = encoder.transform(test_data.sentiment.to_list()).reshape(-1, 1)

In [None]:
# Report the shapes of the encoded label arrays (test first, then train)
print("y_test shape : ", y_test.shape)
print("y_train shape : ", y_train.shape)

# **<span style="color:#6daa9f;">WORD EMBEDDING</span>**

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
# Path of the extracted 300-dimensional GloVe vectors
GLOVE_EMB = '/kaggle/working/glove.6B.300d.txt'
# Must match the vector length stored in GLOVE_EMB
EMBEDDING_DIM = 300
# Initial learning rate handed to the Adam optimizer
LR = 1e-3
BATCH_SIZE = 1024
EPOCHS = 10
# Location for the best model checkpoint. The previous value began with a
# literal '...' directory, which does not exist; use the same working
# directory as GLOVE_EMB instead.
MODEL_PATH = '/kaggle/working/best_model.hdf5'

In [None]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each non-empty line is "<word> <v1> <v2> ...": the first field is the word,
# the remaining fields are its coefficients.
embeddings_index = {}
# `with` guarantees the file is closed even if parsing fails; the explicit
# encoding avoids depending on the platform's default text encoding
with open(GLOVE_EMB, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' %len(embeddings_index))

In [None]:
# One row per tokenizer index; rows of words without a GloVe vector stay all-zero
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False)
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False,
)

# **<span style="color:#6daa9f;">MODEL TRAINING - LSTM</span>**

In [None]:
# Build the model
# `shape` is given as a tuple that excludes the batch dimension; a bare int is
# not a valid shape in newer Keras releases
sequence_input = Input(shape = (max_sequence_length,), dtype = 'int32')
# Look up the frozen GloVe vector of every token index
embedding_sequences = embedding_layer(sequence_input)
x = SpatialDropout1D(0.2)(embedding_sequences)
# 1-D convolution over the token axis: 65 filters, kernel size 5
# NOTE(review): 65 filters may be a typo for 64 — confirm before changing
x = Conv1D(65, 5, activation = 'relu')(x)
x = Bidirectional(LSTM(64, dropout = 0.2,recurrent_dropout = 0.2))(x)
x = Dense(512, activation = 'relu')(x)
x = Dropout(0.5)(x)
x = Dense(512, activation = 'relu')(x)
# Single sigmoid unit: one score in (0, 1) per input sequence
outputs = Dense(1, activation = 'sigmoid')(x)
model = tf.keras.Model(sequence_input, outputs)

In [None]:
# Compile the model
model.compile(optimizer = Adam(learning_rate = LR),
             loss = 'binary_crossentropy',
             metrics = ['accuracy'])
ReduceLROnPlateau = ReduceLROnPlateau(factor = 0.1,
                                   min_lr = 0.01,
                                   monitor = 'val_loss',
                                   verbose = 1)

In [None]:
# Fit the model
history = model.fit(X_train, y_train, batch_size = BATCH_SIZE, epochs = EPOCHS, validation_data = (X_test, y_test),
                   callbacks = [ReduceLROnPlateau])

# **<span style="color:#6daa9f;">MODEL EVALUATION</span>**

Plot the learning curve of loss and accuracy with each epoch

In [None]:
# Two stacked panels: accuracy on top, loss below
fig, (acc_ax, loss_ax) = plt.subplots(nrows=2, ncols=1)

# Training vs. validation accuracy per epoch
acc_ax.plot(history.history['accuracy'], c='b')
acc_ax.plot(history.history['val_accuracy'], c='r')
acc_ax.set_title('Model accuracy')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.legend(['LSTM_train', 'LSTM_val'], loc='upper left')

# Training vs. validation loss per epoch
loss_ax.plot(history.history['loss'], c='m')
loss_ax.plot(history.history['val_loss'], c='c')
loss_ax.set_title('Model loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['train', 'val'], loc='upper left')

The model outputs a prediction score between 0 and 1. We can classify the two classes by defining a threshold value for it.
In this case, we set the threshold value to 0.5. If the score is **above 0.5**, it is classified as **positive** sentiment; otherwise it is classified as **negative**.

In [None]:
def decode_sentiment(score):
    """Map a prediction score to a class name.

    Scores strictly greater than 0.5 are 'Positive'; everything else
    (including exactly 0.5) is 'Negative'.
    """
    if score > 0.5:
        return 'Positive'
    return 'Negative'

In [None]:
# Score every test sequence, then turn each score into its class name
scores = model.predict(X_test, verbose=1, batch_size=10000)
y_pred_1d = list(map(decode_sentiment, scores))

Confusion Matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D matrix of counts; rows are plotted as true labels and columns as
        predicted labels.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    title : str
        Title shown above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    normalize : bool, default True
        When True (the previous, always-on behaviour) every row is divided by
        its sum, so each cell shows the fraction of that true class. Pass
        False to plot the values of ``cm`` unchanged.
    """

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would divide by zero and show NaN; dividing
        # by 1 instead leaves that row at 0.00
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=13)
    plt.yticks(tick_marks, classes, fontsize=13)

    # Integer matrices are printed as whole numbers, anything else with two decimals
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=17)
    plt.xlabel('Predicted label', fontsize=17)

In [None]:
# Use one explicit, sorted label list for both the matrix and the tick labels.
# unique() returns labels in order of first appearance, which need not match
# the row/column order of the matrix and could therefore mislabel the axes.
class_names = sorted(test_data.sentiment.unique())
cnf_matrix = confusion_matrix(test_data.sentiment.to_list(), y_pred_1d, labels=class_names)
plt.figure(figsize=(6,6))
plot_confusion_matrix(cnf_matrix, classes=class_names, title="Confusion matrix")
plt.show()

In [None]:
# Print the classification report comparing the true test labels with the predicted class names
print(classification_report(list(test_data.sentiment), y_pred_1d))