# Twitter Sentiment Analysis

# **1 Import Libraries/Dataset - Adding GPU**

# 1.1 Use GPU - Check IF GPU Available

In [None]:
# Libraries For Basic Processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Libraries For NLTK Libraries
import nltk
import string
import re

# Libraries For Sentence Tokenization  - Tokenizes sentences from text
from nltk.tokenize import sent_tokenize
# Libraries For Sentence Word Tokenization  - Tokenizes words in sentences
from nltk.tokenize import word_tokenize

# Libraries For Removal of stop words from the text
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Libraries For Lemmatization
from nltk.stem import WordNetLemmatizer
# Stemming
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from wordcloud import WordCloud
import itertools
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords

# tensor flow related Libraries
import tensorflow as tf
import tensorflow_datasets as tdfs
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras import models
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten

import seaborn as sns
# Report the TensorFlow / TF-Hub versions and whether TensorFlow can see a GPU.
print("Tensor Version", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

In [None]:
# Shell escape: run the nvidia-smi command-line tool and show its output in the cell.
!nvidia-smi

In [None]:
# Ask TensorFlow which physical GPU devices it can see (an empty list is falsy).
visible_gpus = tf.config.list_physical_devices('GPU')
print("GPU is", "available" if visible_gpus else "NOT AVAILABLE")

# 1.3 Import Data Set

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the Drive root so the relative path 'training.csv' read below resolves there.
import os
os.chdir('/content/drive/My Drive/')

In [None]:
# The CSV is read without a header row, so the six column names are assigned afterwards.
column_names = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']

df = pd.read_csv('training.csv', encoding='latin', header=None)
df.columns = column_names
df.head()

# 2  Data Visualizations

**Print Distribution of Sentiment Data**

In [None]:
# Number of rows per raw sentiment code.
df['sentiment'].value_counts()

In [None]:
# Readable class names for the raw sentiment codes 0 and 4.
lab_to_sentiment = {0: "Negative", 4: "Positive"}


def label_decoder(label):
  """Return the readable class name for a raw sentiment code.

  Raises KeyError when `label` is not a key of `lab_to_sentiment`.
  """
  return lab_to_sentiment[label]


# Add a human-readable label column next to the numeric one.
df['class_label'] = df['sentiment'].apply(label_decoder)

# 2.2  Plot Bar Graph - Of Sentiment Distributions

In [None]:
# Bar chart of how many rows fall into each readable class label.
val_counts = df['class_label'].value_counts()

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(val_counts.index, val_counts.values)
ax.set_xlabel("Class Label")
ax.set_ylabel("Value Counts")
ax.set_title("Sentiment Data Distribution")

# 3 Data Pre Processing

#### Print at least 2 Rows From Each Class

In [None]:
# Show the first five tweets of each raw sentiment code (0, then 4),
# with a separator line before, between and after the two groups.
separator = "================================================"

print(separator)
for sentiment_code in (0, 4):
  sample_rows = df[df['sentiment'] == sentiment_code].head(5)
  for _, row in sample_rows.iterrows():
    print(label_decoder(row.sentiment), "-> { ", row.text, " }")
  print(separator)

#### Remove Stop Words, Special Characters and Links (Optional Stemming)

In [None]:
# English stop words as a frozenset: preprocess() checks every token of every
# tweet with `token not in stop_words`, which is a hash lookup on a set but a
# linear scan on the list that stopwords.words() returns.
stop_words = frozenset(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Raw string so that `\S` reaches the regex engine as written; in a normal
# string literal it is an invalid escape sequence and triggers a warning on
# recent Python versions. The pattern itself is unchanged: @mentions,
# http/https links, and any run of non-alphanumeric characters.
# NOTE(review): the third alternative `http?:\S` (optional "p", one character)
# looks like a leftover typo -- `https?:\S+` already covers links; confirm
# before removing it.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [None]:
def preprocess(text,stem=False):
  """Lower-case `text`, strip everything matched by `text_cleaning_re`, and drop stop words.

  Args:
    text: value to clean; it is passed through str() first.
    stem: when true, each surviving token is replaced by `stemmer.stem(token)`.

  Returns:
    The surviving tokens joined by single spaces.
  """
  cleaned = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
  kept = (word for word in cleaned.split() if word not in stop_words)
  if stem:
    kept = (stemmer.stem(word) for word in kept)
  return " ".join(kept)

In [None]:
# Clean every tweet in place; preprocess() defaults to stem=False.
df['text'] = df['text'].apply(preprocess)

In [None]:
# Fixed-seed permutation of the row order. The test set is later taken as the
# tail slice `[split:]`, so without shuffling it would simply be whatever rows
# sit last in the CSV.
# NOTE(review): if training.csv is ordered by label, an unshuffled tail would
# hold a single class -- confirm against the file.
shuffle_order = np.random.default_rng(42).permutation(len(df))

# Indexing with the permutation returns new arrays, so the later in-place
# relabelling of `sentiments` (4 -> 1) cannot write through to df['sentiment'].
# The original `.values` may be a view of the frame, or read-only when pandas
# copy-on-write is enabled.
sentiments = df['sentiment'].to_numpy()[shuffle_order]
sentences = df['text'].to_numpy()[shuffle_order]

In [None]:
# Relabel the positive code 4 as 1, in place, so the labels become 0/1.
np.putmask(sentiments, sentiments == 4, 1)

In [None]:
# Spot-check one (label, cleaned text) pair.
sentiments[1],sentences[1]

In [None]:
# Total number of rows in the frame.
total = df.shape[0]

In [None]:
# Index of the first held-out row: the first 95% of rows come before it.
split = int(total * 0.95)

In [None]:
# Display the split index.
split

In [None]:
# Held-out portion: every row from `split` to the end.
test_sentences, test_sentiments = sentences[split:], sentiments[split:]

In [None]:
batch_size = 2000

# Train only on the rows before `split`. Rows from `split` onward form the
# held-out test set (also passed as validation_data), so building the training
# set from the full arrays would leak every test example into training.
train_sentences = sentences[:split]
train_sentiments = sentiments[:split]

train = tf.data.Dataset.from_tensor_slices((tf.constant(train_sentences), tf.constant(train_sentiments)))
# Cache the tensors, then shuffle with a buffer covering the whole training set.
train = train.cache().shuffle(len(train_sentences))
train = train.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
# Pull one batch from the training pipeline and show its last (text, label) pair.
first_batch = next(iter(train))
example_input_batch, example_target_batch = first_batch
(example_input_batch[-1], example_target_batch[-1])

In [None]:
# First (text, label) pair of the same example batch.
example_input_batch[0], example_target_batch[0]

In [None]:
# Held-out set as a tf.data pipeline: cached, shuffled over its full length,
# and served in batches of 1000.
batch_size = 1000
test_tensors = (tf.constant(test_sentences), tf.constant(test_sentiments))
test = (
    tf.data.Dataset.from_tensor_slices(test_tensors)
    .cache()
    .shuffle(len(test_sentences))
    .batch(batch_size)
)

In [None]:
# Pre-trained text-embedding layer from TF-Hub: takes scalar strings and is
# declared to output 128-dimensional vectors; trainable=True lets its weights
# be updated during training.
# NOTE: this single layer object is added to every model below, so all of them
# use the same layer instance.
hub_layer = hub.KerasLayer(
    "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1",
    input_shape=[],
    output_shape=[128],
    dtype=tf.string,
    trainable=True,
)

# 4) Model-1 DNN With Regularizer and Drop Out

#### Model Building

In [None]:
from keras.regularizers import l2
from tensorflow.keras import regularizers

# Model 1: hub embedding -> two stacked bidirectional LSTMs -> two L2-regularised
# dense layers with dropout -> 2-way softmax.
model = tf.keras.Sequential([
    hub_layer,
    # Insert an axis at position 1 so the recurrent layers receive a sequence dimension.
    tf.keras.layers.Lambda(lambda x: tf.expand_dims(x,1)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(160, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(160)),
    tf.keras.layers.Dense(160, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(30, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(2, activation='softmax'),
])

In [None]:
# Print the model's layer summary.
model.summary()

#### Model Compilation

In [None]:
# Adam at learning rate 0.001; sparse categorical cross-entropy because the
# targets are integer class ids and the output layer is a softmax.
adam_optimizer = tf.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=adam_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

#### Model Training

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
filepath="cp/sentiment_model.Grp160"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

!mkdir cp

callbacks_list = [checkpoint]

In [None]:
# Train for 5 epochs, validating on the `test` pipeline after each epoch.
history = model.fit(
    train,
    validation_data=test,
    epochs=5,
    callbacks=callbacks_list,
)

#### Model Evaluation - Print Train/Test Accuracy

In [None]:
# Per-epoch curves recorded by fit(); the last entry is the final epoch.
m1_curves = history.history
loss = m1_curves['loss']
v_loss = m1_curves['val_loss']

print('Final training loss \t', loss[-1])
print('Final training accuracy ', m1_curves['accuracy'][-1])

In [None]:
# Evaluate on the held-out arrays; `score` holds [loss, accuracy].
score = model.evaluate(
    x=test_sentences,
    y=test_sentiments,
    verbose=0,
)
print(f'Test loss for Keras ReLU : {score[0]} / Test accuracy: {score[1]}')

In [None]:
# One figure per metric: training curve vs. validation curve across epochs.
curve_specs = [
    ('accuracy', 'Training accuracy', 'Validation accuracy',
     'Basic Model ReLU training / validation accuracies', 'Accuracy'),
    ('loss', 'Training loss', 'Validation loss',
     ' ReLU training / validation loss values', 'Loss value'),
]

for metric, train_label, val_label, plot_title, y_label in curve_specs:
    plt.plot(history.history[metric], label=train_label)
    plt.plot(history.history['val_' + metric], label=val_label)
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(loc="upper left")
    plt.show()

In [None]:
#Defining function for confusion matrix plot
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Compute and draw the confusion matrix of `y_pred` against `y_true`.

    Args:
        y_true: ground-truth labels, passed to sklearn's confusion_matrix.
        y_pred: predicted labels, passed to sklearn's confusion_matrix.
        classes: tick labels for both axes, in the same order as the matrix
            rows/columns.
        normalize: when true, each row is divided by its sum so cells show
            per-true-class fractions; a row with no samples stays all zeros.
        title: figure title; a default is chosen from `normalize` when falsy.
        cmap: matplotlib colormap for the heat map.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        title = 'Normalized confusion matrix' if normalize else 'Confusion matrix, without normalization'

    #Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        # Guarded division: a class with zero true samples has a row sum of 0,
        # which would otherwise turn the whole row into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots(figsize=(7,7))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    #Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; use white text on cells darker than the midpoint.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

# Print numpy arrays with two decimals from here on.
np.set_printoptions(precision=2)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Predicted class id = index of the largest softmax output.
# (Sequential.predict_classes no longer exists in current Keras releases;
# argmax over predict() is the documented replacement for softmax models.)
y_pred = np.argmax(model.predict(test_sentences), axis=-1)

# test_sentiments already holds one integer class id per sample. Taking
# np.argmax of it would collapse the whole array to a single index.
y_true = test_sentiments

# confusion_matrix orders rows/columns by ascending label value, so the names
# must follow label order: 0 = Negative, 1 = Positive (relabelled from 4).
class_names = ['Negative', 'Positive']

# Model 1 is the L2-regularised model with dropout, so the title says so.
plot_confusion_matrix(y_true, y_pred, classes=class_names,
                      title='Confusion matrix, Model 1 (L2 regularization + dropout)')



# 5) Model-2 : DNN With Different Batch Size

#### Model Building

In [None]:
from keras.regularizers import l2
from tensorflow.keras import regularizers

# Give model 2 its own embedding layer. Re-adding the `hub_layer` object already
# used by model 1 would share that layer's trainable weights, so model 2 would
# start from embeddings model 1 had already fine-tuned and the comparison
# between the models would not be like-for-like.
hub_layer_m2 = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1", output_shape=[128],
                              input_shape=[], dtype=tf.string, trainable=True)

# Same architecture as model 1: embedding -> 2x BiLSTM -> regularised dense
# layers with dropout -> 2-way softmax.
model2 = tf.keras.Sequential()
model2.add(hub_layer_m2)
# Insert an axis at position 1 so the recurrent layers receive a sequence dimension.
model2.add(tf.keras.layers.Lambda(lambda x: tf.expand_dims(x,1)))
model2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(160,return_sequences=True)))
model2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(160)))
model2.add(tf.keras.layers.Dense(160, activation='relu',kernel_regularizer=regularizers.l2(0.001)))
model2.add(tf.keras.layers.Dropout(0.2))
model2.add(tf.keras.layers.Dense(30, activation='relu',kernel_regularizer=regularizers.l2(0.001)))
model2.add(tf.keras.layers.Dropout(0.2))
model2.add(tf.keras.layers.Dense(2, activation='softmax'))

#### Model Compilation

In [None]:
# Same optimiser and loss settings as model 1: Adam at 0.001, sparse categorical cross-entropy.
adam_optimizer_m2 = tf.optimizers.Adam(learning_rate=0.001)
model2.compile(
    optimizer=adam_optimizer_m2,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

#### Model Training

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
filepath="cp/sentiment_model_2.Grp160"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

!mkdir cp

callbacks_list = [checkpoint]

In [None]:
# `train` is a tf.data pipeline that already yields batches of 2000. Keras does
# not apply fit(batch_size=...) to dataset inputs (depending on the version it
# is ignored or rejected), so the intended batch size of 128 has to be set on
# the dataset itself.
train_bs128 = train.unbatch().batch(128).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

history_m2 = model2.fit(train_bs128, epochs=5, validation_data=test, callbacks=callbacks_list)

#### Model Evaluation - Print Train/Test Accuracy

In [None]:
# Per-epoch curves recorded by fit(); the last entry is the final epoch.
m2_curves = history_m2.history
loss = m2_curves['loss']
v_loss = m2_curves['val_loss']

print('Final training loss \t', loss[-1])
print('Final training accuracy ', m2_curves['accuracy'][-1])

In [None]:
# Evaluate model 2 on the held-out arrays; `score` holds [loss, accuracy].
score = model2.evaluate(
    x=test_sentences,
    y=test_sentiments,
    verbose=0,
)
print(f'Test loss for Keras ReLU : {score[0]} / Test accuracy: {score[1]}')

In [None]:
# One figure per metric for model 2: training curve vs. validation curve.
curve_specs_m2 = [
    ('accuracy', 'Training accuracy', 'Validation accuracy',
     'Basic Model ReLU training / validation accuracies', 'Accuracy'),
    ('loss', 'Training loss', 'Validation loss',
     ' ReLU training / validation loss values', 'Loss value'),
]

for metric, train_label, val_label, plot_title, y_label in curve_specs_m2:
    plt.plot(history_m2.history[metric], label=train_label)
    plt.plot(history_m2.history['val_' + metric], label=val_label)
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(loc="upper left")
    plt.show()

# 6) Model-3 : DNN With Different Optimizer (RMSprop, Learning Rate 0.001), Without Regularizer and Drop Out

#### Model Building

In [None]:
from keras.regularizers import l2
from tensorflow.keras import regularizers

# Give model 3 its own embedding layer. Re-adding the `hub_layer` object used by
# the earlier models would share that layer's trainable weights, so model 3
# would start from embeddings they had already fine-tuned and the comparison
# between the models would not be like-for-like.
hub_layer_m3 = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1", output_shape=[128],
                              input_shape=[], dtype=tf.string, trainable=True)

# Embedding -> 2x BiLSTM -> plain dense layers (no L2 penalty, no dropout)
# -> 2-way softmax.
model3 = tf.keras.Sequential()
model3.add(hub_layer_m3)
# Insert an axis at position 1 so the recurrent layers receive a sequence dimension.
model3.add(tf.keras.layers.Lambda(lambda x: tf.expand_dims(x,1)))
model3.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(160,return_sequences=True)))
model3.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(160)))
model3.add(tf.keras.layers.Dense(160, activation='relu'))
model3.add(tf.keras.layers.Dense(30, activation='relu'))
model3.add(tf.keras.layers.Dense(2, activation='softmax'))

#### Model Compilation

In [None]:
# RMSprop at learning rate 0.001 instead of Adam; loss and metric as for the other models.
rmsprop_optimizer = tf.optimizers.RMSprop(learning_rate=0.001)
model3.compile(
    optimizer=rmsprop_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

#### Model Training

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
filepath="cp/sentiment_model_3.Grp160"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

!mkdir cp

callbacks_list = [checkpoint]

In [None]:
# `train` is a tf.data pipeline that already yields batches of 2000. Keras does
# not apply fit(batch_size=...) to dataset inputs (depending on the version it
# is ignored or rejected), so the intended batch size of 128 has to be set on
# the dataset itself.
train_bs128 = train.unbatch().batch(128).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

history_m3 = model3.fit(train_bs128, epochs=5, validation_data=test, callbacks=callbacks_list)

#### Model Evaluation

In [None]:
# Per-epoch curves recorded by fit(); the last entry is the final epoch.
m3_curves = history_m3.history
loss = m3_curves['loss']
v_loss = m3_curves['val_loss']

print('Final training loss \t', loss[-1])
print('Final training accuracy ', m3_curves['accuracy'][-1])

In [None]:
# Evaluate model 3 on the held-out arrays; `score` holds [loss, accuracy].
score = model3.evaluate(
    x=test_sentences,
    y=test_sentiments,
    verbose=0,
)
print(f'Test loss for Keras ReLU : {score[0]} / Test accuracy: {score[1]}')

In [None]:
# One figure per metric for model 3: training curve vs. validation curve.
curve_specs_m3 = [
    ('accuracy', 'Training accuracy', 'Validation accuracy',
     'Basic Model ReLU training / validation accuracies', 'Accuracy'),
    ('loss', 'Training loss', 'Validation loss',
     ' ReLU training / validation loss values', 'Loss value'),
]

for metric, train_label, val_label, plot_title, y_label in curve_specs_m3:
    plt.plot(history_m3.history[metric], label=train_label)
    plt.plot(history_m3.history['val_' + metric], label=val_label)
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(loc="upper left")
    plt.show()

# 7) Model Comparison

The model accuracies are as follows:
    

   
    
    
    Model 1 :
             Train Accuracy => 88.15%
             Train Loss => 0.2736
             Test Accuracy => 89.05%
             Test Loss => 0.2592
    Model 2 :
             Train Accuracy => 89.55%
             Train Loss => 0.2424
             Test Accuracy => 89.8%
             Test Loss => 0.2114


    Model 3 :
             Train Accuracy => 89.91%
             Train Loss => 0.2383
             Test Accuracy => 90.49%
             Test Loss => 0.2179

  As per the model statistics, Model 3 performs better than Models 1 and 2,
  i.e. the model with RMSprop, a learning rate of 0.001 and a batch size of 128 performs better than the other two models.