<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/Removed_Irrelevant/Annotations_BoW_Removed_Irrelevant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RtGender- Annotations - BoW Neural Net - Removed Irrelevant
---
In this notebook we build the baseline models for the RtGender dataset. 

Characteristics of the models:
- Neural Bag of Words architecture
- A single dense hidden layer (the final model built below does not include a dropout layer)
- Use GloVe embeddings (dim=300) without fine-tuning them
- Maximum sequence length is 100




In [1]:
# Pick the data directory: Google Drive when running on Colab, a local `data`
# folder otherwise. Only the import is guarded, so an error raised while
# mounting the drive is reported instead of being mistaken for "not on Colab".
try:
  from google.colab import drive
except ModuleNotFoundError:
  path = r'data'
else:
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'

Mounted at /content/drive


## Package imports

In [2]:
%%capture
!pip install keras_tuner

In [31]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline
import seaborn as sns

from collections import Counter

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, Dropout
from keras.models import Sequential
from tensorflow.keras.layers import TextVectorization
import tensorflow.keras.backend as K
# for hyperparameter tuning
import keras_tuner as kt
from keras_tuner import HyperModel
import keras_tuner as kt
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences


# import sklearn to calculate the metrics
from sklearn import metrics
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
# Show the TensorFlow version the kernel is running.
print(tf.__version__)

2.6.0


In [4]:
import os
# Display the kernel's current working directory.
os.getcwd()

'/content'

# Load
Load the training and development datasets. See the RtGender Split and Save notebook for the splitting approach.

In [7]:
# Read the train/dev splits from the data directory chosen in the first cell
# (`path`), so the notebook also works with the local fallback outside Colab.
train_df = pd.read_csv(os.path.join(path, 'annotations_train.csv')) #train_oversampled
dev_df = pd.read_csv(os.path.join(path, 'annotations_dev.csv'))

print('train_shape: ',train_df.shape)
print('dev_shape: ',dev_df.shape)

train_shape:  (10746, 9)
dev_shape:  (2303, 9)


In [8]:
# Keep only the rows whose `relevance` annotation is not 'Irrelevant'.
train_df = train_df.loc[train_df['relevance'].ne('Irrelevant')]
dev_df = dev_df.loc[dev_df['relevance'].ne('Irrelevant')]

# Report the sizes of both splits after filtering.
print('updated train_shape: ', train_df.shape)
print('updated dev_shape: ', dev_df.shape)

updated train_shape:  (9482, 9)
updated dev_shape:  (2042, 9)


In [9]:
# Show any dev rows that contain a missing value in any column.
nan_values = dev_df[dev_df.isna().any(axis=1)]
print(nan_values)

# Drop rows with a missing response_text. Reassigning (instead of inplace=True)
# avoids mutating a filtered slice of the original frame, and the train split is
# cleaned as well because the text of both splits is tokenized later on.
train_df = train_df.dropna(subset=["response_text"])
dev_df = dev_df.dropna(subset=["response_text"])

Empty DataFrame
Columns: [Unnamed: 0, source, op_gender, post_text, response_text, sentiment, relevance, label, labels_4]
Index: []


In [10]:
# Inputs are the response texts; targets come from the `labels_4` column.
X_train, y_train = train_df['response_text'], train_df['labels_4']
X_dev, y_dev = dev_df['response_text'], dev_df['labels_4']

## Load GloVe 

In [None]:
# load pre-trained word embeddings. In this case Glove
# This is commented out to avoid downloading it again
!wget http://nlp.stanford.edu/data/glove.6B.zip -P ~/data/

# # unzip the file
# # commented out for the same reason above
!unzip ~/data/glove.6B.zip -d ~/data/

In [13]:
# path to glove file- will use the embeddings with dimension = 300
# (same ~/data/ directory the download cell writes to)
glove_file = os.path.expanduser("~/data/glove.6B.300d.txt")

# Map each word to its embedding vector.
embeddings_index = {}
# `with` guarantees the file is closed even if parsing a line fails;
# the encoding is stated explicitly instead of relying on the platform default.
with open(glove_file, encoding="utf-8") as f:
    for line in f:
        # each line is: <word> <coef_1> ... <coef_n>, whitespace separated
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


# Preprocess

In [14]:
# Tokenizer configuration.
# consider this maximum number of words
max_tokens = 10000
# maximum sequence length used when padding the tokenized texts
max_sequence_length = 100
tokenizer = Tokenizer(num_words=max_tokens)

# One-hot encode the labels. num_classes is fixed so that train and dev always
# get the same number of columns (matching the 4-unit output layer), even if a
# class happens to be absent from one of the splits.
num_classes = 4
print("-"*60)

train_labels = to_categorical(np.asarray(y_train), num_classes=num_classes)
dev_labels = to_categorical(np.asarray(y_dev), num_classes=num_classes)

print('Shape of train label tensor:', train_labels.shape)
print('Shape of dev label tensor:', dev_labels.shape)

------------------------------------------------------------
Shape of train label tensor: (9482, 4)
Shape of dev label tensor: (2042, 4)


## Preprocess & Embeddings matrix

In [15]:
x_train = X_train
x_dev = X_dev
# note the length of the training index
train_idx = len(x_train)
# combine train and dev data and then tokenize
# (pd.concat replaces Series.append, which is deprecated and removed in pandas 2)
texts = pd.concat([x_train, x_dev])

tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Train + Dev %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=max_sequence_length)
print('Shape of data tensor:', data.shape)

# split the padded matrix back into its train and dev parts
x_train = data[:train_idx]
x_dev = data[train_idx:]
print('Shape of train data tensor:', x_train.shape)
print('Shape of dev data tensor:', x_dev.shape)
print("-"*60)

# print top 5 most and least common tokens
# (tokens sorted by descending index; per the Keras Tokenizer docs lower
# indices belong to more frequent words, so the tail holds the most common)
tokens_by_index_desc = sorted(word_index, key=word_index.get, reverse=True)
print("top 5 most common tokens: ", tokens_by_index_desc[-5:])
print("top 5 least common tokens: ", tokens_by_index_desc[:5])
print("-"*60)

# build embedding matrix to use it in the model
dimensions_emb = 300

# zero position is zero by default in keras
embedding_matrix = np.zeros((len(word_index) + 1, dimensions_emb))

total_tokens = len(word_index) + 1
with_embedding = []
without_embedding = []

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        with_embedding.append(word)
    else:
        # words not found in embedding index will be all-zeros.
        without_embedding.append(word)

print(f'Number of words with embeddings found: {len(with_embedding)}')
print(f'Number of words with embeddings NOT found: {len(without_embedding)}')



Train + Dev 18096 unique tokens.
Shape of data tensor: (11524, 100)
Shape of train data tensor: (9482, 100)
Shape of dev data tensor: (2042, 100)
------------------------------------------------------------
top 5 most common tokens:  ['and', 'you', 'i', 'to', 'the']
top 5 least common tokens:  ['grandeur', 'inadequate', 'woefully', 'lodge', 'jazzy']
------------------------------------------------------------
Number of words with embeddings found: 15510
Number of words with embeddings NOT found: 2586


## Metrics

In [16]:
def _loss_accuracy_plots(training_history, accuracy_key):
    '''
    Draw side-by-side accuracy and loss curves (train vs. dev) for one training run.
    Parameters:
    training_history: object that stores the training history of the NN (from model.fit(...))
    accuracy_key: name of the accuracy metric in training_history.history;
                  the dev curve is read from 'val_' + accuracy_key
    '''
    history = training_history.history
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,6))

    # (axis, key in the history dict, label used for the title and the y axis)
    panels = (
        (ax1, accuracy_key, 'accuracy'),
        (ax2, 'loss', 'loss'),
    )
    for ax, key, label in panels:
        ax.plot(history[key])
        ax.plot(history['val_' + key])
        ax.set_title('model ' + label)
        ax.set_ylabel(label)
        ax.set_xlabel('epoch')
        ax.legend(['train', 'dev'], loc='upper left')

    plt.show()


def binary_loss_accuracy_plots(training_history):
    '''
    Shows plots for loss and binary accuracy during the training process of a NN.
    Parameters:
    training_history: object that stores the training history of the NN (from model.fit(...));
                      its history must contain 'binary_accuracy', 'val_binary_accuracy',
                      'loss' and 'val_loss'
    '''
    _loss_accuracy_plots(training_history, 'binary_accuracy')


def multiclass_loss_accuracy_plots(training_history):
    '''
    Shows plots for loss and categorical accuracy during the training process of a NN.
    Parameters:
    training_history: object that stores the training history of the NN (from model.fit(...));
                      its history must contain 'categorical_accuracy',
                      'val_categorical_accuracy', 'loss' and 'val_loss'
    '''
    _loss_accuracy_plots(training_history, 'categorical_accuracy')

In [18]:
def confusion_matrix(x_dev, original_dev_labels, class_labels, y_pred=None):
    '''
    Plots a row-normalized confusion matrix and prints the classification report
    and the weighted f1-score.
    Parameters:
    x_dev: not used by the function; kept so existing calls keep working
    original_dev_labels: true class ids (integer labels, not one-hot)
    class_labels: tick labels for the heatmap axes
    y_pred: per-class scores with one row per example; when omitted, the
            notebook-level variable `y_pred` is used, as before
    '''
    if y_pred is None:
        # Backward compatibility: earlier calls relied on a global `y_pred`.
        try:
            y_pred = globals()['y_pred']
        except KeyError:
            raise NameError("name 'y_pred' is not defined") from None

    # identify the correct class
    max_class_idx = np.argmax(y_pred, axis = 1)

    # Create a confusion matrix (rows: true labels, columns: predictions)
    cm = tf.math.confusion_matrix(original_dev_labels, max_class_idx)
    # normalize each row by the number of examples of that true class
    cm = cm/cm.numpy().sum(axis=1)[:, tf.newaxis]

    sns.heatmap(
        cm, annot=True,
        xticklabels=class_labels,
        yticklabels=class_labels,
        fmt='.2%', cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

    # print classification report
    print(metrics.classification_report(original_dev_labels, max_class_idx))
    # print global weighted f-1 score
    # (pos_label is only meaningful for average='binary', so it is not passed)
    f1 = metrics.f1_score(original_dev_labels, max_class_idx, average='weighted')*100
    print(f"Weighted f1-score: {f1:.2f}%")


# Optimal Model Iterated

See the RtGender_Annotations_Removed_Irrelevant_BoW_Neural_Net notebook for the hyperparameter search that produced the optimal model.

"The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 40 and the optimal learning rate for the optimizer
is 0.001."

In [39]:
# Number of times the training/evaluation loop below is repeated.
iterations = 5
# Embedding layer initialised from the pre-built embedding matrix and frozen
# (trainable=False), so the word vectors are not updated during training.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            input_length=max_sequence_length,
                            trainable=False)

model = tf.keras.Sequential()
model.add(embedding_layer)
# Bag-of-words pooling: average the word vectors over the sequence axis.
# Equivalent to K.mean(x, axis=1), but uses the built-in layer instead of a
# Lambda wrapping a Python lambda (Lambda layers have serialization limitations).
model.add(tf.keras.layers.GlobalAveragePooling1D()) #avg

# Hidden layer: 40 units, the value reported by the hyperparameter search
model.add(tf.keras.layers.Dense(units=40, activation='relu'))


# output layer
model.add(Dense(4, activation='softmax'))

# Learning rate 1e-3, the value reported by the hyperparameter search
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
            loss='categorical_crossentropy',
            metrics=[tf.keras.metrics.categorical_accuracy, 'categorical_crossentropy'])

In [40]:
from sklearn.metrics import classification_report
# Per-iteration dev-set metrics, one entry appended per loop pass.
accuracy_list = []
weighted_f1_score_list = []
macro_f1_score_list = []
negative_f1_score = []
neutral_f1_score = []
mixed_f1_score = []
positive_f1_score = []

for i in range(iterations):
  # Train and evaluate `model`, the network compiled in the previous cell
  # (the name used here before, `reconstructed_model`, is never defined in
  # this notebook and fails on a fresh kernel).
  # NOTE(review): the weights are not re-initialised between iterations, so
  # every pass continues training the same model - confirm this is intended.
  history = model.fit(x_train, train_labels,
                      validation_data=(x_dev, dev_labels),
                      epochs=50, verbose=0)
  y_pred = model.predict(x_dev) #predict

  # metrics append to appropriate lists
  predictions = y_pred.argmax(1)
  cr = metrics.classification_report(y_dev, predictions, digits=3, output_dict=True)

  accuracy_list.append(cr['accuracy'])
  weighted_f1_score_list.append(cr['weighted avg']['f1-score'])
  macro_f1_score_list.append(cr['macro avg']['f1-score'])

  # per-class f1: the report is keyed by the class id as a string
  negative_f1_score.append(cr['0']['f1-score'])
  neutral_f1_score.append(cr['1']['f1-score'])
  positive_f1_score.append(cr['2']['f1-score'])
  mixed_f1_score.append(cr['3']['f1-score'])

  print(f'---------------------------Iteration {i+1} Complete---------------------------\n')


---------------------------Iteration 1 Complete---------------------------

---------------------------Iteration 2 Complete---------------------------

---------------------------Iteration 3 Complete---------------------------

---------------------------Iteration 4 Complete---------------------------

---------------------------Iteration 5 Complete---------------------------



In [42]:
import statistics

def _print_mean_stdev(label, scores):
    '''Print one row: right-aligned label, then mean and (sample stdev) of scores, rounded to 3 places.'''
    mean = round(statistics.mean(scores), 3)
    stdev = round(statistics.stdev(scores), 3)
    # the same field widths for every row keep the columns aligned
    print("%15s %5s (%s)" % (label, mean, stdev))


def _print_section(title):
    '''Print a section title framed by two separator lines.'''
    print("-"*29)
    print(title)
    print("-"*29)


# header row
print("%15s %s (%s)" % ("","Mean", "StDev"))

_print_section("Macro Scores")
_print_mean_stdev("Accuracy", accuracy_list)
_print_mean_stdev("Macro F1", macro_f1_score_list)
_print_mean_stdev("Weighted F1", weighted_f1_score_list)

_print_section("Class Scores")
_print_mean_stdev("Positive", positive_f1_score)
_print_mean_stdev("Neutral", neutral_f1_score)
_print_mean_stdev("Negative", negative_f1_score)
_print_mean_stdev("Mixed", mixed_f1_score)

                Mean (StDev)
-----------------------------
Macro Scores
-----------------------------
       Accuracy 0.493 (0.007)
       Macro F1 0.328 (0.007)
    Weighted F1 0.467 (0.002)
-----------------------------
Class Scores
-----------------------------
       Positive 0.687 (0.008)
        Neutral 0.249 (0.015)
       Negative 0.254 (0.017)
          Mixed 0.122 (0.016)
