In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
!ls "/content/drive/MyDrive/DeepLearning/Common Literacy"

CL_Bert_Base_Uncased.ipynb
CL_CNN_BiLSTM_Glove100d_TFIDF.ipynb
CL_CNN_Glove100d_TFIDF.ipynb
CL_CNN_Model.ipynb
CL_Glove100d_AVGW2V_POS.ipynb
CL_Glove100d_AVGW2V_TFIDFAVGW2V_POS.ipynb
CL_Glove100d_BOW_TFIDF_AvgW2V.ipynb
CL_Glove100d_BOW_TFIDF_AvgW2V_SVD.ipynb
CL_Glove100d_FastText_AVGW2V_POS.ipynb
CL_Glove300d_BOW_TFIDF_AVGW2V.ipynb
Dataset
FastText
GloveVector


In [3]:
train ='/content/drive/MyDrive/DeepLearning/Common Literacy/Dataset/train.csv'
test = '/content/drive/MyDrive/DeepLearning/Common Literacy/Dataset/test.csv'

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LayerNormalization, BatchNormalization
from keras.layers import Embedding, LSTM, Bidirectional
from keras.layers import Conv1D, Flatten, MaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D, AveragePooling1D
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

%matplotlib inline

In [5]:
df_train = pd.read_csv(train)
df_test = pd.read_csv(test)

### NLTK Stopwords Removal

In [6]:
# importing stopwords from nltk library
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stopword = stopwords.words('english')
print(stopword)

stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',

### Cleaning the text data for any special characters and numerical characters

In [7]:
def testpreprocess(phrase):
    phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase)
    phrase = re.sub(r'\w*\d\w*', '', phrase).strip()
    return phrase

In [8]:
preprocessed_excerpt = []
for sentence in tqdm(df_train['excerpt'].values):
  sent = testpreprocess(sentence)
  sent = sent.lower().strip()
  sent = ' '.join(stemmer.stem(i) for i in sent.split(" "))
  sent = ' '.join(e for e in sent.split(" ") if e not in stopword)
  preprocessed_excerpt.append(sent)

df_train['cleaned_excerpt'] = preprocessed_excerpt

100%|██████████| 2834/2834 [00:10<00:00, 277.69it/s]


### Pretrained Glove Vector assignment

In [9]:
# Refer this for step by step method https://stackoverflow.com/questions/50060241/how-to-use-glove-word-embeddings-file-on-google-colaboratory

# !wget http://nlp.stanford.edu/data/glove.6B.zip # get the glove vector from stanford library

In [10]:
#unzip the downloaded glove vectors to drive
# !unzip glove*.zip


In [11]:
# creating the dictionary if word and vectors
print('Indexing word vectors.')

d_model = 300
glove_dict = {}
f = open('/content/drive/MyDrive/DeepLearning/Common Literacy/GloveVector/glove.6B.300d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove_dict[word] = coefs
f.close()

print('Found %s word vectors.' % len(glove_dict))

# checking the dictionary word shape
print(glove_dict['the'].shape)

Indexing word vectors.
Found 400000 word vectors.
(300,)


In [12]:
# creating a set of the keys from glove dictionary
glove_words = set(glove_dict.keys())

### Pretrained Fast Text Vectors

In [13]:
# https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip

# creating the dictionary if word and vectors
print('Indexing word vectors.')

fasttext_dict = {}
f = open('/content/drive/MyDrive/DeepLearning/Common Literacy/FastText/wiki.simple.vec', encoding='utf-8')
for line in f:
  values = line.strip().rsplit(' ')
  word = values[0]
  coefs = np.asarray(values[1:], dtype='float32')
  fasttext_dict[word] = coefs
f.close()

print('Found %s word vectors.' % len(fasttext_dict))

# checking the dictionary word shape
print(fasttext_dict['the'].shape)

Indexing word vectors.
Found 111024 word vectors.
(300,)


In [14]:
# creating a set of the keys from glove dictionary
fasttext_words = set(fasttext_dict.keys())

### Positional Encoding

In [15]:
import math
def positionalEncoding1DSine(max_sentence_length):
  positional_embeddings_sin = np.zeros(max_sentence_length)
  for position in range(max_sentence_length):
      positional_embeddings_sin[position] = (np.sin(position/(math.log(max_sentence_length))))
  return positional_embeddings_sin

def positionalEncoding1DCosine(max_sentence_length):
  positional_embeddings_cos = np.zeros(max_sentence_length)
  for position in range(max_sentence_length):
      positional_embeddings_cos[position] = (np.cos(position/(math.log(max_sentence_length))))
  return positional_embeddings_cos

def positionalEncoding1D(max_sentence_length):
  positional_embeddings_cos = np.zeros(max_sentence_length)
  positional_embeddings_sin = np.zeros(max_sentence_length)
  for position in range(max_sentence_length):
      positional_embeddings_cos[position] = (np.cos(position/(math.log(max_sentence_length))))
      positional_embeddings_sin[position] = (np.sin(position/(math.log(max_sentence_length))))
  positional_embeddings = np.add(positional_embeddings_cos, positional_embeddings_sin)
  return positional_embeddings

def positionalEncoding2d(max_sentence_length,d_model):
  positional_embeddings = np.zeros((max_sentence_length, d_model))
  for position in range(max_sentence_length):
      for i in range(0, d_model, 2):
        positional_embeddings[position, i] = (np.sin(position/(10000 ** ((2*i) / d_model))))
        positional_embeddings[position, i + 1] = (np.cos(position/(10000 ** ((2*(i+1)) / d_model))))
  return positional_embeddings

### Glove Average Word 2 Vec With Glove Positional Encoding

In [16]:
def Avg_W2V_Pos(selfie): # give a pandas series into
  array_col_size = selfie.size
  AvgW2V_list = []; 
  for sentence in tqdm(selfie.values): 
      sentence = sentence.split()
      max_sentence_length = len(sentence)
      empty_vector = np.empty((0,d_model),dtype=float)
      posEmbed = positionalEncoding2d(max_sentence_length,d_model)
      for word in sentence:
          if word in glove_words: 
              vector = glove_dict[word] 
              vectorTranspose = np.reshape(vector,(-1,d_model))
              empty_vector = np.append(empty_vector,vectorTranspose,axis=0)
          else:
            vectorZeros = np.zeros((1,d_model),dtype=float)
            empty_vector = np.append(empty_vector,vectorZeros,axis=0)
      WordEmbedPOS = np.add(posEmbed, empty_vector)
      AvgW2V_list.append(WordEmbedPOS)
  # AvgW2V_Array = np.stack(AvgW2V_list, axis=0) # used to flatten the vector to one axis and convert the list of list to array
  return AvgW2V_list

### Glove Average Word 2 Vec With Glove and FastText Positional Encoding

In [17]:
def Avg_W2V_Pos_Concat(selfie): # give a pandas series into
  array_col_size = selfie.size
  AvgW2V_list = []; 
  for sentence in tqdm(selfie.values): 
      sentence = sentence.split()
      max_sentence_length = len(sentence)
      empty_vector_glove = np.empty((0,d_model),dtype=float)
      empty_vector_fasttext = np.empty((0,d_model),dtype=float)
      posEmbed = positionalEncoding2d(max_sentence_length,d_model)
      for word in sentence:
          if word in glove_words: 
              vector_glove = glove_dict[word] 
              vectorTranspose_glove = np.reshape(vector_glove,(-1,d_model))
              empty_vector_glove = np.append(empty_vector_glove,vectorTranspose_glove,axis=0)
          else:
            vectorZeros_glove = np.zeros((1,d_model),dtype=float)
            empty_vector_glove = np.append(empty_vector_glove,vectorZeros_glove,axis=0)

          if word in fasttext_words: 
              vector_fasttext = fasttext_dict[word] 
              vectorTranspose_fasttext = np.reshape(vector_fasttext,(-1,d_model))
              empty_vector_fasttext = np.append(empty_vector_fasttext,vectorTranspose_fasttext,axis=0)
          else:
            vectorZeros_fasttext = np.zeros((1,d_model),dtype=float)
            empty_vector_fasttext = np.append(empty_vector_fasttext,vectorZeros_fasttext,axis=0)
      
      WordEmbed_Glove = np.add(posEmbed, empty_vector_glove)
      WordEmbed_Fasttext = np.add(posEmbed, empty_vector_fasttext)
      WordEmbedPOS = np.hstack((WordEmbed_Glove, WordEmbed_Fasttext))

      AvgW2V_list.append(WordEmbedPOS)
  # AvgW2V_Array = np.stack(AvgW2V_list, axis=0) # used to flatten the vector to one axis and convert the list of list to array
  return AvgW2V_list

### Global TF-IDF Avg Word 2 Vec with Glove Positional Embedding

In [18]:
def Tfidf_Avg_W2V_Pos(selfie):
  tfidf_model = TfidfVectorizer()
  tfidf_model.fit(selfie)
  # we are converting a dictionary with word as a key, and the idf as a value
  dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
  tfidf_words = set(tfidf_model.get_feature_names())

  tfidf_w2v_vectors = []
  for sentence in tqdm(selfie.values): 
    sentence = sentence.split()
    max_sentence_length = len(sentence)
    empty_vector = np.empty((0,d_model),dtype=float)
    posEmbed = positionalEncoding2d(max_sentence_length,d_model)
    tf_idf_weight = 0
    for word in sentence:
      if (word in glove_words) and (word in tfidf_words):
        vector = glove_dict[word]
        tf_idf = dictionary[word]*(sentence.count(word)/len(sentence)) # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
        vector_Tfidf = (vector * tf_idf) # calculating tfidf weighted w2v
        vectorTranspose = np.reshape(vector_Tfidf,(-1,d_model))
        empty_vector = np.append(empty_vector,vectorTranspose,axis=0)
      else:
        vectorZeros = np.zeros((1,d_model),dtype=float)
        empty_vector = np.append(empty_vector,vectorZeros,axis=0)
    WordEmbedPOS = np.add(posEmbed, empty_vector)
    tfidf_w2v_vectors.append(WordEmbedPOS)
  #tfidf_w2v_vectors_Array = np.stack(tfidf_w2v_vectors, axis=0) # used to flatten the vector to one axis and convert the list of list to array
  return tfidf_w2v_vectors

### Splitting data into Train and cross validation(or test): Stratified Sampling

In [19]:
Y = df_train['target']
# X = df_train['cleaned_excerpt']
X = df_train['excerpt']

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

In [21]:
print(X_train[0:3])

1371    Then the causes of the war are summed up and t...
1712    The tube that communicates with the vessel, F,...
33      Katie Haydon is going to London, ma'am. Did sh...
Name: excerpt, dtype: object


### Word to Vector Representation

In [22]:
X_train_excerpt_w2v = Avg_W2V_Pos(X_train)
X_test_excerpt_w2v = Avg_W2V_Pos(X_test)

100%|██████████| 1898/1898 [02:15<00:00, 13.96it/s]
100%|██████████| 936/936 [01:07<00:00, 13.90it/s]


### Word to Vector Representation with Glove Fasttext Concat

In [23]:
X_train_excerpt_w2v_concat = Avg_W2V_Pos_Concat(X_train)
X_test_excerpt_w2v_concat = Avg_W2V_Pos_Concat(X_test)

100%|██████████| 1898/1898 [02:28<00:00, 12.79it/s]
100%|██████████| 936/936 [01:14<00:00, 12.56it/s]


### TFIDF Word to Vector Representation

In [24]:
X_train_excerpt_tfidf_w2v = Tfidf_Avg_W2V_Pos(X_train)
X_test_excerpt_tfidf_w2v = Tfidf_Avg_W2V_Pos(X_test)

100%|██████████| 1898/1898 [02:20<00:00, 13.56it/s]
100%|██████████| 936/936 [01:08<00:00, 13.65it/s]


### Padding Sequences

In [25]:
# adding padding to the text sequences
train_data_w2v_pos = pad_sequences(X_train_excerpt_w2v, maxlen=200)
test_data_w2v_pos = pad_sequences(X_test_excerpt_w2v,maxlen=200)

In [26]:
# adding padding to the text sequences
train_data_w2v_pos_concat = pad_sequences(X_train_excerpt_w2v_concat, maxlen=200)
test_data_w2v_pos_concat = pad_sequences(X_test_excerpt_w2v_concat,maxlen=200)

In [27]:
# adding padding to the text sequences
train_data_tfidf_w2v_pos = pad_sequences(X_train_excerpt_tfidf_w2v, maxlen=200)
test_data_tfidf_w2v_pos = pad_sequences(X_test_excerpt_tfidf_w2v,maxlen=200)

In [28]:
print(train_data_w2v_pos.shape)
print(test_data_w2v_pos.shape)

(1898, 200, 300)
(936, 200, 300)


In [29]:
print(train_data_w2v_pos_concat.shape)
print(test_data_w2v_pos_concat.shape)

(1898, 200, 600)
(936, 200, 600)


In [30]:
print(train_data_tfidf_w2v_pos.shape)
print(test_data_tfidf_w2v_pos.shape)

(1898, 200, 300)
(936, 200, 300)


In [31]:
# Convert target to array
Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

### Model Definition

In [32]:
# Vocab_Size = 25000 # this is the total number of words being sent
# Word_Embedding_Size = 100 # this value is coming from Glove Vector Size
# Input_Sequence = 150 # this is based on the padding we have given

In [33]:
# from sklearn import metrics

# Y_pred_test = regressor.predict(test_data)
# test_residuals = Y_test - Y_pred_test
# mse_test = metrics.mean_squared_error(Y_test,Y_pred_test)

In [34]:
# print("Root Mean squared Error Test:",np.sqrt(mse_test))

### Convolution Neural Network 1D

In [41]:
classifier = Sequential()

# Adding a first convolutional layer with Max pooling
classifier.add(Conv1D(1800, 5, padding = 'same', activation = 'relu', input_shape = (200, 600))),
classifier.add(AveragePooling1D(pool_size=5))
classifier.add(Dropout(0.8))

# MaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D

# Step 3 Adding a first convolutional layer with Max pooling
classifier.add(Conv1D(600, 5, padding = 'same', activation = 'relu')),
classifier.add(AveragePooling1D(pool_size=5))
classifier.add(Dropout(0.5))

classifier.add(Bidirectional(LSTM(600)))

# Step 3 - Flattening
classifier.add(Flatten())

# Step 5 - Full connection layer with sigmoid
classifier.add(Dense(256, activation = 'relu'))
classifier.add(Dense(128, activation = 'relu'))
classifier.add(Dense(1, activation = 'linear'))

# Compiling the CNN
classifier.compile(optimizer = 'adam', loss = 'mse', metrics=['mse'])
classifier.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_6 (Conv1D)            (None, 200, 1800)         5401800   
_________________________________________________________________
average_pooling1d_3 (Average (None, 40, 1800)          0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 40, 1800)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 40, 600)           5400600   
_________________________________________________________________
average_pooling1d_4 (Average (None, 8, 600)            0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 8, 600)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 1200)             

In [43]:
classifier.fit(train_data_w2v_pos_concat, Y_train, batch_size=200, epochs=500, validation_split=0.2, verbose=2)

Epoch 1/500
8/8 - 2s - loss: 0.4553 - mse: 0.4553 - val_loss: 0.5471 - val_mse: 0.5471
Epoch 2/500
8/8 - 2s - loss: 0.4563 - mse: 0.4563 - val_loss: 0.4858 - val_mse: 0.4858
Epoch 3/500
8/8 - 2s - loss: 0.4005 - mse: 0.4005 - val_loss: 0.4902 - val_mse: 0.4902
Epoch 4/500
8/8 - 2s - loss: 0.4070 - mse: 0.4070 - val_loss: 0.5185 - val_mse: 0.5185
Epoch 5/500
8/8 - 2s - loss: 0.3799 - mse: 0.3799 - val_loss: 0.4790 - val_mse: 0.4790
Epoch 6/500
8/8 - 2s - loss: 0.3740 - mse: 0.3740 - val_loss: 0.4772 - val_mse: 0.4772
Epoch 7/500
8/8 - 2s - loss: 0.3556 - mse: 0.3556 - val_loss: 0.5701 - val_mse: 0.5701
Epoch 8/500
8/8 - 2s - loss: 0.4044 - mse: 0.4044 - val_loss: 0.4965 - val_mse: 0.4965
Epoch 9/500
8/8 - 2s - loss: 0.3566 - mse: 0.3566 - val_loss: 0.4890 - val_mse: 0.4890
Epoch 10/500
8/8 - 2s - loss: 0.3504 - mse: 0.3504 - val_loss: 0.5551 - val_mse: 0.5551
Epoch 11/500
8/8 - 2s - loss: 0.3761 - mse: 0.3761 - val_loss: 0.4730 - val_mse: 0.4730
Epoch 12/500
8/8 - 2s - loss: 0.3341 - ms

<keras.callbacks.History at 0x7ff5084afe50>