In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!ls "/content/drive/MyDrive/DeepLearning/Common Literacy"

 CL_AVGW2V_POS.ipynb		     CL_CNN_Glove_TFIDF.ipynb
 CL_AVGW2V_TFIDFAVGW2V_POS.ipynb    'CommonLiteracyBertModels_VV (1).ipynb'
 CL_BOW_TFIDF_AvgW2V100D.ipynb	     CommonLiteracyBertModels_VV.ipynb
 CL_BOW_TFIDF_AvgW2V100D_SVD.ipynb   CommonLiteracy_CNN_Model.ipynb
 CL_BOW_TFIDF_AVGW2V300D.ipynb	     Dataset
 CL_CNN_BiLSTM_Glove_TFIDF.ipynb


In [3]:
train ='/content/drive/MyDrive/DeepLearning/Common Literacy/Dataset/train.csv'
test = '/content/drive/MyDrive/DeepLearning/Common Literacy/Dataset/test.csv'

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LayerNormalization, BatchNormalization
from keras.layers import Embedding, LSTM, Bidirectional
from keras.layers import Conv1D, Flatten, MaxPool1D
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

%matplotlib inline

In [5]:
df_train = pd.read_csv(train)
df_test = pd.read_csv(test)

### NLTK Stopwords Removal

In [6]:
# importing stopwords from nltk library
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stopword = stopwords.words('english')
print(stopword)

stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'ag

### Cleaning the text data for any special characters and numerical characters

In [7]:
def testpreprocess(phrase):
    phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase)
    phrase = re.sub(r'\w*\d\w*', '', phrase).strip()
    return phrase

In [8]:
preprocessed_excerpt = []
for sentence in tqdm(df_train['excerpt'].values):
  sent = testpreprocess(sentence)
  sent = sent.lower().strip()
  sent = ' '.join(stemmer.stem(i) for i in sent.split(" "))
  sent = ' '.join(e for e in sent.split(" ") if e not in stopword)
  preprocessed_excerpt.append(sent)

df_train['cleaned_excerpt'] = preprocessed_excerpt

100%|██████████| 2834/2834 [00:12<00:00, 219.79it/s]


### Pretrained Glove Vector assignment

In [9]:
# Refer this for step by step method https://stackoverflow.com/questions/50060241/how-to-use-glove-word-embeddings-file-on-google-colaboratory

!wget http://nlp.stanford.edu/data/glove.6B.zip # get the glove vector from stanford library

--2021-07-19 17:51:10--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-07-19 17:51:10--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-07-19 17:51:11--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [10]:
#unzip the downloaded glove vectors to drive
!unzip glove*.zip


Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [11]:
# # creating the dictionary if word and vectors
print('Indexing word vectors.')

d_model = 100
glove_dict = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove_dict[word] = coefs
f.close()

print('Found %s word vectors.' % len(glove_dict))

# checking the dictionary word shape
print(glove_dict['the'].shape)

Indexing word vectors.
Found 400000 word vectors.
(100,)


In [12]:
# creating a set of the keys from glove dictionary
glove_words = set(glove_dict.keys())

### Positional Encoding

In [13]:
import math
def positionalEncoding1DSine(max_sentence_length):
  positional_embeddings_sin = np.zeros(max_sentence_length)
  for position in range(max_sentence_length):
      positional_embeddings_sin[position] = (np.sin(position/(math.log(max_sentence_length))))
  return positional_embeddings_sin

def positionalEncoding1DCosine(max_sentence_length):
  positional_embeddings_cos = np.zeros(max_sentence_length)
  for position in range(max_sentence_length):
      positional_embeddings_cos[position] = (np.cos(position/(math.log(max_sentence_length))))
  return positional_embeddings_cos

def positionalEncoding1D(max_sentence_length):
  positional_embeddings_cos = np.zeros(max_sentence_length)
  positional_embeddings_sin = np.zeros(max_sentence_length)
  for position in range(max_sentence_length):
      positional_embeddings_cos[position] = (np.cos(position/(math.log(max_sentence_length))))
      positional_embeddings_sin[position] = (np.sin(position/(math.log(max_sentence_length))))
  positional_embeddings = np.add(positional_embeddings_cos, positional_embeddings_sin)
  return positional_embeddings

def positionalEncoding2d(max_sentence_length,d_model):
  positional_embeddings = np.zeros((max_sentence_length, d_model))
  for position in range(max_sentence_length):
      for i in range(0, d_model, 2):
        positional_embeddings[position, i] = (np.sin(position/(10000 ** ((2*i) / d_model))))
        positional_embeddings[position, i + 1] = (np.cos(position/(10000 ** ((2*(i+1)) / d_model))))
  return positional_embeddings

### Glove Average Word 2 Vec With Positional Encoding

In [14]:
def Avg_W2V_Pos(selfie): # give a pandas series into
  array_col_size = selfie.size
  AvgW2V_list = []; 
  for sentence in tqdm(selfie.values): 
      sentence = sentence.split()
      max_sentence_length = len(sentence)
      empty_vector = np.empty((0,d_model),dtype=float)
      posEmbed = positionalEncoding2d(max_sentence_length,d_model)
      for word in sentence:
          if word in glove_words: 
              vector = glove_dict[word] 
              vectorTranspose = np.reshape(vector,(-1,d_model))
              empty_vector = np.append(empty_vector,vectorTranspose,axis=0)
          else:
            vectorZeros = np.zeros((1,d_model),dtype=float)
            empty_vector = np.append(empty_vector,vectorZeros,axis=0)
      WordEmbedPOS = np.add(posEmbed, empty_vector)
      AvgW2V_list.append(WordEmbedPOS)
  # AvgW2V_Array = np.stack(AvgW2V_list, axis=0) # used to flatten the vector to one axis and convert the list of list to array
  return AvgW2V_list

### Global TF-IDF Avg Word 2 Vec with Positional Embedding

In [15]:
def Tfidf_Avg_W2V_Pos(selfie):
  tfidf_model = TfidfVectorizer()
  tfidf_model.fit(selfie)
  # we are converting a dictionary with word as a key, and the idf as a value
  dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
  tfidf_words = set(tfidf_model.get_feature_names())

  tfidf_w2v_vectors = []
  for sentence in tqdm(selfie.values): 
    sentence = sentence.split()
    max_sentence_length = len(sentence)
    empty_vector = np.empty((0,d_model),dtype=float)
    posEmbed = positionalEncoding2d(max_sentence_length,d_model)
    tf_idf_weight = 0
    for word in sentence:
      if (word in glove_words) and (word in tfidf_words):
        vector = glove_dict[word]
        tf_idf = dictionary[word]*(sentence.count(word)/len(sentence)) # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
        vector_Tfidf = (vector * tf_idf) # calculating tfidf weighted w2v
        vectorTranspose = np.reshape(vector_Tfidf,(-1,d_model))
        empty_vector = np.append(empty_vector,vectorTranspose,axis=0)
      else:
        vectorZeros = np.zeros((1,d_model),dtype=float)
        empty_vector = np.append(empty_vector,vectorZeros,axis=0)
    WordEmbedPOS = np.add(posEmbed, empty_vector)
    tfidf_w2v_vectors.append(WordEmbedPOS)
  #tfidf_w2v_vectors_Array = np.stack(tfidf_w2v_vectors, axis=0) # used to flatten the vector to one axis and convert the list of list to array
  return tfidf_w2v_vectors

### Splitting data into Train and cross validation(or test): Stratified Sampling

In [16]:
Y = df_train['target']
X = df_train['cleaned_excerpt']

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

In [18]:
print(X_train[0:3])

841     one saturday morn tarik bedilu take flock comm...
2179    time answer conductor seiz arm help step push ...
161     chang ngan wa old capit china veri great citi ...
Name: cleaned_excerpt, dtype: object


### TF-IDF processing of text

In [19]:
vectorizer = TfidfVectorizer(ngram_range=(1, 1),min_df=5)
vectorizer.fit(X_train)
feature_names = vectorizer.get_feature_names()

X_train_excerpt_tfidf = vectorizer.transform(X_train)
X_test_excerpt_tfidf = vectorizer.transform(X_test)

print("="*100)
print("After vectorizations")
print("="*50)
print(X_train_excerpt_tfidf.shape, Y_train.shape)
print(X_test_excerpt_tfidf.shape, Y_test.shape)
print("="*100)

After vectorizations
(1898, 3905) (1898,)
(936, 3905) (936,)


In [20]:
tfidf_dict = dict(zip(vectorizer.get_feature_names(),list(vectorizer.idf_)))

### Word to Vector Representation

In [21]:
X_train_excerpt_w2v = Avg_W2V_Pos(X_train)
X_test_excerpt_w2v = Avg_W2V_Pos(X_test)

100%|██████████| 1898/1898 [00:30<00:00, 62.74it/s]
100%|██████████| 936/936 [00:15<00:00, 59.46it/s]


### TFIDF Word to Vector Representation

In [22]:
X_train_excerpt_tfidf_w2v = Tfidf_Avg_W2V_Pos(X_train)
X_test_excerpt_tfidf_w2v = Tfidf_Avg_W2V_Pos(X_test)

100%|██████████| 1898/1898 [00:31<00:00, 60.22it/s]
100%|██████████| 936/936 [00:15<00:00, 61.47it/s]


In [23]:
# adding padding to the text sequences
train_data_w2v_pos = pad_sequences(X_train_excerpt_w2v, maxlen=200)
test_data_w2v_pos = pad_sequences(X_test_excerpt_w2v,maxlen=200)

In [24]:
# adding padding to the text sequences
train_data_tfidf_w2v_pos = pad_sequences(X_train_excerpt_tfidf_w2v, maxlen=200)
test_data_tfidf_w2v_pos = pad_sequences(X_test_excerpt_tfidf_w2v,maxlen=200)

In [25]:
print(train_data_w2v_pos.shape)
print(test_data_w2v_pos.shape)

(1898, 200, 100)
(936, 200, 100)


In [26]:
print(train_data_tfidf_w2v_pos.shape)
print(test_data_tfidf_w2v_pos.shape)

(1898, 200, 100)
(936, 200, 100)


In [27]:
# Convert target to array
Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

### Model Definition

In [28]:
# Vocab_Size = 25000 # this is the total number of words being sent
# Word_Embedding_Size = 100 # this value is coming from Glove Vector Size
# Input_Sequence = 150 # this is based on the padding we have given

In [29]:
# from sklearn import metrics

# Y_pred_test = regressor.predict(test_data)
# test_residuals = Y_test - Y_pred_test
# mse_test = metrics.mean_squared_error(Y_test,Y_pred_test)

In [30]:
# print("Root Mean squared Error Test:",np.sqrt(mse_test))

### Positional Embedding 1 Dimension

In [32]:
# train_data_reshape1 = np.expand_dims(train_data, axis=1)
# test_data_reshape1 = np.expand_dims(test_data, axis=1)

In [33]:
# print(train_data_reshape1.shape)

In [34]:
from keras.layers import Conv2D
from keras.layers import MaxPool2D

In [41]:
classifier = Sequential()

# Adding a first convolutional layer with Max pooling
classifier.add(Conv1D(300, 5, padding = 'same', activation = 'relu', input_shape = (200, 100))),
classifier.add(MaxPool1D(pool_size=5))
classifier.add(Dropout(0.8))

# Step 3 Adding a first convolutional layer with Max pooling
classifier.add(Conv1D(200, 5, padding = 'same', activation = 'relu')),
classifier.add(MaxPool1D(pool_size=5))
classifier.add(Dropout(0.5))

classifier.add(Bidirectional(LSTM(200)))

# Step 3 - Flattening
classifier.add(Flatten())

# Step 5 - Full connection layer with sigmoid
classifier.add(Dense(256, activation = 'relu'))
classifier.add(Dense(128, activation = 'relu'))
classifier.add(Dense(1, activation = 'linear'))

# Compiling the CNN
classifier.compile(optimizer = 'adam', loss = 'mse', metrics=['mse'])
classifier.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_8 (Conv1D)            (None, 200, 300)          150300    
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 40, 300)           0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 40, 300)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 40, 200)           300200    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 8, 200)            0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 8, 200)            0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 400)              

In [42]:
classifier.fit(train_data_tfidf_w2v_pos, Y_train, batch_size=100, epochs=100, validation_split=0.2, verbose=2)

Epoch 1/100
16/16 - 24s - loss: 1.1519 - mse: 1.1519 - val_loss: 1.0445 - val_mse: 1.0445
Epoch 2/100
16/16 - 9s - loss: 1.0386 - mse: 1.0386 - val_loss: 1.0331 - val_mse: 1.0331
Epoch 3/100
16/16 - 10s - loss: 1.0059 - mse: 1.0059 - val_loss: 1.0083 - val_mse: 1.0083
Epoch 4/100
16/16 - 10s - loss: 0.9412 - mse: 0.9412 - val_loss: 0.8536 - val_mse: 0.8536
Epoch 5/100
16/16 - 9s - loss: 0.7621 - mse: 0.7621 - val_loss: 0.8691 - val_mse: 0.8691
Epoch 6/100
16/16 - 10s - loss: 0.7010 - mse: 0.7010 - val_loss: 0.8787 - val_mse: 0.8787
Epoch 7/100
16/16 - 9s - loss: 0.6797 - mse: 0.6797 - val_loss: 0.8177 - val_mse: 0.8177
Epoch 8/100
16/16 - 9s - loss: 0.6171 - mse: 0.6171 - val_loss: 0.8531 - val_mse: 0.8531
Epoch 9/100
16/16 - 9s - loss: 0.6195 - mse: 0.6195 - val_loss: 0.7269 - val_mse: 0.7269
Epoch 10/100
16/16 - 10s - loss: 0.6033 - mse: 0.6033 - val_loss: 0.7427 - val_mse: 0.7427
Epoch 11/100
16/16 - 9s - loss: 0.5540 - mse: 0.5540 - val_loss: 0.7123 - val_mse: 0.7123
Epoch 12/100
1

<keras.callbacks.History at 0x7f627bb9e110>