In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!ls "/content/drive/MyDrive/DeepLearning/Common Literacy"

CL_Bert_Base_Uncased.ipynb
CL_CNN_BiLSTM_Glove100d_TFIDF.ipynb
CL_CNN_Glove_FastText_300d_AvgW2V_TFIDF.ipynb
CL_CNN_Model.ipynb
CL_Elmo.ipynb
CL_Glove100d_AVGW2V_POS.ipynb
CL_Glove100d_AVGW2V_TFIDFAVGW2V_POS.ipynb
CL_Glove100d_BOW_TFIDF_AvgW2V.ipynb
CL_Glove100d_BOW_TFIDF_AvgW2V_SVD.ipynb
CL_Glove300d_BOW_TFIDF_AVGW2V.ipynb
CL_Glove_FastText_300d_AVGW2V_POS.ipynb
Dataset
FastText
GloveVector


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm

from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from mlxtend.regressor import StackingRegressor

# Gensim Models word2vec
import gensim
from gensim.models import Word2Vec
%matplotlib inline

In [None]:
train ='/content/drive/MyDrive/DeepLearning/Common Literacy/Dataset/train.csv'
test = '/content/drive/MyDrive/DeepLearning/Common Literacy/Dataset/test.csv'

In [None]:
df_train = pd.read_csv(train)
df_test = pd.read_csv(test)

In [None]:
print("This is train dataset")
print(df_train.head(2))
print(df_train.columns)
print("="*100)
print("This is test dataset")
print(df_test.head(2))
print(df_test.columns)

This is train dataset
          id url_legal  ...    target standard_error
0  c12129c31       NaN  ... -0.340259       0.464009
1  85aa80a4c       NaN  ... -0.315372       0.480805

[2 rows x 6 columns]
Index(['id', 'url_legal', 'license', 'excerpt', 'target', 'standard_error'], dtype='object')
This is test dataset
          id  ...                                            excerpt
0  c0f722661  ...  My hope lay in Jack's promise that he would ke...
1  f0953f0a5  ...  Dotty continued to go to Mrs. Gray's every nig...

[2 rows x 4 columns]
Index(['id', 'url_legal', 'license', 'excerpt'], dtype='object')


### Cleaning the text data for any special characters and numerical characters

In [None]:
# importing stopwords from nltk library
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stopword = stopwords.words('english')
print(stopword)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'ag

In [None]:
def testpreprocess(phrase):
    phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase)
    phrase = re.sub(r'\w*\d\w*', '', phrase).strip()
    return phrase

In [None]:
preprocessed_excerpt = []
for sentence in tqdm(df_train['excerpt'].values):
  sent = testpreprocess(sentence)
  sent = sent.lower().strip()
  sent = ' '.join(e for e in sent.split(" ") if e not in stopword)
  preprocessed_excerpt.append(sent)

df_train['cleaned_excerpt'] = preprocessed_excerpt

100%|██████████| 2834/2834 [00:01<00:00, 1932.76it/s]


In [None]:
print(df_train['excerpt'].values[0])
print("="*100)
print(df_train['cleaned_excerpt'].values[0])

When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.
The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.
At each end of the room, on the wall, hung a beautiful bear-skin rug.
These rugs were for prizes, one for the girls and one for the boys. And this was the game.
The girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole.
This would have been an easy matter, but each traveller was obliged to wear snowshoes.
young p

### Elmo Word Embedding Trainings

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import logging
tf.get_logger().setLevel(logging.ERROR)
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
  
# Load pre trained ELMo model
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

In [None]:
# # Extract ELMo features

# x = df_train['cleaned_excerpt']
# x = x.tolist()

In [None]:
# print(x[0])

In [None]:
# Elmo with 2 dimension array
def elmo_vectors_2D(x):
    embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        return sess.run(embeddings)

In [None]:
# elmo_train_x = elmo_vectors_array(x)
# print(elmo_train_x.shape)

In [None]:
# Elmo with 1 dimension array
def elmo_vectors_1D(x):
    embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        return sess.run(tf.reduce_mean(embeddings,1))

In [None]:
# elmo_train = elmo_vectors(x)
# print(elmo_train.shape)

In [None]:
# for i in df_train['cleaned_excerpt'][0:5]:
#   i = [i]
#   print(i)
#   print(type(i))
#   elmo_x = elmo_vectors_array(i)
#   print(elmo_x.shape)


### Splitting data into Train and cross validation(or test): Stratified Sampling


In [None]:
Y = df_train['target']
X = df_train['cleaned_excerpt']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

In [None]:
print(X_train.head(5))

1475    peter dreadfully frightened rushed garden forg...
424     instant replay video reproduction something re...
139     gwen went back school feeling rather tamed sob...
20      man took hat walked away philip sister went ho...
237     seemed sam whole country around far one could ...
Name: cleaned_excerpt, dtype: object


### Elmo Word Embedding

In [30]:

X_train_list = []
for i in tqdm(X_train[0:20]):
  i = [i]
  i_elmo = elmo_vectors_1D(i)
  X_train_list.append(i_elmo)
# X_train_Elmo = elmo_vectors_1D(X_train)
# X_test_Elmo = elmo_vectors_1D(X_test)

# print("="*100)
# print("After vectorizations")
# print("="*50)
# print(X_train_Elmo.shape, Y_train.shape)
# print(X_train_Elmo.shape, Y_test.shape)
# print("="*100)



  0%|          | 0/20 [00:00<?, ?it/s][A[A

  5%|▌         | 1/20 [00:37<11:47, 37.25s/it][A[A

 10%|█         | 2/20 [01:14<11:11, 37.33s/it][A[A

 15%|█▌        | 3/20 [01:52<10:35, 37.39s/it][A[A

 20%|██        | 4/20 [02:30<10:00, 37.52s/it][A[A

 25%|██▌       | 5/20 [03:07<09:23, 37.58s/it][A[A

 30%|███       | 6/20 [03:45<08:47, 37.65s/it][A[A

 35%|███▌      | 7/20 [04:23<08:10, 37.76s/it][A[A

 40%|████      | 8/20 [05:01<07:34, 37.91s/it][A[A

 45%|████▌     | 9/20 [05:40<06:58, 38.06s/it][A[A

 50%|█████     | 10/20 [06:19<06:22, 38.27s/it][A[A

 55%|█████▌    | 11/20 [06:57<05:45, 38.42s/it][A[A

 60%|██████    | 12/20 [07:36<05:08, 38.56s/it][A[A

 65%|██████▌   | 13/20 [08:15<04:30, 38.70s/it][A[A

 70%|███████   | 14/20 [08:54<03:52, 38.83s/it][A[A

 75%|███████▌  | 15/20 [09:34<03:15, 39.04s/it][A[A

 80%|████████  | 16/20 [10:14<02:36, 39.22s/it][A[A

 85%|████████▌ | 17/20 [10:54<01:58, 39.45s/it][A[A

 90%|█████████ | 18/20 [11

In [32]:
print(X_train_list[0].shape)

(1, 1024)


# Various Models

In [None]:
# data = {"Bag of Words":(X_train_excerpt_bow,X_test_excerpt_bow),"TF-IDF":(X_train_excerpt_tfidf,X_test_excerpt_tfidf),"Avg Word2Vec":(X_train_excerpt_avgw2v,X_test_excerpt_avgw2v)}
data = {"Avg Word2Vec Pos":(X_train_excerpt_avg_w2v_pos,X_test_excerpt_avg_w2v_pos),"Avg Word2Vec Pos Concat":(X_train_excerpt_avg_w2v_pos_concat,X_test_excerpt_avg_w2v_pos_concat)}

### Function to run all models together

In [None]:
def modelfunction(modelname,data,Y_train,Y_test,param):
  for key,value in tqdm(data.items()):
    train = value[0]
    test = value[1]
    gscv = GridSearchCV(modelname, param_grid=param,cv=5,verbose=10,n_jobs=-1)
    results = gscv.fit(train, Y_train)
    best_estimate = gscv.best_estimator_

    Y_pred_train = best_estimate.predict(train)
    train_residuals = Y_train - Y_pred_train
    Y_pred_test = best_estimate.predict(test)
    test_residuals = Y_test - Y_pred_test
    mse_train = metrics.mean_squared_error(Y_train,Y_pred_train)
    mse_test = metrics.mean_squared_error(Y_test,Y_pred_test)

    print("-"*100)
    print("The model is run on {}: ".format(key))
    print("="*100)
    print(best_estimate)
    print("*"*50)
    print("Root Mean squared Error Train for {}: ".format(key), np.sqrt(mse_train))
    print("Root Mean squared Error Test for {}: ".format(key), np.sqrt(mse_test))

    plt.subplot(1,2,1)
    plt.scatter(train_residuals.index,train_residuals)
    plt.title("Train Error Plot")
    plt.xlabel('Index')
    plt.ylabel('Actual - Predicted')

    plt.subplot(1,2,2)
    plt.scatter(test_residuals.index,test_residuals)
    plt.title("Test Error Plot")
    plt.xlabel('Index')
    plt.ylabel('Actual - Predicted')
    plt.show()

### Lasso Regression

In [None]:
lasso_params = {'alpha':[0.0001,0.0005,0.001,0.005,0.01,0.05,0.1]}
lasso = Lasso()

modelfunction(lasso,data,Y_train,Y_test,lasso_params)