<a href="https://colab.research.google.com/github/larissapoghosyan/Capstone_Project/blob/main/baseline_classifiers_IMDb_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Import libraries

In [None]:
from tqdm import tqdm
import warnings
import h5py
import numpy
import numpy as np
import pandas as pd
import nltk
from nltk import ngrams, sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn import utils
import multiprocessing
import time

In [None]:
np.random.seed(42)

#### Label Encoding

In [None]:
# Load the raw IMDb reviews. Malformed rows are skipped rather than aborting the load.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
dataset = pd.read_csv('/content/IMDb_Reviews.csv', engine='python', on_bad_lines='skip')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Encode the class labels held in the last column of `dataset` as integer ids.
le = LabelEncoder()
le_fitted = le.fit_transform(dataset.iloc[:, -1])
# Replace the column wholesale so it takes the integer dtype; assigning through
# `.iloc[:, -1] = ...` may write in place and leave the column as object dtype.
dataset[dataset.columns[-1]] = le_fitted.astype('int')
# Build the (n_rows, 1) label column from the encoder output itself, so it always
# matches the column that was encoded and is guaranteed to be an integer array.
label_col = le_fitted.astype('int').reshape(-1, 1)
label_col.shape

(50000, 1)

In [None]:
dataset.shape

(50000, 2)

In [None]:
print(len(np.unique(le_fitted)), '\n', np.unique(le_fitted))

2 
 [0 1]


### Splitting the Dataset

In [None]:
def Splitter(df, train_frac=0.8, seed=0):
    """Randomly split the rows of `df` into a train part and a test part.

    Each row is assigned to the train part independently with probability
    `train_frac`, so the split sizes are only approximately
    `train_frac` / `1 - train_frac` of the input.

    Parameters
    ----------
    df : numpy.ndarray or pandas.DataFrame
        Row-indexable container; it is indexed with a boolean row mask.
    train_frac : float, default 0.8
        Probability that a row lands in the train part.
    seed : int, default 0
        Seed of the private random generator used to draw the mask.

    Returns
    -------
    tuple
        `(train, test)`, the rows selected by the mask and by its complement.

    Notes
    -----
    The mask is drawn from a private `RandomState(seed)`. With the defaults it is
    identical to the one produced by `np.random.seed(0); np.random.rand(len(df))`,
    but the global NumPy random state (seeded at the top of the notebook) is no
    longer reset as a side effect of calling this function.
    """
    rng = np.random.RandomState(seed)
    msk = rng.rand(len(df)) < train_frac
    train = df[msk]
    test = df[~msk]
    return train, test

## BERT Embeddings

### BERT

In [None]:
# no CLS token avg
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/bert_imdb_token_avg.h5', 'r') as hf_hidden:
    br_lst_hidden = hf_hidden['bert_imdb_token_avg'][()]
print(br_lst_hidden.shape)
# Append the label column as the last column of the feature matrix.
br_lst_hidden = np.append(br_lst_hidden, label_col, axis=1)
print(br_lst_hidden.shape)

In [None]:
# 20% sampled (fixed random_state keeps the draw reproducible)
br_lst_hidden_df = pd.DataFrame(br_lst_hidden)
br_lst_hidden_prc20 = np.array(
    br_lst_hidden_df.sample(n=int(len(br_lst_hidden_df) * 0.2), random_state=22)
)
print(br_lst_hidden_prc20.shape)

(10000, 769)


In [None]:
# CLS Concatenated from last 4 layers
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/bert_imdb_CLS_cat.h5', 'r') as hf_hidden:
    bert_cat_hidden_4lyr_data = hf_hidden['bert_imdb_CLS_cat'][()]
print(bert_cat_hidden_4lyr_data.shape)
# Append the label column as the last column of the feature matrix.
bert_cat_hidden_4lyr = np.append(bert_cat_hidden_4lyr_data, label_col, axis=1)
print(bert_cat_hidden_4lyr.shape)

(50000, 3072)
(50000, 3073)


In [None]:
# 20% sampled
bert_cat_hidden_4lyr_df = pd.DataFrame(bert_cat_hidden_4lyr)
bert_cat_hidden_4lyr_prc20 = np.array(bert_cat_hidden_4lyr_df.sample(int(bert_cat_hidden_4lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(bert_cat_hidden_4lyr_prc20.shape)

(10000, 3073)


In [None]:
# CLS Concatenated from last 3 layers
n = 768 * 3
bert_cat_hidden_3lyr = np.append(bert_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(bert_cat_hidden_3lyr.shape)

(50000, 2305)


In [None]:
# 20% sampled
bert_cat_hidden_3lyr_df = pd.DataFrame(bert_cat_hidden_3lyr)
bert_cat_hidden_3lyr_prc20 = np.array(bert_cat_hidden_3lyr_df.sample(int(bert_cat_hidden_3lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(bert_cat_hidden_3lyr_prc20.shape)

(10000, 2305)


In [None]:
# CLS Concatenated from last 2 layers
n = 768 * 2
bert_cat_hidden_2lyr = np.append(bert_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(bert_cat_hidden_2lyr.shape)

(50000, 1537)


In [None]:
# 20% sampled
bert_cat_hidden_2lyr_df = pd.DataFrame(bert_cat_hidden_2lyr)
bert_cat_hidden_2lyr_prc20 = np.array(bert_cat_hidden_2lyr_df.sample(int(bert_cat_hidden_2lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(bert_cat_hidden_2lyr_prc20.shape)

(10000, 1537)


In [None]:
# CLS Concatenated from last 1 layer
n = 768 * 1
bert_cat_hidden_1lyr = np.append(bert_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(bert_cat_hidden_1lyr.shape)

(50000, 769)


In [None]:
# 20% sampled
bert_cat_hidden_1lyr_df = pd.DataFrame(bert_cat_hidden_1lyr)
bert_cat_hidden_1lyr_prc20 = np.array(bert_cat_hidden_1lyr_df.sample(int(bert_cat_hidden_1lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(bert_cat_hidden_1lyr_prc20.shape)

(10000, 769)


### RoBERTa

In [None]:
# no CLS token avg
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/robert_imdb_token_avg.h5', 'r') as hf_hidden:
    rb_lst_hidden = hf_hidden['robert_imdb_token_avg'][()]
print(rb_lst_hidden.shape)
# Append the label column as the last column of the feature matrix.
rb_lst_hidden = np.append(rb_lst_hidden, label_col, axis=1)
rb_lst_hidden.shape

(50000, 768)


(50000, 769)

In [None]:
# 20% sampled
rb_lst_hidden_df = pd.DataFrame(rb_lst_hidden)
rb_lst_hidden_prc20 = np.array(rb_lst_hidden_df.sample(int(rb_lst_hidden_df.shape[0]*0.2),
                                                       random_state=22))
print(rb_lst_hidden_prc20.shape)

(10000, 769)


In [None]:
# CLS Concatenated from last 4 layers
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/robert_imdb_CLS_cat.h5', 'r') as hf_hidden:
    rb_cat_hidden_4lyr_data = hf_hidden['robert_imdb_CLS_cat'][()]
print(rb_cat_hidden_4lyr_data.shape)
# Append the label column as the last column of the feature matrix.
rb_cat_hidden_4lyr = np.append(rb_cat_hidden_4lyr_data, label_col, axis=1)
print(rb_cat_hidden_4lyr.shape)

(50000, 3072)
(50000, 3073)


In [None]:
# 20% sampled
rb_cat_hidden_4lyr_df = pd.DataFrame(rb_cat_hidden_4lyr)
rb_cat_hidden_4lyr_prc20 = np.array(rb_cat_hidden_4lyr_df.sample(int(rb_cat_hidden_4lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(rb_cat_hidden_4lyr_prc20.shape)

(10000, 3073)


In [None]:
# CLS Concatenated from last 3 layers
n = 768 * 3
rb_cat_hidden_3lyr = np.append(rb_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(rb_cat_hidden_3lyr.shape)

(50000, 2304)
(50000, 2305)


In [None]:
# 20% sampled
rb_cat_hidden_3lyr_df = pd.DataFrame(rb_cat_hidden_3lyr)
rb_cat_hidden_3lyr_prc20 = np.array(rb_cat_hidden_3lyr_df.sample(int(rb_cat_hidden_3lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(rb_cat_hidden_3lyr_prc20.shape)

(10000, 2305)


In [None]:
# CLS Concatenated from last 2 layers
n = 768 * 2
rb_cat_hidden_2lyr = np.append(rb_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(rb_cat_hidden_2lyr.shape)

(50000, 1537)


In [None]:
# 20% sampled
rb_cat_hidden_2lyr_df = pd.DataFrame(rb_cat_hidden_2lyr)
rb_cat_hidden_2lyr_prc20 = np.array(rb_cat_hidden_2lyr_df.sample(int(rb_cat_hidden_2lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(rb_cat_hidden_2lyr_prc20.shape)

(10000, 1537)


In [None]:
# CLS Concatenated from last 1 layer
n = 768 * 1
rb_cat_hidden_1lyr = np.append(rb_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(rb_cat_hidden_1lyr.shape)

(50000, 769)


In [None]:
# 20% sampled
rb_cat_hidden_1lyr_df = pd.DataFrame(rb_cat_hidden_1lyr)
rb_cat_hidden_1lyr_prc20 = np.array(rb_cat_hidden_1lyr_df.sample(int(rb_cat_hidden_1lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(rb_cat_hidden_1lyr_prc20.shape)

(10000, 769)


### ALBERT

In [None]:
# no CLS token avg
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/albert_imdb_token_avg.h5', 'r') as hf_hidden:
    ab_lst_hidden = hf_hidden['albert_imdb_token_avg'][()]
print(ab_lst_hidden.shape)
# Append the label column as the last column of the feature matrix.
ab_lst_hidden = np.append(ab_lst_hidden, label_col, axis=1)
ab_lst_hidden.shape

(50000, 768)


(50000, 769)

In [None]:
# 20% sampled
ab_lst_hidden_df = pd.DataFrame(ab_lst_hidden)
ab_lst_hidden_prc20 = np.array(ab_lst_hidden_df.sample(int(ab_lst_hidden_df.shape[0]*0.2),
                                                       random_state=22))
print(ab_lst_hidden_prc20.shape)

(10000, 769)


In [None]:
# CLS Concatenated from last 4 layers
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/albert_imdb_CLS_cat.h5', 'r') as hf_hidden:
    ab_cat_hidden_4lyr_data = hf_hidden['albert_imdb_CLS_cat'][()]
print(ab_cat_hidden_4lyr_data.shape)
# Append the label column as the last column of the feature matrix.
ab_cat_hidden_4lyr = np.append(ab_cat_hidden_4lyr_data, label_col, axis=1)
print(ab_cat_hidden_4lyr.shape)

(50000, 3072)
(50000, 3073)


In [None]:
# 20% sampled
ab_cat_hidden_4lyr_df = pd.DataFrame(ab_cat_hidden_4lyr)
ab_cat_hidden_4lyr_prc20 = np.array(ab_cat_hidden_4lyr_df.sample(int(ab_cat_hidden_4lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(ab_cat_hidden_4lyr_prc20.shape)

(10000, 3073)


In [None]:
# CLS Concatenated from last 3 layers
n = 768 * 3
ab_cat_hidden_3lyr = np.append(ab_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(ab_cat_hidden_3lyr.shape)

(50000, 2305)


In [None]:
# 20% sampled
ab_cat_hidden_3lyr_df = pd.DataFrame(ab_cat_hidden_3lyr)
ab_cat_hidden_3lyr_prc20 = np.array(ab_cat_hidden_3lyr_df.sample(int(ab_cat_hidden_3lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(ab_cat_hidden_3lyr_prc20.shape)

(10000, 2305)


In [None]:
# CLS Concatenated from last 2 layers
n = 768 * 2
ab_cat_hidden_2lyr = np.append(ab_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(ab_cat_hidden_2lyr.shape)

(50000, 1537)


In [None]:
# 20% sampled
ab_cat_hidden_2lyr_df = pd.DataFrame(ab_cat_hidden_2lyr)
ab_cat_hidden_2lyr_prc20 = np.array(ab_cat_hidden_2lyr_df.sample(int(ab_cat_hidden_2lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(ab_cat_hidden_2lyr_prc20.shape)

(10000, 1537)


In [None]:
# CLS Concatenated from last 1 layer
# Keeps the first n columns of the 4-layer matrix and appends the label column.
# NOTE(review): assumes those leading columns hold the last layer; confirm against
# the script that wrote the HDF5 file.
n = 768 * 1
ab_cat_hidden_1lyr = np.concatenate((ab_cat_hidden_4lyr_data[:, :n], label_col), axis=1)
print(ab_cat_hidden_1lyr.shape)

(50000, 769)


In [None]:
# 20% sampled
ab_cat_hidden_1lyr_df = pd.DataFrame(ab_cat_hidden_1lyr)
ab_cat_hidden_1lyr_prc20 = np.array(ab_cat_hidden_1lyr_df.sample(int(ab_cat_hidden_1lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(ab_cat_hidden_1lyr_prc20.shape)

(10000, 769)


### DistilBERT

In [None]:
# no CLS token avg
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/distilbert_imdb_token_avg.h5', 'r') as hf_hidden:
    db_lst_hidden = hf_hidden['distilbert_imdb_token_avg'][()]
print(db_lst_hidden.shape)
# Append the label column as the last column of the feature matrix.
db_lst_hidden = np.append(db_lst_hidden, label_col, axis=1)
db_lst_hidden.shape

(50000, 768)


(50000, 769)

In [None]:
# 20% sampled (fixed random_state keeps the draw reproducible)
db_lst_hidden_df = pd.DataFrame(db_lst_hidden)
db_lst_hidden_prc20 = np.array(
    db_lst_hidden_df.sample(n=int(len(db_lst_hidden_df) * 0.2), random_state=22)
)
print(db_lst_hidden_prc20.shape)

(10000, 769)


In [None]:
# CLS Concatenated from last 4 layers
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/distilbert_imdb_CLS_cat.h5', 'r') as hf_hidden:
    db_cat_hidden_4lyr_data = hf_hidden['distilbert_imdb_CLS_cat'][()]
print(db_cat_hidden_4lyr_data.shape)
# Append the label column as the last column of the feature matrix.
db_cat_hidden_4lyr = np.append(db_cat_hidden_4lyr_data, label_col, axis=1)
print(db_cat_hidden_4lyr.shape)

In [None]:
# 20% sampled
db_cat_hidden_4lyr_df = pd.DataFrame(db_cat_hidden_4lyr)
db_cat_hidden_4lyr_prc20 = np.array(db_cat_hidden_4lyr_df.sample(int(db_cat_hidden_4lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(db_cat_hidden_4lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 3 layers
n = 768 * 3
db_cat_hidden_3lyr = np.append(db_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(db_cat_hidden_3lyr.shape)

In [None]:
# 20% sampled
db_cat_hidden_3lyr_df = pd.DataFrame(db_cat_hidden_3lyr)
db_cat_hidden_3lyr_prc20 = np.array(db_cat_hidden_3lyr_df.sample(int(db_cat_hidden_3lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(db_cat_hidden_3lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 2 layers
n = 768 * 2
db_cat_hidden_2lyr = np.append(db_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(db_cat_hidden_2lyr.shape)

In [None]:
# 20% sampled
db_cat_hidden_2lyr_df = pd.DataFrame(db_cat_hidden_2lyr)
db_cat_hidden_2lyr_prc20 = np.array(db_cat_hidden_2lyr_df.sample(int(db_cat_hidden_2lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(db_cat_hidden_2lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 1 layer
# Keeps the first n columns of the 4-layer matrix and appends the label column.
# NOTE(review): assumes those leading columns hold the last layer; confirm against
# the script that wrote the HDF5 file.
n = 768 * 1
db_cat_hidden_1lyr = np.concatenate((db_cat_hidden_4lyr_data[:, :n], label_col), axis=1)
print(db_cat_hidden_1lyr.shape)

In [None]:
# 20% sampled
db_cat_hidden_1lyr_df = pd.DataFrame(db_cat_hidden_1lyr)
db_cat_hidden_1lyr_prc20 = np.array(db_cat_hidden_1lyr_df.sample(int(db_cat_hidden_1lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(db_cat_hidden_1lyr_prc20.shape)

### TinyBERT

In [None]:
# no CLS token avg
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/tinybert_imdb_token_avg.h5', 'r') as hf_hidden:
    tb_lst_hidden = hf_hidden['tinybert_imdb_token_avg'][()]
print(tb_lst_hidden.shape)
# Append the label column as the last column of the feature matrix.
tb_lst_hidden = np.append(tb_lst_hidden, label_col, axis=1)
tb_lst_hidden.shape

In [None]:
# 20% sampled (fixed random_state keeps the draw reproducible)
tb_lst_hidden_df = pd.DataFrame(tb_lst_hidden)
tb_lst_hidden_prc20 = np.array(
    tb_lst_hidden_df.sample(n=int(len(tb_lst_hidden_df) * 0.2), random_state=22)
)
print(tb_lst_hidden_prc20.shape)

(10000, 769)


In [None]:
# CLS Concatenated from last 4 layers
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/tinybert_imdb_CLS_cat.h5', 'r') as hf_hidden:
    tb_cat_hidden_4lyr_data = hf_hidden['tinybert_imdb_CLS_cat'][()]
print(tb_cat_hidden_4lyr_data.shape)
# Append the label column as the last column of the feature matrix.
tb_cat_hidden_4lyr = np.append(tb_cat_hidden_4lyr_data, label_col, axis=1)
print(tb_cat_hidden_4lyr.shape)

In [None]:
# 20% sampled
tb_cat_hidden_4lyr_df = pd.DataFrame(tb_cat_hidden_4lyr)
tb_cat_hidden_4lyr_prc20 = np.array(tb_cat_hidden_4lyr_df.sample(int(tb_cat_hidden_4lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(tb_cat_hidden_4lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 3 layers
n = 768 * 3
tb_cat_hidden_3lyr = np.append(tb_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(tb_cat_hidden_3lyr.shape)

In [None]:
# 20% sampled
tb_cat_hidden_3lyr_df = pd.DataFrame(tb_cat_hidden_3lyr)
tb_cat_hidden_3lyr_prc20 = np.array(tb_cat_hidden_3lyr_df.sample(int(tb_cat_hidden_3lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(tb_cat_hidden_3lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 2 layers
n = 768 * 2
tb_cat_hidden_2lyr = np.append(tb_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(tb_cat_hidden_2lyr.shape)

In [None]:
# 20% sampled
tb_cat_hidden_2lyr_df = pd.DataFrame(tb_cat_hidden_2lyr)
tb_cat_hidden_2lyr_prc20 = np.array(tb_cat_hidden_2lyr_df.sample(int(tb_cat_hidden_2lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(tb_cat_hidden_2lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 1 layer
# Keeps the first n columns of the 4-layer matrix and appends the label column.
# NOTE(review): assumes those leading columns hold the last layer; confirm against
# the script that wrote the HDF5 file.
n = 768 * 1
tb_cat_hidden_1lyr = np.concatenate((tb_cat_hidden_4lyr_data[:, :n], label_col), axis=1)
print(tb_cat_hidden_1lyr.shape)

In [None]:
# 20% sampled
tb_cat_hidden_1lyr_df = pd.DataFrame(tb_cat_hidden_1lyr)
tb_cat_hidden_1lyr_prc20 = np.array(tb_cat_hidden_1lyr_df.sample(int(tb_cat_hidden_1lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(tb_cat_hidden_1lyr_prc20.shape)

### Sentence-BERT

In [None]:
# no CLS token avg
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/sentence_bert_imdb_token_avg.h5', 'r') as hf_hidden:
    sb_lst_hidden = hf_hidden['sentence_bert_imdb_token_avg'][()]
print(sb_lst_hidden.shape)
# Append the label column as the last column of the feature matrix.
sb_lst_hidden = np.append(sb_lst_hidden, label_col, axis=1)
sb_lst_hidden.shape

(50000, 768)


(50000, 769)

In [None]:
# 20% sampled
sb_lst_hidden_df = pd.DataFrame(sb_lst_hidden)
sb_lst_hidden_prc20 = np.array(sb_lst_hidden_df.sample(int(sb_lst_hidden_df.shape[0]*0.2),
                                                       random_state=22))
print(sb_lst_hidden_prc20.shape)

(10000, 769)


In [None]:
# CLS Concatenated from last 4 layers
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/sentence_bert_imdb_CLS_cat.h5', 'r') as hf_hidden:
    sb_cat_hidden_4lyr_data = hf_hidden['sentence_bert_imdb_CLS_cat'][()]
print(sb_cat_hidden_4lyr_data.shape)
# Append the label column as the last column of the feature matrix.
sb_cat_hidden_4lyr = np.append(sb_cat_hidden_4lyr_data, label_col, axis=1)
print(sb_cat_hidden_4lyr.shape)

In [None]:
# 20% sampled
sb_cat_hidden_4lyr_df = pd.DataFrame(sb_cat_hidden_4lyr)
sb_cat_hidden_4lyr_prc20 = np.array(sb_cat_hidden_4lyr_df.sample(int(sb_cat_hidden_4lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(sb_cat_hidden_4lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 3 layers
n = 768 * 3
sb_cat_hidden_3lyr = np.append(sb_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(sb_cat_hidden_3lyr.shape)

In [None]:
# 20% sampled
sb_cat_hidden_3lyr_df = pd.DataFrame(sb_cat_hidden_3lyr)
sb_cat_hidden_3lyr_prc20 = np.array(sb_cat_hidden_3lyr_df.sample(int(sb_cat_hidden_3lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(sb_cat_hidden_3lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 2 layers
n = 768 * 2
sb_cat_hidden_2lyr = np.append(sb_cat_hidden_4lyr_data[:,:n], label_col, axis=1)
print(sb_cat_hidden_2lyr.shape)

In [None]:
# 20% sampled
sb_cat_hidden_2lyr_df = pd.DataFrame(sb_cat_hidden_2lyr)
sb_cat_hidden_2lyr_prc20 = np.array(sb_cat_hidden_2lyr_df.sample(int(sb_cat_hidden_2lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(sb_cat_hidden_2lyr_prc20.shape)

In [None]:
# CLS Concatenated from last 1 layer
# Keeps the first n columns of the 4-layer matrix and appends the label column.
# NOTE(review): assumes those leading columns hold the last layer; confirm against
# the script that wrote the HDF5 file.
n = 768 * 1
sb_cat_hidden_1lyr = np.concatenate((sb_cat_hidden_4lyr_data[:, :n], label_col), axis=1)
print(sb_cat_hidden_1lyr.shape)

In [None]:
# 20% sampled
sb_cat_hidden_1lyr_df = pd.DataFrame(sb_cat_hidden_1lyr)
sb_cat_hidden_1lyr_prc20 = np.array(sb_cat_hidden_1lyr_df.sample(int(sb_cat_hidden_1lyr_df.shape[0]*0.2),
                                                                     random_state=22))
print(sb_cat_hidden_1lyr_prc20.shape)

### Splitting

In [None]:
# Split the selected embedding matrix (features plus label in the last column).
# Splitter is called once and its pair unpacked; calling it twice and indexing each
# result only agrees when both calls happen to draw the same random mask.
# NOTE(review): the fit and score cells below all read these variables, so change the
# array passed here and re-run this cell before evaluating another embedding.
training, testing = Splitter(db_lst_hidden_prc20)
# Every column except the last is a feature; the last column is the label.
train_hidden, train_lab_hidden = training[:, :-1], training[:, -1]
test_hidden, test_lab_hidden = testing[:, :-1], testing[:, -1]
train_hidden = train_hidden.astype('float64')
test_hidden = test_hidden.astype('float64')
train_lab_hidden = train_lab_hidden.astype('int')
test_lab_hidden = test_lab_hidden.astype('int')

In [None]:
print('Shape of the Train data is %s, Shape of Train labels is %s ' % (train_hidden.shape, train_lab_hidden.shape))
print('Shape of the Test data is  %s,  Shape of Test labels is %s' % (test_hidden.shape, test_lab_hidden.shape))

Shape of the Train data is (8028, 768), Shape of Train labels is (8028,) 
Shape of the Test data is  (1972, 768),  Shape of Test labels is (1972,)


## Word2vec embeddings

In [None]:
# Load the stored Word2Vec vectors.
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/wv_vectors_imdb.h5', 'r') as hf_wv_:
    embedding_wv = hf_wv_['vec_lis_hf'][()]
print(embedding_wv.shape)

(50000, 300)


In [None]:
embedding_wv_full = np.append(embedding_wv, label_col, axis=1)
print(embedding_wv_full.shape)

(50000, 301)


In [None]:
embedding_wv_df = pd.DataFrame(embedding_wv_full)
embedding_wv_sample = np.array(embedding_wv_df.sample(int(embedding_wv_df.shape[0]*0.2),
                                                       random_state=22))
print(embedding_wv_sample.shape)

(10000, 301)


## FastText Embeddings

In [None]:
# Load the stored FastText vectors.
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/ft_vectors_imdb.h5', 'r') as hf_ft_:
    embedding_ft = hf_ft_['vec_lis_hf'][()]
print(embedding_ft.shape)

(50000, 300)


In [None]:
embedding_ft_full = np.append(embedding_ft, label_col, axis=1)
print(embedding_ft_full.shape)

(50000, 301)


In [None]:
embedding_ft_df = pd.DataFrame(embedding_ft_full)
embedding_ft_sample = np.array(embedding_ft_df.sample(int(embedding_ft_df.shape[0]*0.2),
                                                       random_state=22))
print(embedding_ft_sample.shape)

(10000, 301)


## ELMo Embeddings

In [None]:
# Draw the 20% sample of the raw dataset (fixed random_state) and keep its last
# column as an (n_rows, 1) label array.
dataset_prc20 = dataset.sample(n=int(len(dataset) * 0.2), random_state=22)
label_col_imdb = np.array(dataset_prc20.iloc[:, -1]).reshape(-1, 1)

In [None]:
# Load the stored ELMo features. The file handle gets its own name so that `hf_elmo`
# only ever refers to the final feature-plus-label array.
# The context manager closes the file even if the read fails, and ['name'] indexing
# raises KeyError for a missing dataset instead of silently yielding None.
with h5py.File('/content/drive/MyDrive/Lariba/IMDb/elmo_imdb_clf.h5', 'r') as elmo_file:
    hf_elmo_data = elmo_file['elmo_imdb_clf'][()]
# NOTE(review): assumes the stored rows are in the same order as `dataset_prc20`,
# from which `label_col_imdb` was taken; confirm against the extraction code.
hf_elmo = np.append(hf_elmo_data, label_col_imdb, axis=1)

## Classification on BERT, Word2Vec, FastText and ELMo

### **BERT** Full datasets

In [None]:
# Fit the logistic-regression baseline on whatever split is currently held in
# train_hidden / train_lab_hidden.
# NOTE(review): every warning raised inside this block is silenced, which would
# include a solver ConvergenceWarning; confirm lbfgs converges before trusting scores.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = LogisticRegression(solver = "lbfgs", random_state = 0)
    clf.fit(train_hidden, train_lab_hidden)

BERT

In [None]:
print('Train score no CLS token avg BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg BERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score no CLS token avg BERT 0.8771316571371501
Test score no CLS token avg BERT 0.8761684591416222


In [None]:
print('Train score CLS concatenated last 4 layers BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers BERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 4 layers BERT 0.8441487103942473
Test score CLS concatenated last 4 layers BERT 0.8419941702683686


In [None]:
print('Train score CLS concatenated last 3 layers BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers BERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 3 layers BERT 0.8420014481536041
Test score CLS concatenated last 3 layers BERT 0.8393808422957081


In [None]:
print('Train score CLS concatenated last 2 layers BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers BERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 2 layers BERT 0.8135876757134654
Test score CLS concatenated last 2 layers BERT 0.8126444868831039


In [None]:
print('Train score CLS concatenated last 1 layer BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer BERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 1 layer BERT 0.7828019275423834
Test score CLS concatenated last 1 layer BERT 0.7783696853955171


RoBERTa

In [None]:
print('Train score no CLS token avg RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 4 layers RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 4 layers RoBERT 0.8537864223115528
Test score CLS concatenated last 4 layers RoBERT 0.8517438938586792


In [None]:
print('Train score CLS concatenated last 3 layers RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 3 layers RoBERT 0.824773413897281
Test score CLS concatenated last 3 layers RoBERT 0.8231983113880792


In [None]:
print('Train score CLS concatenated last 2 layers RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 2 layers RoBERT 0.813862325534943
Test score CLS concatenated last 2 layers RoBERT 0.8182731932857573


In [None]:
print('Train score CLS concatenated last 1 layer RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 1 layer RoBERT 0.8166587600808969
Test score CLS concatenated last 1 layer RoBERT 0.8220926726304151


ALBERT

In [None]:
print('Train score no CLS token avg AlBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg AlBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score no CLS token avg AlBERT 0.677511173254101
Test score no CLS token avg AlBERT 0.6797668107347472


In [None]:
print('Train score CLS concatenated last 4 layers AlBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers AlBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 4 layers AlBERT 0.6793588175076777
Test score CLS concatenated last 4 layers AlBERT 0.6802693738064127


In [None]:
print('Train score CLS concatenated last 3 layers AlBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers AlBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 3 layers AlBERT 0.6819305385633317
Test score CLS concatenated last 3 layers AlBERT 0.6787616845914162


In [None]:
print('Train score CLS concatenated last 2 layers AlBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers AlBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 2 layers AlBERT 0.6789842950238446
Test score CLS concatenated last 2 layers AlBERT 0.6772539953764197


In [None]:
print('Train score CLS concatenated last 1 layer AlBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer AlBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score CLS concatenated last 1 layer AlBERT 0.6684726973109286
Test score CLS concatenated last 1 layer AlBERT 0.6677052970147753


DistilBERT

In [None]:
# Accuracy of the fitted `clf` on the split currently held in train_hidden / test_hidden.
# The test label previously read "vag" instead of "avg".
print('Train score no CLS token avg DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score no CLS token avg DistilBERT 0.8690419714863549
Test score no CLS token vag DistilBERT 0.8662177103226455


In [None]:
print('Train score CLS concatenated last 4 layers DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 3 layers DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Accuracy of the fitted `clf` on the split currently held in train_hidden / test_hidden.
# The label now names DistilBERT, matching this section; it was copied from the ALBERT cells.
print('Train score CLS concatenated last 2 layers DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Accuracy of the fitted `clf` on the split currently held in train_hidden / test_hidden.
# The label now names DistilBERT, matching this section; it was copied from the ALBERT cells.
print('Train score CLS concatenated last 1 layer DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

TinyBERT

In [None]:
print('Train score no CLS TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score no CLS TinyBERT 0.8327881950513096
Test score no CLS TinyBERT 0.8316413709920595


In [None]:
print('Train score CLS concatenated last 4 layers TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 3 layers TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score only CLS TinyBERT 0.8147611794961425
Test score only CLS TinyBERT 0.811840385968439


In [None]:
print('Train score CLS concatenated last 2 layers TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 1 layer TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

Sentence-BERT

In [None]:
# Accuracy of the fitted `clf` on the split currently held in train_hidden / test_hidden.
# The label now names Sentence-BERT, matching this section; it was copied from the TinyBERT cell.
print('Train score no CLS token avg Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

Train score no CLS TinyBERT 0.8810017228034256
Test score no CLS TinyBERT 0.8798874258719469


In [None]:
print('Train score CLS concatenated last 4 layers Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 3 layers Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 2 layers Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 1 layer Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

### **BERT** 20% of data Sampled

In [None]:
# Fit the logistic-regression baseline on whatever split is currently held in
# train_hidden / train_lab_hidden.
# NOTE(review): every warning raised inside this block is silenced, which would
# include a solver ConvergenceWarning; confirm lbfgs converges before trusting scores.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = LogisticRegression(solver = "lbfgs", random_state = 0)
    clf.fit(train_hidden, train_lab_hidden)

BERT

In [None]:
print('Train score no CLS token avg BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 4 layers BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 3 layers BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 2 layers BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 1 layer BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer BERT %s' % clf.score(test_hidden, test_lab_hidden))

RoBERTa

In [None]:
print('Train score no CLS token avg RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
print('Train score CLS concatenated last 4 layers RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 3 layers RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 2 layers RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 1 layer RoBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer RoBERT %s' % clf.score(test_hidden, test_lab_hidden))

ALBERT

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score no CLS token avg ALBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg ALBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 4 layers ALBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers ALBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 3 layers ALBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers ALBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 2 layers ALBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers ALBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 1 layer ALBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer ALBERT %s' % clf.score(test_hidden, test_lab_hidden))

DistilBERT

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score no CLS token avg DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 4 layers DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 3 layers DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 2 layers DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 1 layer DistilBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer DistilBERT %s' % clf.score(test_hidden, test_lab_hidden))

TinyBERT

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score no CLS token avg TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 4 layers TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 3 layers TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 2 layers TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 1 layer TinyBERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer TinyBERT %s' % clf.score(test_hidden, test_lab_hidden))

Sentence-BERT

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score no CLS token avg Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score no CLS token avg Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 4 layers Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 4 layers Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 3 layers Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 3 layers Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 2 layers Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 2 layers Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

In [None]:
# Scores the current `clf` on train_hidden/test_hidden. The embedding named in the
# message is fixed text; this cell does not load it or re-fit `clf`.
print('Train score CLS concatenated last 1 layer Sentence-BERT %s' % clf.score(train_hidden, train_lab_hidden))
print('Test score CLS concatenated last 1 layer Sentence-BERT %s' % clf.score(test_hidden, test_lab_hidden))

### Word2Vec

Splitting the Data

In [None]:
## Input embedding_wv_full to get estimates for the full dataset
# Splitter returns the (train, test) pair, so call it once and unpack instead of
# running the whole split twice and indexing each result.
training, testing = Splitter(embedding_wv_sample)
# The label is the last column; every column before it is a feature.
train_dat_wv = training[:, :-1].astype('float64')
train_lab_wv = training[:, -1].astype('int')
test_dat_wv = testing[:, :-1].astype('float64')
test_lab_wv = testing[:, -1].astype('int')

In [None]:
# Sanity check: shapes of the Word2Vec feature matrices and label vectors for both splits.
print('Shape of the Train data is %s, Shape of Train labels is %s ' % (train_dat_wv.shape, train_lab_wv.shape))
print('Shape of the Test data is  %s,  Shape of Test labels is %s' % (test_dat_wv.shape, test_lab_wv.shape))

Shape of the Train data is (8028, 300), Shape of Train labels is (8028,) 
Shape of the Test data is  (1972, 300),  Shape of Test labels is (1972,)


Classifier

In [None]:
# Logistic-regression probe on the Word2Vec features. All warnings raised while
# fitting are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = LogisticRegression(random_state=0, solver="lbfgs")
    clf = clf.fit(train_dat_wv, train_lab_wv)

In [None]:
# NOTE(review): the "full data" label is fixed text. The scores are for whatever matrix
# the split cell was given (embedding_wv_sample as written), so switch the input and
# re-run the split and fit before trusting this label.
print('Train score on full data %s' % clf.score(train_dat_wv, train_lab_wv))
print('Test score on full data %s' % clf.score(test_dat_wv, test_lab_wv))

Train score on full data 0.8355754857997011
Test score on full data 0.8331643002028397


In [None]:
# Scores the same `clf` on the same Word2Vec split as the "full data" cell; only the label differs.
print('Train score on sample data %s' % clf.score(train_dat_wv, train_lab_wv))
print('Test score on sample data %s' % clf.score(test_dat_wv, test_lab_wv))

Train score on sample data 0.8355754857997011
Test score on sample data 0.8331643002028397


### FastText

Splitting the Data

In [None]:
## Input embedding_ft_full to get estimates for the full dataset
# Splitter returns the (train, test) pair, so call it once and unpack instead of
# running the whole split twice and indexing each result.
training, testing = Splitter(embedding_ft_sample)
# The label is the last column; every column before it is a feature.
train_dat_ft = training[:, :-1].astype('float64')
train_lab_ft = training[:, -1].astype('int')
test_dat_ft = testing[:, :-1].astype('float64')
test_lab_ft = testing[:, -1].astype('int')

In [None]:
# Sanity check: shapes of the FastText feature matrices and label vectors for both splits.
print('Shape of the Train data is %s, Shape of Train labels is %s ' % (train_dat_ft.shape, train_lab_ft.shape))
print('Shape of the Test data is  %s,  Shape of Test labels is %s' % (test_dat_ft.shape, test_lab_ft.shape))

Shape of the Train data is (8028, 300), Shape of Train labels is (8028,) 
Shape of the Test data is  (1972, 300),  Shape of Test labels is (1972,)


Classifier

In [None]:
# Logistic-regression probe on the FastText features. All warnings raised while
# fitting are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = LogisticRegression(random_state=0, solver="lbfgs")
    clf = clf.fit(train_dat_ft, train_lab_ft)

In [None]:
# NOTE(review): the "full data" label is fixed text. The scores are for whatever matrix
# the split cell was given (embedding_ft_sample as written), so switch the input and
# re-run the split and fit before trusting this label.
print('Train score on full data %s' % clf.score(train_dat_ft, train_lab_ft))
print('Test score on full data %s' % clf.score(test_dat_ft, test_lab_ft))

Train score on full data 0.8143996013951171
Test score on full data 0.8113590263691683


In [None]:
# Scores the same `clf` on the same FastText split as the "full data" cell; only the label differs.
print('Train score on sample data %s' % clf.score(train_dat_ft, train_lab_ft))
print('Test score on sample data %s' % clf.score(test_dat_ft, test_lab_ft))

Train score on sample data 0.8143996013951171
Test score on sample data 0.8113590263691683


### ELMo

In [None]:
## Input hf_elmo is the sample dataset already
# Splitter returns the (train, test) pair, so call it once and unpack instead of
# running the whole split twice and indexing each result.
training, testing = Splitter(hf_elmo)
# The label is the last column; every column before it is a feature.
train_elmo = training[:, :-1].astype('float64')
train_lab_elmo = training[:, -1].astype('int')
test_elmo = testing[:, :-1].astype('float64')
test_lab_elmo = testing[:, -1].astype('int')

In [None]:
# Logistic-regression probe for the ELMo section. All warnings raised while
# fitting are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = LogisticRegression(solver = "lbfgs", random_state = 0)
    # Fit on the ELMo split built in the cell above. This cell previously fitted on
    # train_hidden/train_lab_hidden (the arrays the BERT-family cells use), which
    # left train_elmo/train_lab_elmo unused.
    clf.fit(train_elmo, train_lab_elmo)

In [None]:
# imdb 20%
# (the embedding matrix is already a 20% sample no need to slice)

# Score on the ELMo train/test split (train_elmo/test_elmo) rather than on
# train_hidden/test_hidden, which are the arrays the BERT-family cells use.
# `clf` must have been fitted on the ELMo training split for these numbers to be valid.
print(clf.score(train_elmo, train_lab_elmo))
print(clf.score(test_elmo, test_lab_elmo))

## Splitting the data into training and validation sets

In [None]:
# Draw a reproducible (random_state=22) 20% row sample of the dataset and keep it
# as a NumPy array.

dataframe_intent = pd.DataFrame(dataset)
dataset_prc20 = dataframe_intent.sample(
    n=int(len(dataframe_intent) * 0.2), random_state=22
).to_numpy()
print(dataset_prc20.shape)

In [None]:
data = np.array(dataset) # change to dataset_prc20 for the sampled dataset
# Splitter returns the (train, test) pair, so call it once and unpack instead of
# running the whole split twice and indexing each result.
training, testing = Splitter(data)
# The label is the last column; everything before it is the model input.
train_dat, train_lab = training[:, :-1], training[:, -1].astype('int')
test_dat, test_lab = testing[:, :-1], testing[:, -1].astype('int')

In [None]:
# Sanity check: shapes of the text arrays and label vectors for both splits.
print('Train sentences shape is %s, Train labels shape is %s' % (train_dat.shape, train_lab.shape))
print('Test sentences shape is %s, Test labels shape is %s' % (test_dat.shape, test_lab.shape))


## Baseline classifier using Count Vectorizer on Full Data

In [None]:
#nltk.download('punkt')
# Unigram bag-of-words counts. The vocabulary is learned from the training rows
# only (fit_transform) and then applied unchanged to the test rows (transform).
# Column 0 is the only non-label column of train_dat/test_dat.
vectorizer_count= CountVectorizer()
cv_train_tokens = vectorizer_count.fit_transform(train_dat[:,0])
cv_test_tokens = vectorizer_count.transform(test_dat[:,0])

In [None]:
# Sanity check: the row count of each count matrix should equal the number of labels.
print(cv_train_tokens.shape, len(train_lab))
print(cv_test_tokens.shape, len(test_lab))

(40051, 92899) 40051
(9949, 92899) 9949


In [None]:
# Shapes of the raw text splits currently bound to train_dat/test_dat.
print(train_dat.shape, test_dat.shape)

(40051, 1) (9949, 1)


Logistic Regression Classifier

In [None]:
# Fit logistic regression on the unigram count features and predict the test split.
# All warnings raised inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_true_count = test_lab
    clf_lr_count = LogisticRegression(random_state=0, solver="lbfgs")
    # fit() returns the estimator itself, so clf_fit_count is the same object.
    clf_fit_count = clf_lr_count.fit(cv_train_tokens, train_lab)
    y_pred_count = clf_fit_count.predict(cv_test_tokens)

In [None]:
# Test accuracy of the most recently run count-vectorizer classifier cell.
print(accuracy_score(y_true_count, y_pred_count))

0.8926525278922505


## Baseline classifier using Count Vectorizer on 20% of the Data

In [None]:
# NOTE(review): this reads the same train_dat/test_dat names as the full-data section.
# The split cell builds them from `data`, and its comment says to switch `data` to
# dataset_prc20 (and re-run the split) to get the sampled dataset.
vectorizer_count= CountVectorizer()
cv_train_tokens_sample = vectorizer_count.fit_transform(train_dat[:,0])
cv_test_tokens_sample = vectorizer_count.transform(test_dat[:,0])

In [None]:
# Fit logistic regression on the "_sample" count features and predict the test split.
# All warnings raised inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_true_count = test_lab
    clf_lr_count = LogisticRegression(random_state=0, solver="lbfgs")
    # fit() returns the estimator itself, so clf_fit_count is the same object.
    clf_fit_count = clf_lr_count.fit(cv_train_tokens_sample, train_lab)
    y_pred_count = clf_fit_count.predict(cv_test_tokens_sample)

In [None]:
# Test accuracy of the most recently run count-vectorizer classifier cell.
print(accuracy_score(y_true_count, y_pred_count))

0.8732251521298174


## Baseline Classifier using TFIDF on full Data

In [None]:
# TF-IDF features tokenized with NLTK's word_tokenize (needs the 'punkt' resource).
# The vocabulary and IDF weights are learned from the training rows only and then
# applied unchanged to the test rows.
nltk.download('punkt')
vectorizer_tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize)
tf_train_tokens = vectorizer_tfidf.fit_transform(train_dat[:,0])
tf_test_tokens = vectorizer_tfidf.transform(test_dat[:,0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Sanity check: the row count of each TF-IDF matrix should equal the number of labels.
print(tf_train_tokens.shape, len(train_lab))
print(tf_test_tokens.shape, len(test_lab))

(40051, 149233) 40051
(9949, 149233) 9949


Logistic Regression Classifier

In [None]:
# Fit logistic regression on the TF-IDF features and predict the test split.
# All warnings raised inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_true_tfidf = test_lab
    clf_lr_tfidf = LogisticRegression(random_state=0, solver="lbfgs")
    # fit() returns the estimator itself, so clf_fit_tfidf is the same object.
    clf_fit_tfidf = clf_lr_tfidf.fit(tf_train_tokens, train_lab)
    y_pred_tfidf = clf_fit_tfidf.predict(tf_test_tokens)

In [None]:
# Test accuracy of the most recently run TF-IDF classifier cell.
print(accuracy_score(y_true_tfidf, y_pred_tfidf))

0.9027037893255604


## Baseline Classifier using TFIDF on 20% of the Data

In [None]:
# NOTE(review): this reads the same train_dat/test_dat names as the full-data section.
# The split cell builds them from `data`, and its comment says to switch `data` to
# dataset_prc20 (and re-run the split) to get the sampled dataset.
nltk.download('punkt')
vectorizer_tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize)
tf_train_tokens_sample = vectorizer_tfidf.fit_transform(train_dat[:,0])
tf_test_tokens_sample = vectorizer_tfidf.transform(test_dat[:,0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Sanity check for the 20% section: report the "_sample" matrices built just above.
# This cell previously printed tf_train_tokens/tf_test_tokens, the full-data matrices,
# so the row counts did not match the label counts printed next to them.
print(tf_train_tokens_sample.shape, len(train_lab))
print(tf_test_tokens_sample.shape, len(test_lab))

(40051, 2017959) 8028
(9949, 2017959) 1972


In [None]:
# Fit logistic regression on the "_sample" TF-IDF features and predict the test split.
# All warnings raised inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_true_tfidf = test_lab
    clf_lr_tfidf = LogisticRegression(random_state=0, solver="lbfgs")
    # fit() returns the estimator itself, so clf_fit_tfidf is the same object.
    clf_fit_tfidf = clf_lr_tfidf.fit(tf_train_tokens_sample, train_lab)
    y_pred_tfidf = clf_fit_tfidf.predict(tf_test_tokens_sample)

In [None]:
# Test accuracy of the most recently run TF-IDF classifier cell.
print(accuracy_score(y_true_tfidf, y_pred_tfidf))

0.8737322515212982


##### Checking run time on a 1000-review sample (note: the cells below time the TF-IDF + logistic-regression pipeline, not RoBERTa, the best-performing model according to the final results)


In [None]:
# We sample 1000 data points randomly and then run the algorithm on those points.
# random_state pins the draw; without it the rows picked depend on the global NumPy
# RNG state at the moment the cell runs, so the timing subset changes with execution order.
data_1000_sample = np.array(dataset.sample(1000, random_state=22))
data_1000_sample.shape

(100, 2)

In [None]:
# NOTE(review): this rebinds data/training/testing/train_dat/train_lab/test_dat/test_lab,
# the same names the full-data sections read, so cells that run after this one see the
# 1000-row split until the full split cell is re-run.
data = data_1000_sample
# Splitter returns the (train, test) pair, so call it once and unpack instead of
# running the whole split twice and indexing each result.
training, testing = Splitter(data)
# The label is the last column; everything before it is the model input.
train_dat, train_lab = training[:, :-1], training[:, -1].astype('int')
test_dat, test_lab = testing[:, :-1], testing[:, -1].astype('int')

In [None]:
# Times the TF-IDF + logistic-regression pipeline on the current train/test split.
# Training and inference are timed separately. A single timer used to wrap the punkt
# download, vectorizer fitting and model fitting as well, and the per-datapoint figure
# divided by 1000 although predictions are only made for the test split.
nltk.download('punkt')  # resource fetch, kept outside the timed regions

start_time = time.time()

# --- training: learn the vocabulary/IDF weights and fit the classifier ---
vectorizer_tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize)
tf_train_tokens_sample = vectorizer_tfidf.fit_transform(train_dat[:,0])

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf_lr_tfidf = LogisticRegression(solver = "lbfgs",
                                      random_state=0)
    clf_fit_tfidf = clf_lr_tfidf.fit(tf_train_tokens_sample,train_lab)

train_end_time = time.time()

# --- inference: vectorize the held-out reviews and predict their labels ---
tf_test_tokens_sample = vectorizer_tfidf.transform(test_dat[:,0])
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_pred_tfidf = clf_lr_tfidf.predict(tf_test_tokens_sample)
y_true_tfidf = test_lab

end_time = time.time()

train_seconds = train_end_time - start_time
inference_seconds = end_time - train_end_time
n_inferred = len(test_lab)  # number of datapoints actually predicted

print(f'\n Training ran for {round(train_seconds, 2)} seconds for {len(train_lab)} datapoints')
print(f'\n Inference ran for {round(inference_seconds, 2)} seconds for {n_inferred} datapoints')
# max(..., 1) avoids a ZeroDivisionError if the test split happens to be empty.
print(f' \n For 1 datapoint inference ran for {round(inference_seconds / max(n_inferred, 1), 4)} seconds')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

 Inference ran for 13 seconds for 1000 datapoints
 
 For 1 datapoint inference ran for 0.01 seconds


## Bigram TF-IDF and CountVectorizer features on the full dataset

Initialize Vectorizer

In [None]:
# Bigram-only counts (ngram_range=(2,2)). This overwrites the unigram
# vectorizer_count/cv_train_tokens/cv_test_tokens from the earlier section.
# NOTE(review): in top-to-bottom order train_dat/test_dat were last assigned by the
# 1000-row timing split, not the full dataset — re-run the full split before this cell.
vectorizer_count= CountVectorizer(ngram_range=(2,2))
cv_train_tokens = vectorizer_count.fit_transform(train_dat[:,0])
cv_test_tokens = vectorizer_count.transform(test_dat[:,0])

Classifier

In [None]:
# Fit logistic regression on the bigram count features and predict the test split.
# All warnings raised inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_true_count = test_lab
    clf_lr_count = LogisticRegression(random_state=0, solver="lbfgs")
    # fit() returns the estimator itself, so clf_fit_count is the same object.
    clf_fit_count = clf_lr_count.fit(cv_train_tokens, train_lab)
    y_pred_count = clf_fit_count.predict(cv_test_tokens)

In [None]:
# Compute the bigram count-vectorizer accuracy once, keep it, and report it.
count_bgram_acc = accuracy_score(y_true_count, y_pred_count)
print('accuracy socre for CountVecotrizer is:', count_bgram_acc)

accuracy socre for CountVecotrizer is: 0.8978791838375716


Initialize Vectorizer

In [None]:
# Bigram-only TF-IDF (ngram_range=(2,2)) with the vectorizer's default tokenizer.
# This overwrites vectorizer_tfidf/tf_train_tokens/tf_test_tokens from the earlier section.
# NOTE(review): in top-to-bottom order train_dat/test_dat were last assigned by the
# 1000-row timing split, not the full dataset — re-run the full split before this cell.
vectorizer_tfidf = TfidfVectorizer(use_idf=True, ngram_range=(2,2))
tf_train_tokens = vectorizer_tfidf.fit_transform(train_dat[:,0])
tf_test_tokens = vectorizer_tfidf.transform(test_dat[:,0])

Classifier

In [None]:
# Fit logistic regression (multinomial formulation) on the bigram TF-IDF features
# and predict the test split. All warnings raised inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_true_tfidf = test_lab
    clf_lr_tfidf = LogisticRegression(
        random_state=0, multi_class='multinomial', solver="lbfgs"
    )
    # fit() returns the estimator itself, so clf_fit_tfidf is the same object.
    clf_fit_tfidf = clf_lr_tfidf.fit(tf_train_tokens, train_lab)
    y_pred_tfidf = clf_fit_tfidf.predict(tf_test_tokens)

In [None]:
# Compute the bigram TF-IDF accuracy once, keep it, and report it.
tfidf_bgram_acc = accuracy_score(y_true_tfidf, y_pred_tfidf)
print('accuracy socre is for TFIDF:', tfidf_bgram_acc)

accuracy socre is for TFIDF: 0.8956679063222435


## Bigram TF-IDF and CountVectorizer features on 20% of the data

Initialize Vectorizer

In [None]:
# Bigram-only counts for the 20% run.
# NOTE(review): this reads the shared train_dat/test_dat names. They hold the 20% sample
# only if the split cell was re-run with `data` set to dataset_prc20, as its comment says.
vectorizer_count= CountVectorizer(ngram_range=(2,2))
cv_train_tokens_sample = vectorizer_count.fit_transform(train_dat[:,0])
cv_test_tokens_sample = vectorizer_count.transform(test_dat[:,0])

Classifier

In [None]:
# Fit logistic regression on the "_sample" bigram count features and predict the
# test split. All warnings raised inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_true_count = test_lab
    clf_lr_count = LogisticRegression(random_state=0, solver="lbfgs")
    # fit() returns the estimator itself, so clf_fit_count is the same object.
    clf_fit_count = clf_lr_count.fit(cv_train_tokens_sample, train_lab)
    y_pred_count = clf_fit_count.predict(cv_test_tokens_sample)

In [None]:
# Compute the bigram count-vectorizer accuracy once, keep it, and report it.
count_bgram_acc = accuracy_score(y_true_count, y_pred_count)
print('accuracy socre for CountVecotrizer on 20% of data is:', count_bgram_acc)

accuracy socre for CountVecotrizer on 20% of data is: 0.8483772819472617


Initialize Vectorizer

In [None]:
# Bigram-only TF-IDF for the 20% run.
# NOTE(review): this reads the shared train_dat/test_dat names. They hold the 20% sample
# only if the split cell was re-run with `data` set to dataset_prc20, as its comment says.
vectorizer_tfidf = TfidfVectorizer(use_idf=True, ngram_range=(2,2))
tf_train_tokens_sample = vectorizer_tfidf.fit_transform(train_dat[:,0])
tf_test_tokens_sample = vectorizer_tfidf.transform(test_dat[:,0])

Classifier

In [None]:
# Fit logistic regression (multinomial formulation) on the "_sample" bigram TF-IDF
# features and predict the test split. All warnings raised inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_true_tfidf = test_lab
    clf_lr_tfidf = LogisticRegression(
        random_state=0, multi_class='multinomial', solver="lbfgs"
    )
    # fit() returns the estimator itself, so clf_fit_tfidf is the same object.
    clf_fit_tfidf = clf_lr_tfidf.fit(tf_train_tokens_sample, train_lab)
    y_pred_tfidf = clf_fit_tfidf.predict(tf_test_tokens_sample)

In [None]:
# Compute the bigram TF-IDF accuracy once, keep it, and report it.
tfidf_bgram_acc = accuracy_score(y_true_tfidf, y_pred_tfidf)
print('accuracy socre is for TFIDF on 20% of data is:', tfidf_bgram_acc)

accuracy socre is for TFIDF on 20% of data is: 0.8635902636916836
