<a href="https://colab.research.google.com/github/katearb/Data-Science-Notebooks/blob/master/2_Vector_Representations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers

In [None]:
import re

import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import utils, models
import gensim.parsing.preprocessing as gsp

import torch
from torch import nn
from torch.utils.data import TensorDataset, random_split, DataLoader
import torch.optim as optim

from transformers import BertTokenizer, BertModel

## Data Preparation

In [None]:
data = pd.read_csv('https://raw.githubusercontent.com/katearb/files/main/jigsaw-toxic-comment-train.csv/jigsaw-toxic-comment-train.csv')

In [None]:
data['toxic'].value_counts()

0    202165
1     21384
Name: toxic, dtype: int64

In [None]:
# downsampling
false_downsample = resample(data[data['toxic'] == 0],
             replace=True,
             n_samples=len(data[data['toxic'] == 1]) * 2,
             random_state=42)

data = false_downsample.append(data[data['toxic'] == 1])

In [None]:
# select required columns
data = data[['comment_text', 'toxic']]

In [None]:
data.shape

(64152, 2)

In [None]:
data[:10]

Unnamed: 0,comment_text,toxic
134944,"""\n\n \n\nYour request to be unblocked has be...",0
162417,::on that cell - it appears that it made it fr...,0
145873,"""\n Your submission at Articles for creation \...",0
114692,Maybe something more like ?,0
132623,United Kingdom Location\nA British company bas...,0
121978,"""While we can debate all day on whether the no...",0
60771,Please also see: wp:CLAIM. —AsteriskSplat→,0
151880,"""\n\nDohn, see my reply to Powers about Google...",0
186048,""", 7 September 2012 (UTC) \n :::That's good ) ...",0
96821,"""\n\n \n\nYour request to be unblocked has be...",0


### Data split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['comment_text'], data['toxic'], 
                                                    test_size=0.2, stratify=data['toxic'], 
                                                    random_state=42, shuffle=True)

### Data Preprocessing

In [None]:
class Cleaner(BaseEstimator, TransformerMixin):
  """Stateless scikit-learn transformer that normalises text with a chain of filters.

  Every document is converted to ``str``, lower-cased, passed through
  ``utils.to_unicode`` and then through each callable in ``filters``, in order.

  Parameters
  ----------
  filters : iterable of callables
      Functions taking a string and returning a string.
  """

  def __init__(self, filters):
    self.filters = filters

  def fit(self, X: pd.Series, y=None):
    """Do nothing and return ``self``; ``y`` is accepted so ``fit(X, y)`` works in a Pipeline."""
    return self

  def _transform_doc(self, s: str) -> str:
    """Apply lower-casing, unicode conversion and every filter to one document."""
    s = str(s).lower() # lower case for all words
    s = utils.to_unicode(s)
    for f in self.filters:
        s = f(s)
    return s

  def transform(self, X: pd.Series) -> pd.Series:
    """Clean every document of ``X`` (with a progress bar).

    Returns
    -------
    pd.Series
        The cleaned documents with a fresh default index; the index of ``X``
        is not carried over.
    """
    clean_X = [self._transform_doc(doc) for doc in tqdm(X)]
    return pd.Series(clean_X)


class Word2Vec(BaseEstimator, TransformerMixin):
  """Scikit-learn transformer that represents a document by the mean of its word vectors.

  Parameters
  ----------
  model : optional
      Object supporting ``model[word]`` that raises ``KeyError`` for unknown words.
      When omitted, the notebook-level ``word2vec`` variable is looked up at
      transform time (the original behaviour).
  """

  def __init__(self, model=None):
    self.model = model

  def fit(self, X: pd.Series, y=None):
    """Do nothing and return ``self``; ``y`` is accepted for Pipeline compatibility."""
    return self

  def _get_model(self):
    """Return the explicitly supplied model, or fall back to the global ``word2vec``."""
    return self.model if self.model is not None else word2vec

  def _transform_doc(self, doc):
    """Average the vectors of all in-vocabulary words of ``doc`` (split on single spaces).

    Returns an empty Series when no word of the document is in the vocabulary,
    which becomes an all-NaN row once the results are assembled into a DataFrame.
    """
    model = self._get_model()
    vectors = []
    for word in doc.split(' '):
      try:
        vectors.append(model[word])
      except KeyError:
        # Out-of-vocabulary word (or empty token): skip it. Any other error,
        # e.g. a model that was never loaded, is no longer silently swallowed.
        continue
    if not vectors:
      return pd.Series(dtype='float64')
    # One vectorised mean instead of growing a DataFrame row by row.
    return pd.Series(np.mean(vectors, axis=0))

  def transform(self, X: pd.Series):
    """Return a list with one averaged vector (``pd.Series``) per document of ``X``."""
    data_vectors = [self._transform_doc(doc) for doc in tqdm(X)]

    return data_vectors


def finetune_logreg(X_train_vectors, y_train, parameters, n_iter=10, random_state=42):
  """Tune a logistic regression with a randomized search and return the best model.

  Parameters
  ----------
  X_train_vectors : array-like or sparse matrix
      Training features.
  y_train : array-like
      Training labels; a one-column DataFrame is flattened to 1-D.
  parameters : dict
      Parameter distributions / lists passed to ``RandomizedSearchCV``.
  n_iter : int, default 10
      Number of sampled parameter settings.
  random_state : int, default 42
      Seed for the parameter sampling and the estimator, so reruns pick the
      same candidates.

  Returns
  -------
  The estimator refitted on the whole training set with the best ROC-AUC
  over a 5-fold stratified cross-validation.
  """
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  random_search = RandomizedSearchCV(LogisticRegression(random_state=random_state), parameters,
                                     n_jobs=-1, verbose=3,
                                     scoring=['roc_auc'],
                                     cv=cv, n_iter=n_iter, refit='roc_auc',
                                     random_state=random_state)

  # Labels read back from CSV arrive as an (n, 1) DataFrame; sklearn expects shape (n,).
  random_search.fit(X_train_vectors, np.ravel(y_train))
  return random_search.best_estimator_

def calculate_roc_auc(y_true, y_pred_proba):
  """Compute ROC-AUC for the given scores, print it, and hand it back to the caller."""
  roc_auc = roc_auc_score(y_true, y_pred_proba)
  print('ROC-AUC score:', roc_auc)
  return roc_auc

  from ipykernel import kernelapp as app


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# TF-IDF

In [None]:
# data preparation pipeline
filters = [
           gsp.strip_tags,  # remove tags 
           gsp.strip_punctuation,  # remove punctuation
           gsp.strip_multiple_whitespaces,  # standarize the spaces 
           gsp.strip_numeric,
           gsp.remove_stopwords,  # stopwords  
           gsp.strip_short,  # delete words with len < 3
           gsp.stem_text  # stemming 
          ]

prep_pipeline = Pipeline(steps=[
                                ('cleaner', Cleaner(filters)),
                                ('vectorizer', TfidfVectorizer())
                              ]
                          )
prep_pipeline.fit(X_train)

In [None]:
# prepare data
X_train_tfidf = prep_pipeline.transform(X_train)
X_test_tfidf = prep_pipeline.transform(X_test)

In [None]:
# fit log reg
parameters = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 10),
    'solver' : ['liblinear'],
}

log_clf = finetune_logreg(X_train_tfidf, y_train, parameters)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# score the test dataset
y_pred_proba = log_clf.predict_proba(X_test_tfidf)
tfidf_rocauc = calculate_roc_auc(y_test, y_pred_proba[:, 1])

ROC-AUC score: 0.9635949616921564


# Word2vec

In [None]:
# Download the pre-trained GoogleNews vectors (`-c` resumes a partial download).
# The paths used below (/content/...) imply Google Colab, where `wget` is already
# available and Homebrew is not, so the former `brew install wget` step was dropped.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
!gzip -d GoogleNews-vectors-negative300.bin.gz

In [None]:
word2vec = models.KeyedVectors.load_word2vec_format(
    '/content/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# data preparation pipeline
filters = [
           gsp.strip_tags,  # remove tags 
           gsp.strip_punctuation,  # remove punctuation
           gsp.strip_multiple_whitespaces,  # standarized the spaces 
           gsp.strip_numeric,
           gsp.remove_stopwords,  # stop words  
           gsp.strip_short,  # delete words with len < 3
           gsp.stem_text  # stemming 
          ]

prep_pipeline = Pipeline(steps=[
                                ('cleaner', Cleaner(filters)),
                                ('word2vec', Word2Vec())
                              ]
                          )

In [None]:
# vectorize and save train data

# X_train_vectors = prep_pipeline.fit_transform(X_train)

# nan_ids = X_train_vectors[X_train_vectors.isnull().any(1)].index
# X_train_vectors = X_train_vectors[[i not in nan_ids for i in range(len(X_train_vectors))]]
# y_train_w2v = y_train[[i not in nan_ids for i in range(len(y_train))]]

# X_train_vectors.to_csv('/content/drive/MyDrive/embeds/word2vec_vectors_train')
# y_train_w2v.to_csv('/content/drive/MyDrive/embeds/word2vec_y_train')

In [None]:
# load train data
X_train_vectors = pd.read_csv('/content/drive/MyDrive/embeds/word2vec_vectors_train', index_col=0)
y_train_w2v = pd.read_csv('/content/drive/MyDrive/embeds/word2vec_y_train', index_col=0)

In [None]:
# vectorize and save test data

# X_test_vectors = prep_pipeline.fit_transform(X_test)

# nan_ids = X_test_vectors[X_test_vectors.isnull().any(1)].index
# X_test_vectors = X_test_vectors[[i not in nan_ids for i in range(len(X_test_vectors))]]
# y_test_w2v = y_test[[i not in nan_ids for i in range(len(y_test))]]

# X_test_vectors.to_csv('/content/drive/MyDrive/embeds/word2vec_vectors_test')
# y_test_w2v.to_csv('/content/drive/MyDrive/embeds/word2vec_y_test')

In [None]:
# load test data
X_test_vectors = pd.read_csv('/content/drive/MyDrive/embeds/word2vec_vectors_test', index_col=0)
y_test_w2v = pd.read_csv('/content/drive/MyDrive/embeds/word2vec_y_test', index_col=0)

In [None]:
# fit log reg

parameters = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 10),
    'solver' : ['liblinear'],
}

log_clf = finetune_logreg(X_train_vectors, y_train_w2v, parameters)

In [None]:
# score the test dataset
y_pred_proba = log_clf.predict_proba(X_test_vectors)
word2vec_rocauc = calculate_roc_auc(y_test_w2v, y_pred_proba[:, 1])

ROC-AUC score: 0.9317629582827421


# Fine-tune BERT

In [None]:
def validate_bert(val_loader, net, criterion):
  """Evaluate ``net`` on ``val_loader`` without gradient tracking.

  The network is switched to evaluation mode for the duration of the call
  (so dropout does not perturb the metrics) and restored to its previous
  mode afterwards.

  Returns
  -------
  (total_acc_val, total_loss_val)
      ``total_acc_val`` is the number of rounded output ELEMENTS equal to the
      corresponding label elements, summed over all batches (not a ratio);
      ``total_loss_val`` is the sum of the per-batch loss values.
  """
  print('**Validation**')
  total_acc_val = 0
  total_loss_val = 0

  was_training = net.training
  net.eval()
  try:
    with torch.no_grad():
      for seq, attn_masks, labels in tqdm(val_loader):
        labels = labels.float()
        seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

        output = net(seq, attn_masks).squeeze(1)

        batch_loss = criterion(output, labels)
        total_loss_val += batch_loss.item()
        # Vectorised rounding on the device; replaces a per-element Python loop.
        acc = (output.round() == labels).sum().item()
        total_acc_val += acc
  finally:
    # Leave the module in whatever mode the caller had it in.
    net.train(was_training)

  return total_acc_val, total_loss_val

def train_bert(net, criterion, opti, train_loader, val_loader, epochs, path):
    """Train ``net`` for ``epochs`` epochs, validating and checkpointing after each one.

    Parameters
    ----------
    net : nn.Module
        Called as ``net(seq, attn_masks)``.
    criterion, opti
        Loss module and optimizer.
    train_loader, val_loader
        Loaders yielding ``(seq, attn_masks, labels)`` batches.
    epochs : int
        Number of passes over ``train_loader``.
    path : str
        Prefix of the checkpoint files; epoch ``ep`` is saved to ``f'{path}epoch{ep}'``.

    After every epoch the mean batch loss and the element-wise accuracy
    (a value in [0, 1]) are printed for the training and validation data.
    """
    net = net.to(device)
    criterion = criterion.to(device)
    for ep in range(epochs):
      total_acc_train = 0
      total_loss_train = 0
      n_train_elements = 0  # number of label elements seen this epoch
      n_train_samples = 0   # number of samples seen this epoch
      print(f'**Epoch {ep}**')
      net.train()  # make sure dropout etc. are in training mode for this epoch
      with torch.enable_grad():
        for seq, attn_masks, labels in tqdm(train_loader):
          opti.zero_grad()

          seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

          output = net(seq, attn_masks).squeeze(1)

          loss = criterion(output, labels)
          total_loss_train += loss.item()

          loss.backward()
          opti.step()

          # Count element-wise matches between rounded outputs and labels.
          acc = (output.detach().round() == labels).sum().item()
          total_acc_train += acc
          n_train_elements += labels.numel()
          n_train_samples += labels.size(0)

      total_acc_val, total_loss_val = validate_bert(val_loader, net, criterion)
      torch.save(net.state_dict(), f'{path}epoch{ep}')

      # validate_bert returns a count of matching label ELEMENTS, so the
      # validation denominator is (#validation samples) x (elements per sample).
      elements_per_sample = n_train_elements / max(n_train_samples, 1)
      n_val_elements = len(val_loader.dataset) * elements_per_sample
      print(
        f'Epochs: {ep + 1} | Train Loss: {total_loss_train / len(train_loader): .3f} \
        | Train Accuracy: {total_acc_train / max(n_train_elements, 1): .3f} \
        | Val Loss: {total_loss_val / len(val_loader): .3f} \
        | Val Accuracy: {total_acc_val / max(n_val_elements, 1): .3f}')
      

def test_bert(net, test_loader):
  """Score ``net`` on ``test_loader`` and return the ROC-AUC.

  Every sample of every batch is scored (previously only the first sample of
  each batch was used, which was correct only for ``batch_size=1``). The score
  of a sample is column 1 of the network output. The network is evaluated in
  eval mode and restored to its previous mode afterwards.

  Assumes each batch of labels holds one scalar label per sample.
  """
  pred, true = [], []
  was_training = net.training
  net.eval()  # dropout must be inactive while scoring
  try:
    with torch.no_grad():
      for seq, attn_masks, labels in tqdm(test_loader):
        seq, attn_masks = seq.to(device), attn_masks.to(device)
        output = net(seq, attn_masks).squeeze(1)
        true.extend(labels.float().reshape(-1).tolist())
        pred.extend(output[:, 1].tolist())
  finally:
    net.train(was_training)
  return calculate_roc_auc(true, pred)

In [None]:
class BertDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes sentences for ``bert-base-uncased`` on the fly.

    Parameters
    ----------
    X : iterable of str
        Sentences.
    y : iterable, optional
        One label per sentence. When omitted, items carry no label.
    maxlen : int, default 512
        Every item is padded or truncated to exactly this many tokens.
    """

    def __init__(self, X, y=None, maxlen=512):

        self.X = list(X)
        self.y = list(y) if y is not None else None

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)``, or ``(token_ids, attention_mask)``
        when the dataset was built without labels."""
        sentence = self.X[index]

        tokens = self.tokenizer.tokenize(sentence)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens))
        else:
            # Truncate but keep the closing [SEP] as the last token.
            tokens = tokens[:self.maxlen-1] + ['[SEP]']

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        # 1 for real tokens, 0 for padding; ask the tokenizer for the pad id
        # instead of assuming it is 0.
        attn_mask = (tokens_ids_tensor != self.tokenizer.pad_token_id).long()

        if self.y is None:
            # Unlabelled data: torch.tensor(None) would raise, so return inputs only.
            return tokens_ids_tensor, attn_mask

        return tokens_ids_tensor, attn_mask, torch.tensor(self.y[index])


class BertClassifier(nn.Module):
    """``bert-base-uncased`` encoder followed by a three-layer feed-forward head.

    The head maps the second element returned by the BERT layer (768 features)
    through 100 and 10 hidden units to 2 outputs, each squashed by a sigmoid.

    Parameters
    ----------
    freeze_bert : bool, default True
        When true, gradients are disabled for every parameter of the BERT layer,
        so only the head is trained.
    """

    def __init__(self, freeze_bert=True):
        super().__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        if freeze_bert:
            # Same effect as setting requires_grad = False on each parameter.
            self.bert_layer.requires_grad_(False)

        # Head layers are created in this order on purpose (weight-init RNG order).
        self.fc1 = nn.Linear(768, 100)
        self.fc2 = nn.Linear(100, 10)
        self.fc3 = nn.Linear(10, 2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, seq, attn_masks):
        """Return sigmoid-activated outputs of shape ``(batch, 2)`` for the given token ids and masks."""
        bert_outputs = self.bert_layer(input_ids=seq, attention_mask=attn_masks,
                                       return_dict=False)
        pooled = bert_outputs[1]

        hidden = self.relu(self.fc1(pooled))
        hidden = self.relu(self.fc2(hidden))
        scores = self.fc3(hidden)

        return self.sigmoid(scores)

In [None]:
# define cleaner
filters = [
           gsp.strip_tags,  # remove tags 
           gsp.strip_punctuation,  # remove punctuation
           gsp.strip_multiple_whitespaces,  # standarized the spaces 
           gsp.strip_numeric,
           gsp.remove_stopwords,  # stop words
          ]
cleaner = Cleaner(filters)

In [None]:
# prepare data
y_train_bert = [[0., 1.] if label == 1 else [1., 0.] for label in y_train]
X_train_bert, X_val_bert, y_train_bert, y_val_bert = train_test_split(X_train, y_train_bert, 
                                                    test_size=0.1, stratify=y_train_bert, 
                                                    random_state=42)

X_train_bert = cleaner.fit_transform(X_train_bert)
X_val_bert = cleaner.fit_transform(X_val_bert)
X_test_bert = cleaner.fit_transform(X_test)

100%|██████████| 46188/46188 [00:03<00:00, 14300.37it/s]
100%|██████████| 5133/5133 [00:00<00:00, 14701.20it/s]
100%|██████████| 12831/12831 [00:00<00:00, 13672.04it/s]


In [None]:
# create loaders
train_set = BertDataset(X_train_bert, y_train_bert, maxlen=256)
val_set = BertDataset(X_val_bert, y_val_bert, maxlen=256)
test_set = BertDataset(X_test_bert,  y_test, maxlen=256)

train_loader = DataLoader(train_set, batch_size=8, num_workers=2)
val_loader = DataLoader(val_set, batch_size=8, num_workers=2)
test_loader = DataLoader(test_set, batch_size=1, num_workers=2)

In [None]:
# define model, optimizer, criterion
bert_clf = BertClassifier(freeze_bert=True)
criterion = nn.BCELoss()
opti = optim.Adam(bert_clf.parameters(), lr=0.001)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# load trained model
# bert_clf.load_state_dict(torch.load(r'/content/drive/MyDrive/bert_classifier/epoch1'))
# bert_clf.to(device)
# bert_clf.eval()

In [None]:
# train bert_clf
train_bert(bert_clf, criterion, opti, train_loader, val_loader, 2, '/content/drive/MyDrive/bert_classifier/')

**Epoch 0**


100%|██████████| 5774/5774 [12:15<00:00,  7.85it/s]


**Validation**


100%|██████████| 642/642 [01:21<00:00,  7.89it/s]


Epochs: 3 | Train Loss:  0.337         | Train Accuracy:  27.330         | Val Loss:  0.345         | Val Accuracy:  27.293
**Epoch 1**


100%|██████████| 5774/5774 [12:15<00:00,  7.85it/s]


**Validation**


100%|██████████| 642/642 [01:21<00:00,  7.87it/s]


Epochs: 3 | Train Loss:  0.329         | Train Accuracy:  27.468         | Val Loss:  0.340         | Val Accuracy:  27.305


In [None]:
# test bert_clf
bert_rocauc = test_bert(bert_clf, test_loader)

100%|██████████| 12831/12831 [04:20<00:00, 49.33it/s]


ROC-AUC score: 0.9255199292571382


# FastText

In [None]:
!pip install fasttext

In [None]:
import fasttext

In [None]:
def test_fasttext(model, X_test_ft, y_test):
  """Score a fastText classifier on the test texts and return its ROC-AUC.

  The model's top prediction per text is turned into a class-1 score: the
  predicted score itself when the predicted label contains '1', otherwise
  its complement.
  """
  preds = model.predict(list(X_test_ft))
  predicted_labels, predicted_scores = preds[0], preds[1]

  preds_proba = []
  for label, score in zip(predicted_labels, predicted_scores):
    if '1' in label[0]:
      preds_proba.append(score)
    else:
      preds_proba.append(1 - score)

  return calculate_roc_auc(y_test, preds_proba)

In [None]:
# define cleaner
filters = [
           gsp.strip_tags,  # remove tags 
           gsp.strip_punctuation,  # remove punctuation
           gsp.strip_multiple_whitespaces,  # standarized the spaces 
           gsp.strip_numeric,
           gsp.remove_stopwords,  # stop words  
           gsp.strip_short,  # delete words with len < 3
          ]
cleaner = Cleaner(filters)

In [None]:
# prepare data
X_train_ft = cleaner.fit_transform(X_train)
X_test_ft = cleaner.fit_transform(X_test)

In [None]:
# create file with data
with open('ft_data_train', 'w') as f:
  f.write('\n'.join([f'__label__{label} {text}' for text, label in zip(X_train_ft, y_train)]))

In [None]:
# define and train fasttext model
fasttext_model = fasttext.train_supervised(input='ft_data_train')

In [None]:
# test fasttext model
fasttext_roc_auc = test_fasttext(fasttext_model, X_test_ft, y_test)

ROC-AUC score: 0.9553688927442155


# Sentiment analysis with TextBlob

In [None]:
from textblob import TextBlob

In [None]:
def textblob_sentiment(X, threshold):
  """Turn TextBlob sentiment polarity into hard predictions and two-column scores.

  Parameters
  ----------
  X : iterable of str
      Texts to analyse.
  threshold : float
      Polarity above which ``y_pred`` is 1.

  Returns
  -------
  y_pred : np.ndarray, shape (n,)
      1 where polarity > ``threshold``, else 0 (unchanged from the original).
  y_proba : np.ndarray, shape (n, 2)
      Column 1 is ``(1 - polarity) / 2`` and column 0 is its complement. With
      polarity in [-1, 1] both lie in [0, 1] and column 1 grows monotonically
      as the text gets more negative. The previous mapping was discontinuous:
      neutral text received the highest column-1 score, while strongly
      negative and strongly positive text both received 0.5.
  """
  y_pred, y_proba = [], []
  for text in list(X):
    pol = TextBlob(text).sentiment.polarity
    y_pred.append(1 if pol > threshold else 0)

    negative_score = (1 - pol) / 2
    y_proba.append([1 - negative_score, negative_score])

  # reshape keeps the (n, 2) shape for empty input so callers can index [:, 1].
  return np.array(y_pred), np.array(y_proba).reshape(-1, 2)

def evaluate_textblob(X, y, t=0):
  """Print and return the ROC-AUC of column 1 of the scores from ``textblob_sentiment(X, t)``."""
  _, probabilities = textblob_sentiment(X, t)
  class_one_scores = probabilities[:, 1]

  rocauc = roc_auc_score(y, class_one_scores)
  print('ROC-AUC score', rocauc)
  return rocauc

In [None]:
textblob_rocauc = evaluate_textblob(X_test, y_test)

ROC-AUC score 0.5752705214186468


0.5752705214186468

# Byte-pair embeddings (BPEmb)

In [None]:
!pip install bpemb

Collecting bpemb
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.2 MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.3 sentencepiece-0.1.96


In [None]:
from bpemb import BPEmb

In [None]:
def validate_bpemb(val_loader, net, criterion):
  """Evaluate ``net`` on ``val_loader`` without gradient tracking.

  The network is put in evaluation mode for the duration of the call and
  restored to its previous mode afterwards.

  Returns
  -------
  (total_acc_val, total_loss_val)
      ``total_acc_val`` is the number of rounded output ELEMENTS equal to the
      corresponding label elements, summed over all batches (not a ratio);
      ``total_loss_val`` is the sum of the per-batch loss values.
  """
  print('**Validation**')
  total_acc_val, total_loss_val = 0, 0

  was_training = net.training
  net.eval()
  try:
    with torch.no_grad():
      for texts, labels in tqdm(val_loader):
        # labels is already a tensor; re-wrapping it in torch.tensor() only
        # copies it and triggers a UserWarning.
        labels = labels.float().to(device)

        output = net(texts)

        batch_loss = criterion(output, labels)
        total_loss_val += batch_loss.item()

        # Vectorised rounding; replaces a per-element Python loop.
        acc = (output.round() == labels).sum().item()
        total_acc_val += acc
  finally:
    net.train(was_training)

  return total_acc_val, total_loss_val

def train_bpemb(net, criterion, opti, train_loader, val_loader, epochs, path):
    """Train ``net`` for ``epochs`` epochs, validating and checkpointing after each one.

    Parameters
    ----------
    net : nn.Module
        Called as ``net(texts)`` on a batch of raw texts.
    criterion, opti
        Loss module and optimizer.
    train_loader, val_loader
        Loaders yielding ``(texts, labels)`` batches.
    epochs : int
        Number of passes over ``train_loader``.
    path : str
        Prefix of the checkpoint files; epoch ``ep`` is saved to ``f'{path}epoch{ep}'``.

    After every epoch the mean batch loss and the element-wise accuracy
    (a value in [0, 1]) are printed for the training and validation data.
    """
    net = net.to(device)
    criterion = criterion.to(device)
    for ep in range(epochs):
      total_acc_train, total_loss_train = 0, 0
      n_train_elements, n_train_samples = 0, 0
      print(f'**Epoch {ep}**')
      net.train()
      with torch.enable_grad():
        for texts, labels in tqdm(train_loader):
          opti.zero_grad()

          labels = labels.float().to(device)

          output = net(texts)

          loss = criterion(output, labels)
          total_loss_train += loss.item()

          loss.backward()
          opti.step()

          # Count element-wise matches between rounded outputs and labels.
          acc = (output.detach().round() == labels).sum().item()
          total_acc_train += acc
          n_train_elements += labels.numel()
          n_train_samples += labels.size(0)

      total_acc_val, total_loss_val = validate_bpemb(val_loader, net, criterion)
      torch.save(net.state_dict(), f'{path}epoch{ep}')

      # validate_bpemb returns a count of matching label ELEMENTS, so the
      # validation denominator is (#validation samples) x (elements per sample).
      elements_per_sample = n_train_elements / max(n_train_samples, 1)
      n_val_elements = len(val_loader.dataset) * elements_per_sample
      print(
        f'Epochs: {ep + 1} | Train Loss: {total_loss_train / len(train_loader): .3f} \
        | Train Accuracy: {total_acc_train / max(n_train_elements, 1): .3f} \
        | Val Loss: {total_loss_val / len(val_loader): .3f} \
        | Val Accuracy: {total_acc_val / max(n_val_elements, 1): .3f}')
      

def test_bpemb(net, test_loader):
  """Score ``net`` on ``test_loader`` and return the ROC-AUC.

  Every sample of every batch is scored (previously only the first sample of
  each batch was used, which was correct only for ``batch_size=1``). The score
  of a sample is column 1 of the network output. The network is evaluated in
  eval mode and restored to its previous mode afterwards.

  Assumes each batch of labels holds one scalar label per sample.
  """
  pred, true = [], []
  was_training = net.training
  net.eval()
  try:
    with torch.no_grad():
      for texts, labels in tqdm(test_loader):
        output = net(texts)
        true.extend(labels.float().reshape(-1).tolist())
        pred.extend(output[:, 1].tolist())
  finally:
    net.train(was_training)
  return calculate_roc_auc(true, pred)

In [None]:
class BPEmbDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing raw texts with their labels by position."""

  def __init__(self, X, y):
    # Materialise both inputs so they can be indexed by integer position.
    self.texts, self.labels = list(X), list(y)

  def classes(self):
    """Return the stored list of labels."""
    return self.labels

  def __len__(self):
    """Number of texts in the dataset."""
    return len(self.texts)

  def get_batch_labels(self, idx):
    """Return the label stored at ``idx`` as a NumPy array."""
    label = self.labels[idx]
    return np.array(label)

  def get_batch_texts(self, idx):
    """Return the text stored at ``idx``."""
    return self.texts[idx]

  def __getitem__(self, idx):
    """Return the ``(text, label_array)`` pair at ``idx``."""
    return self.get_batch_texts(idx), self.get_batch_labels(idx)


class BPEmbClassifier(nn.Module):
    """Feed-forward classifier on top of averaged BPEmb subword embeddings.

    Parameters
    ----------
    lang : str, default "en"
        Language of the pre-trained BPEmb model.
    dim : int, default 300
        Embedding dimension; also the input width of the first linear layer.
    """

    def __init__(self, lang="en", dim=300):

        super(BPEmbClassifier, self).__init__()

        self.dim = dim
        self.bpemb_en = BPEmb(lang=lang, dim=dim)
        self.fc1 = nn.Linear(dim, 100)
        self.fc2 = nn.Linear(100, 10)
        self.fc3 = nn.Linear(10, 2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def _embed_text(self, text):
        """Return the mean subword embedding of ``text`` as a float32 vector of length ``dim``.

        A text that yields no subwords maps to the zero vector rather than to
        NaN (the mean of an empty array), which would poison the loss.
        """
        subword_vectors = np.asarray(self.bpemb_en.embed(text), dtype=np.float32)
        if len(subword_vectors) == 0:
            return np.zeros(self.dim, dtype=np.float32)
        return subword_vectors.mean(axis=0)

    def forward(self, input):
        """Return sigmoid-activated outputs of shape ``(batch, 2)`` for a batch of raw texts."""
        # Stack into one array first: building a tensor from a list of arrays is slow.
        features = np.stack([self._embed_text(text) for text in input])
        # Follow the module's own device instead of relying on a global variable.
        x = torch.from_numpy(features).to(self.fc1.weight.device)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)

        return self.sigmoid(x)

In [None]:
# define cleaner
def drop_empty(X):
  """Return a list with the items of ``X`` whose length is non-zero, in their original order."""
  kept = []
  for text in X:
    if len(text) == 0:
      continue
    kept.append(text)
  return kept

filters = [
           gsp.strip_tags,  # remove tags 
           gsp.strip_punctuation,  # remove punctuation
           gsp.strip_multiple_whitespaces,  # standarized the spaces 
           gsp.strip_numeric,
           gsp.remove_stopwords,  # stop words  
           gsp.stem_text  # stemming 
          ]

cleaner = Cleaner(filters)

In [None]:
# prepare data
y_train_bpemb = [[0., 1.] if label == 1 else [1., 0.] for label in y_train]
X_train_bpemb, X_val_bpemb, y_train_bpemb, y_val_bpemb = train_test_split(X_train, y_train_bpemb, 
                                                            test_size=0.2, stratify=y_train_bpemb, 
                                                            random_state=42)

X_train_bpemb = drop_empty(cleaner.fit_transform(X_train_bpemb))
X_val_bpemb = drop_empty(cleaner.fit_transform(X_val_bpemb))
X_test_bpemb = drop_empty(cleaner.fit_transform(X_test))

100%|██████████| 41056/41056 [00:11<00:00, 3444.22it/s]
100%|██████████| 10265/10265 [00:02<00:00, 3529.89it/s]
100%|██████████| 12831/12831 [00:03<00:00, 3308.94it/s]


In [None]:
# define loaders
train_bpemb_set = BPEmbDataset(X_train_bpemb, y_train_bpemb)
val_bpemb_set = BPEmbDataset(X_val_bpemb, y_val_bpemb)
test_bpemb_set = BPEmbDataset(X_test_bpemb, y_test)

train_loader = torch.utils.data.DataLoader(train_bpemb_set, batch_size=8)
val_loader = torch.utils.data.DataLoader(val_bpemb_set, batch_size=8)
test_loader = torch.utils.data.DataLoader(test_bpemb_set, batch_size=1)

In [None]:
# define model, optimizer, criterion
EPOCHS = 10
bpemb_clf = BPEmbClassifier().to(device)

criterion = nn.BCELoss()
opti = optim.Adam(bpemb_clf.parameters(), lr=0.01)

In [None]:
# train bpemb_clf
train_bpemb(bpemb_clf, criterion, opti, train_loader, val_loader, EPOCHS, '/content/drive/MyDrive/bpemd_classifier/')

**Epoch 0**


100%|██████████| 5129/5129 [00:31<00:00, 164.21it/s]


**Validation**


  import sys
100%|██████████| 1282/1282 [00:06<00:00, 208.17it/s]


Epochs: 11 | Train Loss:  0.638         | Train Accuracy:  21.306         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 1**


100%|██████████| 5129/5129 [00:31<00:00, 164.54it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 206.95it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.329         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 2**


100%|██████████| 5129/5129 [00:31<00:00, 164.38it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 205.27it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.328         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 3**


100%|██████████| 5129/5129 [00:31<00:00, 163.92it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 204.68it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.329         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 4**


100%|██████████| 5129/5129 [00:31<00:00, 164.50it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 204.10it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.330         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 5**


100%|██████████| 5129/5129 [00:31<00:00, 163.77it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 208.41it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.330         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 6**


100%|██████████| 5129/5129 [00:31<00:00, 163.17it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 208.65it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.330         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 7**


100%|██████████| 5129/5129 [00:31<00:00, 163.53it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 204.21it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.330         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 8**


100%|██████████| 5129/5129 [00:31<00:00, 163.12it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 202.05it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.330         | Val Loss:  0.637         | Val Accuracy:  21.329
**Epoch 9**


100%|██████████| 5129/5129 [00:31<00:00, 162.23it/s]


**Validation**


100%|██████████| 1282/1282 [00:06<00:00, 201.10it/s]


Epochs: 11 | Train Loss:  0.637         | Train Accuracy:  21.330         | Val Loss:  0.637         | Val Accuracy:  21.329


In [None]:
# test bpemb_clf
bp_rocauc = test_bpemb(bpemb_clf, test_loader)

100%|██████████| 12821/12821 [00:13<00:00, 958.34it/s]

ROC-AUC score: 0.5





# Comparison

In [None]:
results = [tfidf_rocauc, word2vec_rocauc, bert_rocauc, fasttext_roc_auc, textblob_rocauc, bp_rocauc]
names = ['TF-IDF & Log-Reg', 'Word2vec & Log-Reg', 'Bert', 'FastTexts', 'TextBlob (sentiment polarity)', 'BytePair-Embeddings']

pd.DataFrame(results, index=names, columns=['ROC-AUC']).sort_values(by=['ROC-AUC'], ascending=False)

Unnamed: 0,ROC-AUC
TF-IDF & Log-Reg,0.963595
FastTexts,0.955369
Word2vec & Log-Reg,0.931763
Bert,0.92552
TextBlob (sentiment polarity),0.575271
BytePair-Embeddings,0.5
