# Text Classification

In [None]:
# Colab setup: install Hugging Face transformers into the runtime.
# NOTE(review): the version is not pinned; the recorded output below resolved
# transformers 4.6.0 - consider pinning a version so re-runs are reproducible.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 5.9MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 33.4MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 35.8MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Instal

In [None]:
# Mount Google Drive; the dataset and the model files addressed through
# DIRNAME live there.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## Main

In [None]:
import time

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertModel,
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    AlbertTokenizer,
    AlbertForSequenceClassification,
)
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Report the runtime environment so recorded results can be tied to a setup.
print(f"Pytorch Version: {torch.__version__}")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
if torch.cuda.device_count() > 0:
    print(f"Found GPU at: {torch.cuda.get_device_name(0)}")

# Base folder for the data file and the model checkpoints used below.
# NOTE(review): this path is relative while Drive is mounted at the absolute
# "/content/drive", so it resolves only when the working directory is "/content".
DIRNAME = "drive/My Drive/Colab Notebooks/"

Pytorch Version: 1.8.1+cu101
Device: cuda
Found GPU at: Tesla K80


## Load data

In [None]:
# Load the tab-separated sentence / sentiment table from Drive.
data = pd.read_csv(DIRNAME + "data/financial_sentiment/adhoc_sentences.tsv", sep="\t")
data["sentence"] = data["sentence"].str.strip()  # strip leading and trailing whitespace
print(data.shape)
data.head()

(1000, 2)


Unnamed: 0,sentence,sentiment
0,In several projects the technical complexity h...,0
1,To return to profitability in this environment...,0
2,Investment advisory fees are expected to drop ...,1
3,The designated sponsor agreement with the BHF ...,0
4,"In addition, due to the persistence of the poo...",0


In [None]:
class CFG:
    """Configuration.

    Hyper-parameters and paths shared by the dataset construction, the
    training loop and the checkpoint reload step of this notebook.
    """
    max_len = 256  # passed to the tokenizer as max_length (truncation / padding target)
    batch_size = 8  # batch size of the train and validation DataLoaders
    epochs = 10  # upper bound on the number of epochs run by train_fn
    lr = 5e-6  # Adam learning rate; 1e-5 is noted as an alternative value
    patience = 5  # epochs without valid_loss improvement before train_fn stops
    n_classes = 2  # num_labels of the sequence-classification head
    # Where train_fn stores the state_dict of the best epoch; switch the active
    # line together with the tokenizer / model when fine-tuning another checkpoint.
    finetuned_model_path = DIRNAME + "models/finetuned_distilbert_fin.bin"
    # finetuned_model_path = DIRNAME + "models/finetuned_albert_fin.bin"
    # finetuned_model_path = DIRNAME + "models/finetuned_albert_large_fin.bin"


### Preparing the Dataset and Dataloader

In [None]:
# Tokenizer for the DistilBERT checkpoint that is fine-tuned below; the
# commented lines are the ALBERT alternatives.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")

# tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

# tokenizer = AlbertTokenizer.from_pretrained("albert-large-v2")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Every item is a dict holding ``"ids"`` and ``"mask"`` tensors built from
    the tokenizer's ``input_ids`` and ``attention_mask``. When ``labels`` is
    supplied, a ``"targets"`` tensor with the label stored at the same index
    is added.
    """

    def __init__(self, data, tokenizer, max_len, labels=None):
        """Keep references to the sentences, tokenizer and optional labels."""
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        # The size is captured once, at construction time.
        self.len = len(data)

    def __len__(self):
        """Return the number of sentences recorded at construction."""
        return self.len

    def __getitem__(self, index):
        """Tokenize the sentence at ``index`` and return it as long tensors."""
        encoding_options = dict(
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
        )
        # The second positional argument is deliberately None: a single
        # sentence is encoded, not a pair.
        encoded = self.tokenizer.encode_plus(
            self.data[index], None, **encoding_options)

        item = {
            key: torch.tensor(encoded[field], dtype=torch.long)
            for key, field in (("ids", "input_ids"), ("mask", "attention_mask"))
        }
        if self.labels is None:
            return item
        item["targets"] = torch.tensor(self.labels[index], dtype=torch.long)
        return item
    

In [None]:
# Hold out 20% of the rows for validation; random_state=0 fixes the split.
train_df, valid_df = train_test_split(data, test_size=0.2, random_state=0)

print(f"Train Dataset: {train_df.shape}")
print(f"Valid Dataset: {valid_df.shape}")

Train Dataset: (800, 2)
Valid Dataset: (200, 2)


In [None]:
# The columns are converted to plain lists because TextDataset indexes its
# inputs by position. Only the training loader shuffles; the validation
# loader keeps a fixed order.
train_data = TextDataset(
    train_df["sentence"].tolist(), tokenizer, CFG.max_len, labels=train_df["sentiment"].tolist())
train_loader = DataLoader(
    train_data, batch_size=CFG.batch_size, shuffle=True, num_workers=0)

valid_data = TextDataset(
    valid_df["sentence"].tolist(), tokenizer, CFG.max_len, labels=valid_df["sentiment"].tolist())
valid_loader = DataLoader(
    valid_data, batch_size=CFG.batch_size, shuffle=False, num_workers=0)

### Model

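 - We will be creating a neural network with the `DistillBERTClass` (kept commented out below for reference; the notebook currently instantiates `DistilBertForSequenceClassification` instead). 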
 - This network will have the DistilBERT Language model followed by a `dropout` and finally a `Linear` layer to obtain the final outputs. 
 - The data will be fed to the DistilBERT Language model as defined in the dataset. 
 - The final layer's outputs are what will be compared to the `encoded category` to determine the accuracy of the model's predictions. 
 - We will initiate an instance of the network called `model`. This instance will be used for training and then to save the final trained model for future inference.

In [None]:
# # Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

# class DistillBERTClass(torch.nn.Module):
#     def __init__(self):
#         super(DistillBERTClass, self).__init__()
#         self.l1 = DistilBertModel.from_pretrained("distilbert-base-cased")
#         self.l2 = torch.nn.Dropout(0.3)
#         self.l3 = torch.nn.Linear(768, 1)
    
#     def forward(self, ids, mask):
#         output_1 = self.l1(ids, mask)
#         output_2 = self.l2(output_1[0])
#         output = self.l3(output_2)
#         return output

In [None]:
# Pretrained DistilBERT encoder with a newly initialized classification head
# sized by CFG.n_classes; the commented blocks are the ALBERT alternatives.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-cased", num_labels=CFG.n_classes)

# model = AlbertForSequenceClassification.from_pretrained(
#     "albert-base-v2", num_labels=CFG.n_classes)

# model = AlbertForSequenceClassification.from_pretrained(
#     "albert-large-v2", num_labels=CFG.n_classes)

# Move the model to the selected device; as the last expression of the cell
# this also displays the module's repr.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
# param_optimizer = list(model.named_parameters())
# no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
# optimizer_grouped_parameters = [
#     {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#      "weight_decay": 0.01},
#     {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#      "weight_decay":0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

# total_steps = len(train_data_loader) * EPOCHS

# scheduler = get_linear_schedule_with_warmup(
#   optimizer,
#   num_warmup_steps=0,
#   num_training_steps=total_steps
# )

## Fine Tuning the Model

In [None]:
def _run_epoch(model, loader, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in train mode and updated after
    every batch; without one the model is put in eval mode and gradients
    are disabled.
    """
    is_training = optimizer is not None
    model.train(is_training)

    n_batches = len(loader)
    epoch_loss = 0.
    y_true = list()
    y_pred = list()

    if is_training:
        optimizer.zero_grad()

    for batch in loader:
        ids = batch["ids"].to(device)
        mask = batch["mask"].to(device)
        targets = batch["targets"].to(device)

        with torch.set_grad_enabled(is_training):
            outputs = model(ids, attention_mask=mask, labels=targets)
            loss, logits = outputs[:2]

        if is_training:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Mean of the per-batch losses (each batch weighs the same).
        epoch_loss += loss.item() / n_batches

        y_true.append(targets.cpu().numpy())
        y_pred.append(logits.detach().cpu().numpy().argmax(axis=1))

    accuracy = metrics.accuracy_score(
        np.concatenate(y_true), np.concatenate(y_pred))
    return epoch_loss, accuracy


def train_fn(cfg, model, train_loader, valid_loader, device):
    """Train function.

    Fine-tunes ``model`` with Adam for at most ``cfg.epochs`` epochs. After
    every epoch the validation loss drives a ReduceLROnPlateau scheduler,
    the checkpointing of the best ``state_dict`` to
    ``cfg.finetuned_model_path`` and early stopping.

    Args:
        cfg: Object exposing ``lr``, ``epochs``, ``patience`` and
            ``finetuned_model_path``.
        model: Model called as ``model(ids, attention_mask=mask,
            labels=targets)`` whose output starts with ``(loss, logits)``.
        train_loader: Batches (dicts with ``"ids"``, ``"mask"``,
            ``"targets"``) used for the optimization steps.
        valid_loader: Batches of the same form used for validation.
        device: Device every batch is moved to.

    Returns:
        None. Progress is printed and the best weights are written to disk;
        ``model`` itself keeps the weights of the last epoch that ran.
    """
    import os  # local import: only needed for the checkpoint folder below

    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", factor=0.5, patience=2, verbose=True, eps=1e-6)

    # Create the checkpoint folder up front so that a missing directory does
    # not abort the run only after the first full epoch has been computed.
    checkpoint_dir = os.path.dirname(cfg.finetuned_model_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    best_loss = np.inf
    counter = 0  # consecutive epochs without valid_loss improvement

    for epoch in range(cfg.epochs):

        start_time = time.time()

        train_loss, train_acc = _run_epoch(model, train_loader, device, optimizer)
        valid_loss, valid_acc = _run_epoch(model, valid_loader, device)

        scheduler.step(valid_loss)

        print(f"Epoch {epoch + 1}/{cfg.epochs}: elapsed time: {time.time() - start_time:.0f}s\n"
              f"  loss: {train_loss:.4f}  train_acc: {train_acc:.4f}"
              f" - valid_loss: {valid_loss:.4f}  valid_acc: {valid_acc:.4f}")

        if valid_loss < best_loss:
            print(f"Epoch {epoch + 1}: valid_loss improved from {best_loss:.5f} to {valid_loss:.5f}, "
                  f"saving model to {cfg.finetuned_model_path}")
            best_loss = valid_loss
            counter = 0
            torch.save(model.state_dict(), cfg.finetuned_model_path)
        else:
            print(f"Epoch {epoch + 1}: valid_loss did not improve from {best_loss:.5f}")
            counter += 1
            # ">=" rather than "==" so that a patience of 0 (or less) stops at
            # the first non-improving epoch instead of never stopping early.
            if counter >= cfg.patience:
                break

In [None]:
# Distilbert
# Fine-tune the model created above; the best epoch's weights are written to
# CFG.finetuned_model_path.
train_fn(CFG, model, train_loader, valid_loader, device)

Epoch 1/10: elapsed time: 41s
  loss: 0.0750  train_acc: 0.9812 - valid_loss: 0.5957  valid_acc: 0.8200
Epoch 1: valid_loss improved from inf to 0.59572, saving model to drive/My Drive/Colab Notebooks/models/finetuned_distilbert_fin.bin
Epoch 2/10: elapsed time: 41s
  loss: 0.0494  train_acc: 0.9838 - valid_loss: 0.6405  valid_acc: 0.7850
Epoch 2: valid_loss did not improve from 0.59572
Epoch 3/10: elapsed time: 41s
  loss: 0.0336  train_acc: 0.9938 - valid_loss: 0.6573  valid_acc: 0.8250
Epoch 3: valid_loss did not improve from 0.59572
Epoch     4: reducing learning rate of group 0 to 2.5000e-06.
Epoch 4/10: elapsed time: 41s
  loss: 0.0327  train_acc: 0.9925 - valid_loss: 0.6632  valid_acc: 0.8400
Epoch 4: valid_loss did not improve from 0.59572
Epoch 5/10: elapsed time: 41s
  loss: 0.0231  train_acc: 0.9950 - valid_loss: 0.6703  valid_acc: 0.8300
Epoch 5: valid_loss did not improve from 0.59572
Epoch 6/10: elapsed time: 41s
  loss: 0.0237  train_acc: 0.9938 - valid_loss: 0.6835  val

In [None]:
# Albert large v2
# NOTE(review): `train_step` is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. The recorded output below
# presumably comes from an earlier revision run with the ALBERT-large
# configuration; `train_fn` now runs the whole epoch loop itself.
best_loss = np.inf
for epoch in range(CFG.epochs):
    best_loss = train_step(epoch, best_loss)

Epoch 1/10: elapsed time: 79s
  loss: 0.6159  train_acc: 0.6750 - val_loss: 0.5092  val_acc: 0.7850
Epoch 1: val_loss improved from inf to 0.50921, saving model to drive/My Drive/Colab Notebooks/models/finetuned_albert_large_fin.bin
Epoch 2/10: elapsed time: 79s
  loss: 0.4908  train_acc: 0.7775 - val_loss: 0.4660  val_acc: 0.8000
Epoch 2: val_loss improved from 0.50921 to 0.46603, saving model to drive/My Drive/Colab Notebooks/models/finetuned_albert_large_fin.bin
Epoch 3/10: elapsed time: 79s
  loss: 0.3132  train_acc: 0.8812 - val_loss: 0.4550  val_acc: 0.8050
Epoch 3: val_loss improved from 0.46603 to 0.45497, saving model to drive/My Drive/Colab Notebooks/models/finetuned_albert_large_fin.bin
Epoch 4/10: elapsed time: 79s
  loss: 0.2130  train_acc: 0.9300 - val_loss: 0.4603  val_acc: 0.8200
Epoch 4: val_loss did not improve from 0.45497
Epoch 5/10: elapsed time: 79s
  loss: 0.1810  train_acc: 0.9375 - val_loss: 0.4797  val_acc: 0.8100
Epoch 5: val_loss did not improve from 0.45497

## Validating the Model

In [None]:
# Rebuild the architecture, then overwrite its weights with the state_dict
# that train_fn saved for the lowest valid_loss, and switch to eval mode.
# The architecture is hard-coded to DistilBERT here, so it has to be changed
# together with CFG.finetuned_model_path when another model is used.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-cased", num_labels=CFG.n_classes)
model.load_state_dict(torch.load(CFG.finetuned_model_path, map_location=device))
model.to(device)
model.eval()

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.w

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
def evaluate(model, valid_loader, device=None):
    """Collect targets and class-1 probabilities over a loader.

    Args:
        model: Module called as ``model(ids, attention_mask=mask)``; element
            0 of its output must hold the logits with classes on axis 1.
        valid_loader: Iterable of batches, each a dict with ``"ids"``,
            ``"mask"`` and ``"targets"`` tensors.
        device: Device the inputs are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none), so the
            function does not rely on a notebook-level ``device`` variable.

    Returns:
        Tuple ``(y_true, y_prob)`` of NumPy arrays concatenated over the
        batches: the targets and the softmax probability of class index 1.
        Both arrays are empty when the loader yields no batch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Dropout has to be off, otherwise the probabilities vary between calls.
    model.eval()

    y_true = list()
    y_prob = list()
    with torch.no_grad():
        for batch in valid_loader:
            ids = batch["ids"].to(device)
            mask = batch["mask"].to(device)

            logits = model(ids, attention_mask=mask)[0]
            probs = torch.softmax(logits, dim=1)

            # Targets are only collected, never fed to the model, so they do
            # not need a round trip through the device.
            y_true.append(batch["targets"].cpu().numpy())
            y_prob.append(probs[:, 1].cpu().numpy())

    if not y_true:
        # np.concatenate rejects an empty list.
        return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

    return np.concatenate(y_true), np.concatenate(y_prob)


def compute_metrics(y_true, y_prob, threshold=0.5):
    """Compute metrics.

    Prints accuracy, ROC AUC, average precision and scikit-learn's
    classification report for a binary problem.

    Args:
        y_true: Ground-truth labels (array-like).
        y_prob: Predicted probability of the positive class (array-like).
        threshold: A sample is predicted positive when its probability is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        None. The metrics are only printed, so calling this as the last
        expression of a cell adds no extra output.
    """
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_pred = (y_prob > threshold).astype(int)

    acc = metrics.accuracy_score(y_true, y_pred)
    # The ranking metrics use the raw probabilities, not the thresholded labels.
    roc_auc = metrics.roc_auc_score(y_true, y_prob)
    avg_prc = metrics.average_precision_score(y_true, y_prob)
    print(f"  Accuracy          = {acc:.4f}")
    print(f"  ROC AUC           = {roc_auc:.4f}")
    print(f"  Average precision = {avg_prc:.4f}\n")
    print(metrics.classification_report(y_true, y_pred, digits=4))


In [None]:
# Albert large v2
# NOTE(review): on a top-to-bottom run `model` is the DistilBERT checkpoint
# reloaded above; the recorded output below presumably comes from a run with
# the ALBERT-large-v2 model and checkpoint path selected.

y_true, y_prob = evaluate(model, valid_loader)
compute_metrics(y_true, y_prob)

  Accuracy          = 0.8050
  ROC AUC           = 0.8785
  Average precision = 0.8751

              precision    recall  f1-score   support

           0     0.8119    0.8039    0.8079       102
           1     0.7980    0.8061    0.8020        98

    accuracy                         0.8050       200
   macro avg     0.8049    0.8050    0.8050       200
weighted avg     0.8051    0.8050    0.8050       200



In [None]:
# Albert v2
# NOTE(review): on a top-to-bottom run `model` is the DistilBERT checkpoint
# reloaded above; the recorded output below presumably comes from a run with
# the ALBERT-base-v2 model and checkpoint path selected.

y_true, y_prob = evaluate(model, valid_loader)
compute_metrics(y_true, y_prob)

  Accuracy          = 0.7950
  ROC AUC           = 0.8919
  Average precision = 0.8692

              precision    recall  f1-score   support

           0     0.8211    0.7647    0.7919       102
           1     0.7714    0.8265    0.7980        98

    accuracy                         0.7950       200
   macro avg     0.7962    0.7956    0.7950       200
weighted avg     0.7967    0.7950    0.7949       200



In [None]:
# Distilbert
# Validation metrics of the reloaded best DistilBERT checkpoint.

y_true, y_prob = evaluate(model, valid_loader)
compute_metrics(y_true, y_prob)

  Accuracy          = 0.8600
  ROC AUC           = 0.9120
  Average precision = 0.9103

              precision    recall  f1-score   support

           0     0.8854    0.8333    0.8586       102
           1     0.8365    0.8878    0.8614        98

    accuracy                         0.8600       200
   macro avg     0.8610    0.8605    0.8600       200
weighted avg     0.8615    0.8600    0.8600       200



### Saving the Trained Model Artifacts for inference

In [None]:
# Export the evaluated model and the tokenizer vocabulary for inference.
# NOTE(review): torch.save(model, ...) pickles the whole module object rather
# than only its weights. The file names say "news" while the data and the
# fine-tuned checkpoint are named "fin" - confirm the intended naming.
output_model_file = DIRNAME + "models/pytorch_distilbert_news.bin"
output_vocab_file = DIRNAME + "models/vocab_distilbert_news.bin"

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

('drive/My Drive/Colab Notebooks/models/vocab_distilbert_news.bin',)

In [None]:
# Weights-only export (state_dict) of the same model.
torch.save(model.state_dict(), DIRNAME + "models/pytorch_distilbert_news2.bin")