In [2]:
!pip install transformers
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 49.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/PytorchLightning/pytorch-lightning.git@master
  Cloning https://github.com/PytorchLightning/p

In [3]:
# Import required libraries

import pandas as pd
import re
import sklearn
import nltk
from sklearn.model_selection import train_test_split
from google.colab import drive


from transformers import DistilBertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl

In [4]:
# Mount your Google Drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Read SMS message dataset

data = pd.read_csv("gdrive/MyDrive/spam_detection_data.csv")
data = data[["SMS", "label"]] # Take only the SMS and label columns
data.head()

Unnamed: 0,SMS,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# Clean the text

nltk.download("stopwords")
def clean_text(text):
  stopwords = nltk.corpus.stopwords.words('english')
  text = text.lower() # Convert to lower case
  text = re.sub(r'[^\w\s]', '', text) # Remove everything except words
  words = [word for word in text.split() if word not in stopwords] # Remove stopwords
  text = " ".join(words)
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
data["SMS"] = data["SMS"].apply(clean_text)

In [8]:
x = data["SMS"].values
y = data["label"].values

# Split into training and validation sets

train_data, val_data, train_labels, val_labels = train_test_split(x, y)

In [9]:
# Load pre-trained DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [14]:
# Tokenize the SMS messages

train_tokens = tokenizer(list(train_data), return_tensors="pt", padding=True, truncation=True, max_length=64)
val_tokens = tokenizer(list(val_data), return_tensors="pt", padding=True, truncation=True, max_length=64)

In [15]:
# Create lists of tokens

device = "cuda"
trn = [train_tokens["input_ids"].to(device), train_tokens["attention_mask"].to(device), torch.tensor(train_labels).to(device)]
val = [val_tokens["input_ids"].to(device), val_tokens["attention_mask"].to(device), torch.tensor(val_labels).to(device)]

In [16]:
# Dataloader class

BATCH_SIZE = 32
class ClassificationData(pl.LightningDataModule):
    def __init__(self, trn, val):
        super().__init__()

        self.trn = DataLoader(TensorDataset(*trn), batch_size=BATCH_SIZE)
        self.val = DataLoader(TensorDataset(*val), batch_size=BATCH_SIZE)

    def train_dataloader(self): return self.trn
    def val_dataloader(self): return self.val

dls = ClassificationData(trn, val)

In [17]:
# This should return a list of 4 tensors - input_ids, attention_masks, token_type_ids, and labels
next(iter(dls.trn))

[tensor([[  101,  4684,  2340,  ...,     0,     0,     0],
         [  101,  2731,  4826,  ...,     0,     0,     0],
         [  101,  2783,  2877,  ...,     0,     0,     0],
         ...,
         [  101,  9932, 13900,  ...,     0,     0,     0],
         [  101,  2551, 13060,  ...,     0,     0,     0],
         [  101,  2113, 21547,  ...,     0,     0,     0]], device='cuda:0'),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 tensor([1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')]

In [10]:
from transformers import DistilBertModel
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# The DistilBertClassifier class

#import torch
#import pytorch_lightning as pl

class DistilBertClassifier(pl.LightningModule):
    def __init__(self, dropout_p, hid_dim, output_dim):
        super().__init__()
        self.distilbert = distilbert_model
        self.dropout = torch.nn.Dropout(dropout_p)
        self.linear_1 = torch.nn.Linear(hid_dim,hid_dim)
        self.linear_2 = torch.nn.Linear(hid_dim, output_dim)
        self.loss = torch.nn.NLLLoss()

    def forward(self, input_ids, attention_mask):
        x1 = self.distilbert(input_ids, attention_mask=attention_mask)[0]
        x1 = x1[:,0]
        x1 = self.dropout(torch.nn.ReLU()(self.linear_1(x1)))
        output  = torch.log_softmax(self.linear_2(x1), dim = 1)
        return output

    def training_step(self, batch, ix):
        pred = self(batch[0], batch[1])
        loss = self.loss(pred, batch[2].view(-1))
        return loss

    def validation_step(self, batch, ix):
        pred = self(batch[0], batch[1])
        loss = self.loss(pred, batch[2].view(-1))
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)

m = DistilBertClassifier(0.5, 768, 2)

In [18]:
# Train the model

# dls is the object of the dataloader class created in the previous milestone
device = "cuda"
t = pl.Trainer(max_epochs=1, gpus=1)
t.fit(m.to(device), dls)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type            | Params
-----------------------------------------------
0 | distilbert | DistilBertModel | 66.4 M
1 | dropout    | Dropout         | 0     
2 | linear_1   | Linear          | 590 K 
3 | linear_2   | Linear          | 1.5 K 
4 | loss       | NLLLoss         | 0     
-----------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [19]:
# This should print the model architecture
print(m)

DistilBertClassifier(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

In [20]:
# train_tokens and val_tokens were created in the previous milestone by tokenizing the input sentences

    # train_tokens = tokenizer(list(train_data), return_tensors="pt", padding=True, truncation=True, max_length=64)
    # val_tokens = tokenizer(list(val_data), return_tensors="pt", padding=True, truncation=True, max_length=64)

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_tokens, train_labels)
val_dataset = ClassificationDataset(val_tokens, val_labels)

In [21]:
# Train the model

training_args = TrainingArguments(
    output_dir='gdrive/MyDrive/sms_model_today',          # output directory
    num_train_epochs=1,              # total number of training epochs
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Trainer object 

trainer = Trainer(
    model=model,                         
    args=training_args,                 
    train_dataset=train_dataset,        
    eval_dataset=val_dataset             
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

Step,Training Loss
500,0.0887


Saving model checkpoint to gdrive/MyDrive/sms_model_today/checkpoint-500
Configuration saved in gdrive/MyDrive/sms_model_today/checkpoint-500/config.json
Model weights saved in gdrive/MyDrive/sms_model_today/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=523, training_loss=0.08755969568146803, metrics={'train_runtime': 39.5947, 'train_samples_per_second': 105.57, 'train_steps_per_second': 13.209, 'total_flos': 69214215797760.0, 'train_loss': 0.08755969568146803, 'epoch': 1.0})

In [22]:
# Predict probabilities on a validation batch 
val_batch = next(iter(dls.val))

device = "cuda"
m.to(device)

val_pred = m(val_batch[0], val_batch[1]) # m is the model created in previous milestone
val_label = val_pred.data.max(1)[1].cpu().numpy()

val_true = val_batch[2].reshape(BATCH_SIZE).cpu().numpy() # BATCH_SIZE is 32, as initialized in first milestone

In [24]:
# Calculate precision, recall and F1-score

f_score = sklearn.metrics.f1_score(val_true, val_label, average = "macro")
precision_score = sklearn.metrics.precision_score(val_true, val_label)
recall_score = sklearn.metrics.recall_score(val_true, val_label)

print("f_score", f_score)
print("precision_score", precision_score)
print("recall_score", recall_score)

f_score 0.8571428571428571
precision_score 0.6
recall_score 1.0
