In [1]:
import os
import argparse
import re
from tqdm import tqdm
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [5]:
!pip install transformers > /tmp/null

In [6]:
from transformers import DistilBertTokenizer, DistilBertModel

In [7]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU and
# warn, since training on the CPU is slow.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print('Consider changing to **GPU**; otherwise training is too slow.')

In [8]:
# Read the headlines CSV (path is relative to the notebook's working directory)
# into a DataFrame and preview its first rows.
csv_file_path = 'headlines_sample_50k.csv'
df = pd.read_csv(csv_file_path)
df.head()

Unnamed: 0,title,category
0,WHO: 7 milion dead due to air pollution,m
1,Diane Sawyer steps away in ABC anchor shuffle,e
2,US denies knowledge of 'Heartbleed' bug,t
3,Tesla's New Jersey Sales Back On – For Now,b
4,"Now, Wikipedia to get flu-related data for you",m


In [9]:
# Turn the single-letter category codes into integer class ids and keep the
# fitted encoder (`le`) around for later use.
le = LabelEncoder()
encoded_categories = le.fit_transform(df['category'])
df['label'] = encoded_categories
df.head()

Unnamed: 0,title,category,label
0,WHO: 7 milion dead due to air pollution,m,2
1,Diane Sawyer steps away in ABC anchor shuffle,e,1
2,US denies knowledge of 'Heartbleed' bug,t,3
3,Tesla's New Jersey Sales Back On – For Now,b,0
4,"Now, Wikipedia to get flu-related data for you",m,2


In [10]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(50000, 3)

In [11]:
# Normalize the headline text with the vectorized pandas string methods
# (so `re` is not needed here):
#   1. collapse every run of whitespace into a single space,
#   2. strip leading and trailing whitespace.
whitespace_run = r'\s+'
collapsed_titles = df['title'].str.replace(whitespace_run, ' ', regex=True)
df['title'] = collapsed_titles.str.strip()

In [12]:
# Same clean-up on a plain Python string, this time with `re`:
# collapse whitespace runs first, then trim both ends.
sample_txt = '   hi    there  ! '
collapsed_txt = re.sub(r'\s+', ' ', sample_txt)
collapsed_txt.strip()

'hi there !'

In [13]:
"""
e: entertainment
b: business
t: tech & science
m: medicine & health
""";

In [14]:
# # create a small sample for first pipeline check:
# df = df.sample(frac=.1, random_state=19)
# df = df.reset_index(drop=True)
# df.shape

In [15]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one headline per item.

    Args:
        df: DataFrame with a ``'title'`` column (headline text) and a
            ``'label'`` column (integer class id).
        tokenizer: Object exposing ``encode_plus``; it is asked for PyTorch
            tensors padded/truncated to ``max_len`` and is expected to return
            a mapping with ``'input_ids'`` and ``'attention_mask'`` entries of
            shape ``(1, max_len)``.
        max_len: Fixed sequence length every item is padded/truncated to.

    Attributes:
        classes: Set of the distinct values found in ``df['label']``.
    """

    def __init__(self, df, tokenizer, max_len):
        self.data = df
        self.tokenizer = tokenizer
        self.max_length = max_len
        self.classes = set(df['label'].values)

    def __len__(self):
        """Number of rows (headlines) in the underlying frame."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors.

        Rows are addressed by position (``iloc``), so the frame's index does
        not need to be contiguous (e.g. after a shuffled split).

        Returns:
            dict with ``'ids'`` and ``'mask'`` (each of shape ``(max_len,)``)
            and ``'labels'`` (0-dim ``torch.long`` tensor).
        """
        label = self.data['label'].iloc[idx]
        txt = self.data['title'].iloc[idx]
        inputs = self.tokenizer.encode_plus(
            txt,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # pad to max_length
            truncation=True,
            return_tensors='pt',
        )
        # Drop only the leading batch dim: (1, max_length) => (max_length,).
        # A bare .squeeze() would also remove the sequence dim when
        # max_length == 1 and hand the collate function 0-dim tensors.
        ids = inputs['input_ids'].squeeze(0)
        msk = inputs['attention_mask'].squeeze(0)

        res = {
            # clone().detach() instead of torch.tensor(tensor), which emits a
            # copy-construct UserWarning.
            'ids': ids.clone().detach(),
            'mask': msk.clone().detach(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
        return res


In [16]:
# Tokenizer loaded from the same 'distilbert-base-uncased' checkpoint name that
# the classifier's encoder is loaded from.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')



In [17]:
# Hold out half of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
df_trn, df_tst = train_test_split(df, test_size=.5, random_state=19)
df_trn.shape[0], df_tst.shape[0]

(25000, 25000)

In [18]:
# Wrap each split in a NewsDataset; both share the tokenizer and pad/truncate
# every headline to 128 tokens.
data_trn, data_tst = (
    NewsDataset(split_df, tokenizer, max_len=128)
    for split_df in (df_trn, df_tst)
)

In [19]:
# Number of CPUs reported by the OS (used below to size the DataLoader workers).
# The `multiprocessing` module offers an equivalent:
# multiprocessing.cpu_count()
os.cpu_count()

2

In [20]:
# Small datasets are loaded in the main process; larger ones get one worker
# per CPU. os.cpu_count() may return None when the count cannot be determined,
# which DataLoader does not accept, so fall back to 0 workers in that case.
num_workers = 0 if len(data_trn) < 5e4 else (os.cpu_count() or 0)
loader_params = {'batch_size': 4, 'num_workers': num_workers}

# Shuffle only the training data; keep the test order fixed.
loader_trn = DataLoader(data_trn, shuffle=True, **loader_params)
loader_tst = DataLoader(data_tst, shuffle=False, **loader_params)

In [21]:
# dataiter = iter(loader_trn)
# idx, data = next(dataiter)

In [22]:
class NewsClsDistilBERT(nn.Module):
    """DistilBERT encoder followed by a small two-layer classification head.

    Args:
        n_classes: Number of output classes. When omitted, it is taken from
            the module-level ``data_trn.classes`` (the notebook's original
            behaviour), so ``NewsClsDistilBERT()`` keeps working unchanged.
    """

    def __init__(self, n_classes=None):
        super().__init__()
        if n_classes is None:
            # Backward-compatible default: size the head from the training set.
            n_classes = len(data_trn.classes)
        self.dbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # Read the encoder width from its config instead of hard-coding 768.
        self.fc1 = nn.Linear(self.dbert.config.dim, 128)
        self.dropout = nn.Dropout(.3)
        self.cls = nn.Linear(128, n_classes)

    def forward(self, input_ids, att_msk):
        """Return unnormalized class scores, one row per input sequence.

        Args:
            input_ids: Token ids passed to the encoder as ``input_ids``.
            att_msk: Attention mask passed to the encoder as ``attention_mask``.
        """
        out = self.dbert(input_ids=input_ids, attention_mask=att_msk)
        hidden_state = out[0]
        # Use the hidden state at the first sequence position as the summary.
        pooler = hidden_state[:, 0]
        x = F.relu(self.fc1(pooler))
        x = self.dropout(x)
        return self.cls(x)


In [23]:
def train_1epoch(model, data_loader, optimizer, criterion, epoch, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(ids, mask)`` that returns per-class
            scores with the batch on dim 0.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'ids'``, ``'mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epoch: Unused; kept so existing callers keep working.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss_epoch, acc_epoch)``: mean per-batch loss and accuracy in
        percent over all examples seen.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    tr_loss = 0.0
    n_correct = 0
    n_steps = 0
    n_examples = 0

    model.train()
    for idx, data in enumerate(data_loader, start=1):
        ids = data['ids'].to(device, dtype=torch.long)
        msk = data['mask'].to(device, dtype=torch.long)
        labels = data['labels'].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(ids, msk)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        # Predicted class = index of the largest score; detach so the
        # bookkeeping never touches the autograd graph.
        preds = outputs.detach().argmax(dim=1)
        n_correct += (preds == labels).sum().item()

        n_steps += 1
        n_examples += labels.size(0)

        # Running averages every 1000 batches.
        if idx % 1000 == 0:
            step_loss = tr_loss / n_steps
            step_acc = (n_correct * 100) / n_examples
            print(
                f'\tAfter {n_examples} examples >>>',
                f'\tTrain Loss: {step_loss:.4f} -- ',
                f'\tTrain Acc : {step_acc:.2f}'
            )

    if n_steps == 0:
        raise ValueError('data_loader yielded no batches; nothing to train on.')

    loss_epoch = tr_loss / n_steps
    acc_epoch = (n_correct * 100) / n_examples
    print('-' * 50)
    print(f'Train Loss: {loss_epoch:.4f}')
    print(f'Train Acc: {acc_epoch:.2f}')
    return loss_epoch, acc_epoch


In [24]:
def eval_1epoch(model, data_loader, criterion, epoch, device):
    """Evaluate ``model`` on one pass over ``data_loader`` without gradients.

    The model is switched to eval mode for the duration of the pass, so
    dropout is disabled, and its previous train/eval mode is restored
    afterwards.

    Args:
        model: Module called as ``model(ids, mask)`` that returns per-class
            scores with the batch on dim 0.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'ids'``, ``'mask'`` and ``'labels'`` tensors.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epoch: Unused; kept so existing callers keep working.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss_epoch, acc_epoch)``: mean per-batch loss and accuracy in
        percent over all examples seen.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    n_correct = 0
    running_loss = 0.0
    n_steps = 0
    n_examples = 0

    was_training = model.training
    model.eval()
    try:
        with torch.inference_mode():
            for data in data_loader:
                ids = data['ids'].to(device, dtype=torch.long)
                msk = data['mask'].to(device, dtype=torch.long)
                labels = data['labels'].to(device, dtype=torch.long)

                # Keep the batch dim as is: squeezing the scores would turn
                # a final batch of size 1 into a 1-D tensor and break both
                # the loss and the per-row argmax below.
                outputs = model(ids, msk)
                loss = criterion(outputs, labels)
                running_loss += loss.item()
                preds = outputs.argmax(dim=1)
                n_correct += (preds == labels).sum().item()

                n_steps += 1
                n_examples += labels.size(0)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    if n_steps == 0:
        raise ValueError('data_loader yielded no batches; nothing to evaluate.')

    loss_epoch = running_loss / n_steps
    acc_epoch = (n_correct * 100) / n_examples
    print('-' * 50)
    print(f'Eval Loss: {loss_epoch:.4f}')
    print(f'Eval Acc: {acc_epoch:.2f}\n')
    return loss_epoch, acc_epoch


In [25]:
# Instantiate the classifier, an Adam optimizer over all of its parameters
# (lr=1e-5), and the cross-entropy loss.
# NOTE: `train_eval` picks up `optimizer` and `criterion` from module scope,
# so this cell must run before training starts.
model = NewsClsDistilBERT()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

In [26]:
def train_eval(model, tokenizer, train_loader, test_loader, epochs, device,
               optim=None, loss_fn=None):
    """Run ``epochs`` rounds of training followed by evaluation.

    Args:
        model: Model to train; it is moved to ``device`` first.
        tokenizer: Unused; kept so existing callers keep working.
        train_loader: Batches passed to ``train_1epoch``.
        test_loader: Batches passed to ``eval_1epoch``.
        epochs: Number of train+eval rounds.
        device: Device used for the model and the batches.
        optim: Optimizer to step. Defaults to the module-level ``optimizer``.
        loss_fn: Loss function. Defaults to the module-level ``criterion``.
    """
    # Fall back to the notebook globals so existing calls behave as before.
    optim = optimizer if optim is None else optim
    loss_fn = criterion if loss_fn is None else loss_fn

    model = model.to(device)
    for epoch in range(1, epochs+1):
        print(f'Epoch [{epoch}/{epochs}] =============================')
        train_1epoch(model, train_loader, optim, loss_fn, epoch, device)
        eval_1epoch(model, test_loader, loss_fn, epoch, device)


In [27]:
# One epoch of fine-tuning on the training loader, followed by one evaluation
# pass over the test loader.
train_eval(model, tokenizer,
           loader_trn, loader_tst,
           epochs=1,
           device=device)

	After 4000 examples>>> 	Train Loss: 0.5873 --  	Train Acc : 81.45
	After 8000 examples>>> 	Train Loss: 0.4884 --  	Train Acc : 84.45
	After 12000 examples>>> 	Train Loss: 0.4353 --  	Train Acc : 85.98
	After 16000 examples>>> 	Train Loss: 0.4031 --  	Train Acc : 87.02
	After 20000 examples>>> 	Train Loss: 0.3791 --  	Train Acc : 87.73
	After 24000 examples>>> 	Train Loss: 0.3617 --  	Train Acc : 88.25
--------------------------------------------------
Train Loss: 0.3580
Train Acc: 88.35
--------------------------------------------------
Eval Loss: 0.2649
Eval Acc: 91.51



In [28]:
"""
def main():
    # TODO: drive the run from command-line arguments instead of values
    # hardcoded in the notebook.
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    args = parser.parse_args()

    # Honour --epochs (it was parsed but ignored before).
    # NOTE: --learning_rate is still unused; the optimizer is created at module
    # level with its own learning rate.
    train_eval(model, tokenizer, loader_trn, loader_tst, epochs=args.epochs, device=device)

    # Directory where the trained artifacts are written, read from the
    # SM_MODEL_DIR environment variable.
    output_dir = Path(os.environ['SM_MODEL_DIR'])
    # output_model_file = os.path.join(output_dir, 'pytorch_distilbert_news.bin')
    path_model = output_dir / 'pytorch_distilbert_news.bin'
    path_vocab = output_dir / 'vocab_distilbert_news.bin'

    torch.save(model.state_dict(), path_model)
    tokenizer.save_vocabulary(path_vocab)


if __name__ == '__main__':
    main()

""";