In [1]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [4]:
NUM_EPOCHS = 3

In [5]:
url = ("https://github.com/rasbt/"
"machine-learning-book/raw/"
"main/ch08/movie_data.csv.gz")

filename = url.split("/")[-1]

with open(filename, "wb") as f:
    r = requests.get(url)
    f.write(r.content)

with gzip.open('movie_data.csv.gz', 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [6]:
df = pd.read_csv(filename)
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [7]:
df.count

<bound method DataFrame.count of                                                   review  sentiment
0      In 1974, the teenager Martha Moxley (Maggie Gr...          1
1      OK... so... I really like Kris Kristofferson a...          0
2      ***SPOILER*** Do not read this, if you think a...          0
3      hi for all the people who have seen this wonde...          1
4      I recently bought the DVD, forgetting just how...          0
...                                                  ...        ...
49995  OK, lets start with the best. the building. al...          0
49996  The British 'heritage film' industry is out of...          0
49997  I don't even know where to begin on this one. ...          0
49998  Richard Tyler is a little boy who is scared of...          0
49999  I waited long to watch this movie. Also becaus...          1

[50000 rows x 2 columns]>

In [8]:
train_texts = df.iloc[:35000]['review'].values
train_labels = df.iloc[:35000]['sentiment'].values

valid_texts = df.iloc[35000:40000]['review'].values
valid_labels = df.iloc[35000:40000]['sentiment'].values

test_texts = df.iloc[40000:]['review'].values
test_labels = df.iloc[40000:]['sentiment'].values

In [9]:
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings  = tokenizer(list(test_texts), truncation=True, padding=True)

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 199kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.07MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 4.69MB/s]
config.json: 100%|██████████| 483/483 [00:00<00:00, 3.47MB/s]


In [14]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
            for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        
        return item
    
    def __len__(self):
        return len(self.labels)

In [15]:
train_dataset = IMDbDataset(train_encodings, train_labels)
valid_dataset = IMDbDataset(valid_encodings, valid_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)


In [16]:
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=16, shuffle=True)

valid_loader = torch.utils.data.DataLoader(
valid_dataset, batch_size=16, shuffle=False)

test_loader = torch.utils.data.DataLoader(
test_dataset, batch_size=16, shuffle=False)

In [17]:
model = DistilBertForSequenceClassification.from_pretrained(
'distilbert-base-uncased')
model.to(DEVICE)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

model.safetensors: 100%|██████████| 268M/268M [00:04<00:00, 59.9MB/s] 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():
        correct_pred, num_examples = 0, 0
        for batch_idx, batch in enumerate(data_loader):
            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = \
            batch['attention_mask'].to(device)

            labels = batch['labels'].to(device)
            outputs = model(input_ids,attention_mask=attention_mask)
            
            logits = outputs['logits']
            
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            
            correct_pred += \
            (predicted_labels == labels).sum()
        return correct_pred.float()/num_examples * 100

In [19]:
start_time = time.time()
for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        
        ### Forward pass
        outputs = model(input_ids,
        attention_mask=attention_mask,
        labels=labels)
        loss, logits = outputs['loss'], outputs['logits']
        
        ### Backward pass
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if not batch_idx % 250:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d}'
            f' | Batch'
            f'{batch_idx:04d}/'
            f'{len(train_loader):04d} | '
            f'Loss: {loss:.4f}')
            model.eval()
            
    with torch.set_grad_enabled(False):
        print(f'Training accuracy: ' \
        f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
        f'\nValid accuracy: ' \
        f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')


print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0003 | Batch0000/2188 | Loss: 0.7000
Epoch: 0001/0003 | Batch0250/2188 | Loss: 0.2762
Epoch: 0001/0003 | Batch0500/2188 | Loss: 0.3582
Epoch: 0001/0003 | Batch0750/2188 | Loss: 0.0129
Epoch: 0001/0003 | Batch1000/2188 | Loss: 0.0873
Epoch: 0001/0003 | Batch1250/2188 | Loss: 0.0143
Epoch: 0001/0003 | Batch1500/2188 | Loss: 0.2263
Epoch: 0001/0003 | Batch1750/2188 | Loss: 0.1311
Epoch: 0001/0003 | Batch2000/2188 | Loss: 0.1856
Training accuracy: 96.50%
Valid accuracy: 92.02%
Time elapsed: 12.61 min
Epoch: 0002/0003 | Batch0000/2188 | Loss: 0.0497
Epoch: 0002/0003 | Batch0250/2188 | Loss: 0.0840
Epoch: 0002/0003 | Batch0500/2188 | Loss: 0.0934
Epoch: 0002/0003 | Batch0750/2188 | Loss: 0.0089
Epoch: 0002/0003 | Batch1000/2188 | Loss: 0.0790
Epoch: 0002/0003 | Batch1250/2188 | Loss: 0.2908
Epoch: 0002/0003 | Batch1500/2188 | Loss: 0.1807
Epoch: 0002/0003 | Batch1750/2188 | Loss: 0.2112
Epoch: 0002/0003 | Batch2000/2188 | Loss: 0.5125
Training accuracy: 98.70%
Valid accuracy: 92.