In [36]:
!pip install transformers==3

Collecting transformers==3
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[K     |████████████████████████████████| 754 kB 5.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 42.5 MB/s 
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 36.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.1 MB/s 
Installing collected packages: tokenizers, sentencepiece, sacremoses, transformers
Successfully installed sacremoses-0.0.46 sentencepiece-0.1.96 tokenizers-0.8.0rc4 transformers-3.0.0


In [39]:
from transformers import BertTokenizer, BertModel
 
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, Dataset

## Dataset
Data range from 2008 to 2016 and the data frame 2000 to 2008 was scrapped from yahoo finance. 

Labels are based on the Dow Jones Industrial Average stock index.

* Class 1 – The stock price increased.
* Class 0 – The stock price stayed the same or decreased.

In [42]:
df = pd.read_csv('/content/stock-stentiment-news.csv', usecols = ['Text','Date', 'Label'],)

In [43]:
df.head()

Unnamed: 0,Text,Date,Label
0,A 'hindrance to operations': extracts from the...,2000-01-03,0
1,Scorecard The best lake scene Leader: German s...,2000-01-04,0
2,Coventry caught on counter by Flo United's riv...,2000-01-05,0
3,Pilgrim knows how to progress Thatcher facing ...,2000-01-06,1
4,Hitches and Horlocks Beckham off but United su...,2000-01-07,1


## BERT Tokenizer

The `tokenizer.encode_plus` function combines multiple steps for us:

1. Split the sentence into tokens.
2. Add the special `[CLS]` and `[SEP]` tokens.
3. Map the tokens to their IDs.
4. Pad or truncate all sentences to the same length.
5. Create the `attention masks` which explicitly differentiate real tokens from `[PAD]` tokens.


**Reference**
* Utilities for Tokenizers `encode_plus()`: [Docs](https://huggingface.co/transformers/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus)

In [40]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

## Set the Device

In [41]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [44]:
class StockSentiment(Dataset):
    def __init__(self, mode, filepath, tokenizer, max_len=256):
        assert mode in ['train', 'val']
        
        self.mode = mode
        self.df = pd.read_csv(filepath, usecols = ['Text','Date', 'Label'])
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.len = len(self.df)
        self.train_len = int(self.len * 0.8)

        if mode == 'train':
            self.df = self.df[: self.train_len]
            print('train size:', len(self.df))
        else: 
            self.df = self.df[self.train_len:]
            print('validation size:', len(self.df))
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        
        text = self.df.Text.str.replace(r'(<.*\/>)', '').iloc[idx]
        
        label = self.df.Label.iloc[idx]
        
        inputs = self.tokenizer.encode_plus(
            text=text,
            text_pair=None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.float)
        }

## Datasets & DataLoader

In [47]:
train_dataset = StockSentiment('train', '/content/stock-stentiment-news.csv', tokenizer)
test_dataset = StockSentiment('val', '/content/stock-stentiment-news.csv', tokenizer)

train_dataloader = DataLoader(train_dataset, 16, shuffle=True)
test_dataloader = DataLoader(test_dataset, 16, shuffle=True)

train size: 3120
validation size: 781


## BERT Model

In [51]:
class BERTModel(torch.nn.Module):
    def __init__(self, dropout_p=0.3):
        super(BERTModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(dropout_p)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, output = self.bert(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output = self.dropout(output)
        output = self.classifier(output)
        return output

In [52]:
model = BERTModel()
model.to(device)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [53]:
loss_func = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(
    model.parameters(), 
    lr=1e-4
)

# Save & Load Checkpoint Function

In [59]:
def load_ckp(checkpoint_fpath, model, optimizer):
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

In [58]:
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    f_path = checkpoint_path
    torch.save(state, f_path)
    if is_best:
        best_fpath = best_model_path
        shutil.copyfile(f_path, best_fpath)

## Train & Validation Function

In [62]:
def train(model, train_loader, optimizer, loss_func, device):
    train_loss = 0
    model.train()
    
    for b_idx, data in enumerate(train_loader):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        targets = data['targets'].to(device)

        outputs = model(ids, mask, token_type_ids)
        
        optimizer.zero_grad()
        loss = loss_func(outputs, targets.unsqueeze(1))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    
    return train_loss/len(train_loader.sampler)

In [60]:
def validation(model, valid_loader, optimizer, loss_func, device):
    valid_loss = 0
    with torch.no_grad():
        for b_idx, data in enumerate(valid_loader):
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            targets = data['targets'].to(device)
            
            outputs = model(ids, mask, token_type_ids)

            loss = loss_func(outputs, targets.unsqueeze(1))
            valid_loss += loss.item()
    
    return valid_loss/len(valid_loader.sampler)

## Train Model

In [63]:
min_valid_loss = None
checkpoint_path = './current_checkpoint.pt'
best_model_path = './best_model.pt'

for epoch in range(10):
    train_loss = train(model, train_dataloader, optimizer, loss_func, device)
    valid_loss = validation(model, test_dataloader, optimizer, loss_func, device)
    
    print('Epoch: {} \n\t - Avgerage Training Loss: {:.6f} \n\t - Average Validation Loss: {:.6f}'.format(
            epoch + 1, 
            train_loss,
            valid_loss
    ))

    if min_valid_loss is None:
        min_valid_loss = train_loss
    
    if valid_loss < min_valid_loss:
        # create checkpoint variable and add important data
        checkpoint = {
                'epoch': epoch + 1,
                'valid_loss_min': valid_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
        }
        
        print('** Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(min_valid_loss, valid_loss))
        
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        min_valid_loss = valid_loss

Epoch: 1 
	 - Avgerage Training Loss: 0.043848 
	 - Average Validation Loss: 0.043926
Epoch: 2 
	 - Avgerage Training Loss: 0.043827 
	 - Average Validation Loss: 0.043950
Epoch: 3 
	 - Avgerage Training Loss: 0.043544 
	 - Average Validation Loss: 0.043651
** Validation loss decreased (0.043848 --> 0.043651).  Saving model ...
Epoch: 4 
	 - Avgerage Training Loss: 0.044907 
	 - Average Validation Loss: 0.044243
Epoch: 5 
	 - Avgerage Training Loss: 0.044219 
	 - Average Validation Loss: 0.044551
Epoch: 6 
	 - Avgerage Training Loss: 0.044122 
	 - Average Validation Loss: 0.044310
Epoch: 7 
	 - Avgerage Training Loss: 0.044089 
	 - Average Validation Loss: 0.044173
Epoch: 8 
	 - Avgerage Training Loss: 0.044157 
	 - Average Validation Loss: 0.043872
Epoch: 9 
	 - Avgerage Training Loss: 0.044001 
	 - Average Validation Loss: 0.043790
Epoch: 10 
	 - Avgerage Training Loss: 0.044329 
	 - Average Validation Loss: 0.044503
