In [17]:
import warnings
import string
import joblib
import multiprocessing
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import defaultdict
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import BertTokenizer
from transformers import BertModel
from torch import nn
from torch.nn import functional as F

# Suppress every warning for the rest of the session: no category or module
# filter is given, so deprecation notices from the libraries below are hidden too.
warnings.filterwarnings("ignore")

In [3]:
# Read the three serialized splits in train / valid / test order.
# NOTE: joblib.load unpickles its input, so these files must be trusted.
train, valid, test = map(
    joblib.load,
    (
        "../data/train_top25_v3.bin",
        "../data/valid_top25_v3.bin",
        "../data/test_top25_v3.bin",
    ),
)

# Keep only the rows whose `content` entry has a non-zero length.
train, valid, test = (
    split[split["content"].map(len) > 0] for split in (train, valid, test)
)

In [5]:
# Preview the first rows of the filtered training split.
train.head()

Unnamed: 0_level_0,content,news_count,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2012-01-11,[Jan 4 (Reuters) - Apple Inc has appointed Ado...,1,29.16,29.39,28.96,29.32,3306300,0.0,0.0,ADBE,1
2012-01-20,[SAN FRANCISCO (Reuters) - In the summer of 20...,1,30.28,30.55,30.18,30.5,4091200,0.0,0.0,ADBE,1
2012-01-23,[SAN FRANCISCO (Reuters) - In the summer of 20...,1,30.33,30.78,29.98,30.23,5378200,0.0,0.0,ADBE,0
2012-01-24,[SAN FRANCISCO (Reuters) - In the summer of 20...,1,30.17,31.0,30.08,30.95,4715800,0.0,0.0,ADBE,1
2012-01-25,[SAN FRANCISCO (Reuters) - In the summer of 20...,1,30.59,31.4,30.51,31.34,5459800,0.0,0.0,ADBE,1


In [4]:
# Checkpoint name shared by the tokenizer below and by ReutersClassifier,
# which reads this module-level constant when it loads its encoder.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
# Tokenizer loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [7]:
# Tokenize the first text of the first training row as a smoke test.
# `.iloc[0]` selects the row by position: the frame preview shows a `date`
# index, so an integer passed to plain `[]` is not an actual label and only
# works through pandas' deprecated positional fallback.
# `padding="max_length"` together with `truncation=True` is the current
# spelling of the deprecated `pad_to_max_length=True`; the result is still
# exactly `max_length` tokens long.
sample = tokenizer.encode_plus(
    train["content"].iloc[0][0],
    max_length=32,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt")
sample

{'input_ids': tensor([[  101,  5553,  1018,  1006, 26665,  1007,  1011,  6207,  4297,  2038,
          2805, 18106,  3001,  4297,  1521,  1055,  6927, 28774,  2229,  2072,
          2000,  2132,  2049, 24264,  2094,  4684,  1011,  6475,  2449,  1010,
         22950,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
# Length of the `content` entry of the row at position 100. `.iloc` makes the
# positional lookup explicit instead of relying on integer-key fallback.
len(train["content"].iloc[100])

6

In [20]:
class ReutersDataset(Dataset):
    """Dataset yielding the tokenized texts and the label of one frame row.

    Args:
        df: Frame with a ``content`` column (an iterable of texts per row)
            and a ``label`` column.
        tokenizer: Object exposing a Hugging Face style ``encode_plus``.
        max_len: Length every encoded text is padded / truncated to.

    NOTE(review): rows may hold different numbers of texts, so batches with
    ``batch_size > 1`` may not collate uniformly -- confirm how the default
    collate function treats ragged ``ids_and_mask`` lists.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        # Positional array of labels, aligned with the row order of ``df``.
        self.targets = df.label.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, item):
        """Return the encodings and target for the row at position ``item``.

        Returns:
            dict with ``"ids_and_mask"`` (one encoding per text in the row,
            each holding 1-D ``input_ids`` and ``attention_mask`` tensors)
            and ``"target"`` (the row label as a tensor).
        """
        # Samplers hand out positions in ``range(len(self))``; ``.iloc`` keeps
        # the lookup positional even when the frame's index is not 0..n-1
        # (for example after filtering rows, or with a date index).
        contents = self.df["content"].iloc[item]
        target = self.targets[item]

        enc_list = []
        for content in contents:
            enc = self.tokenizer.encode_plus(
                str(content),
                max_length=self.max_len,
                add_special_tokens=True,
                return_token_type_ids=False,
                # Current replacement for the deprecated pad_to_max_length=True.
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_tensors="pt")
            # Drop the leading batch axis added by return_tensors="pt".
            enc["input_ids"] = enc["input_ids"].flatten()
            enc["attention_mask"] = enc["attention_mask"].flatten()
            enc_list.append(enc)

        return {
            "ids_and_mask": enc_list,
            "target": torch.tensor(target)
        }


def create_dataloader(df, tokenizer, max_len, batch_size, shuffle=True):
    """Wrap ``df`` in a ``ReutersDataset`` and return a ``DataLoader`` over it.

    Items are grouped ``batch_size`` at a time and loaded in the main process
    (``num_workers=0``); ``shuffle`` is forwarded unchanged to the loader.
    """
    reuters_dataset = ReutersDataset(df=df, tokenizer=tokenizer, max_len=max_len)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "num_workers": 0,
    }
    return DataLoader(reuters_dataset, **loader_options)

In [22]:
# Build the training loader (texts encoded to 32 tokens, 4 rows per batch)
# and display the first batch it yields.
train_dataloader = create_dataloader(
    df=train,
    tokenizer=tokenizer,
    max_len=32,
    batch_size=4,
)
next(iter(train_dataloader))

{'ids_and_mask': [{'input_ids': tensor([[  101,  2624,  3799,  1010,  5553,  1019,  1006, 26665,  1007,  1011,
             9130,  4297,  3734,  1037,  2194,  2008,  3084,  2376,  5038,  2974,
             2005,  4929,  3085,  5733,  1998,  4274,  1011,  4198, 22449,  1010,
             1996,   102],
           [  101,  2624,  3799,  1006, 26665,  1007,  1011,  1062,  6038,  3654,
             4297,  5766,  2928,  9231,  7874,  2056,  9857,  2002,  3464, 15705,
             1997, 19920,  2004,  4600,  1999,  4684,  2399,  2004,  2002,  2038,
             1999,   102],
           [  101,  2047,  2259,  1010,  5553,  1022,  1006, 26665,  1007,  1011,
             1057,  1012,  1055,  1012,  7027, 13753,  5157,  3333,  2058,  1996,
             2197,  2048,  3134,  2004, 10216,  7597,  3123,  1010,  3040, 11522,
             2056,   102],
           [  101,  1008,  2117,  1011,  4284, 16565,  1013,  3745,  1002,  1014,
             1012,  6640,  5443,  9765,  1002,  1014,  1012,  6421,  1

In [8]:
class ReutersClassifier(nn.Module):
    """Pre-trained encoder followed by dropout and a sigmoid-activated linear head.

    Args:
        n_classes: Number of output units of the linear head.
        p: Dropout probability applied to the encoder feature.
    """

    def __init__(self, n_classes, p=0.25):
        super().__init__()
        # Encoder checkpoint comes from the module-level PRE_TRAINED_MODEL_NAME.
        self.bert_layer = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.dropout = nn.Dropout(p=p)
        self.classifier = nn.Linear(self.bert_layer.config.dim, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return sigmoid scores, one row per input sequence.

        The feature fed to the head is the first element of the encoder's
        output, sliced at sequence position 0.
        """
        encoder_output = self.bert_layer(
            input_ids=input_ids,
            attention_mask=attention_mask)
        first_token = self.dropout(encoder_output[0][:, 0, :])
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
        return torch.sigmoid(self.classifier(first_token))

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
torch.cuda.empty_cache()
# Module.to() moves the parameters and returns the same module object.
model = ReutersClassifier(n_classes=1).to(device)
print("Model built!")

Model built!


In [9]:
# Run the tokenized sample through `model`, moving both tensors to `device` first,
# and display the result.
probabilty = model(sample["input_ids"].to(device), sample["attention_mask"].to(device))
probabilty

tensor([[0.3549]], device='cuda:0', grad_fn=<SigmoidBackward>)

In [37]:
class ReutersLinearClassifierV2(nn.Module):
    """Encoder applied to each text of a row, summed, then a two-layer head.

    Every element of ``ids_and_mask`` is encoded with the same DistilBERT
    weights; the per-text features are added together and passed through
    ``fc`` -> ReLU -> dropout -> ``classifier``.

    Args:
        n_classes: Number of output logits.
        p: Dropout probability used for both dropout layers.
    """

    def __init__(self, n_classes, p):
        super().__init__()
        self.PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
        self.distilbert_layer = AutoModel.from_pretrained(self.PRE_TRAINED_MODEL_NAME)
        self.dropout = nn.Dropout(p=p)
        self.fc_dropout = nn.Dropout(p=p)
        hidden_dim = self.distilbert_layer.config.dim
        self.fc = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, n_classes)

    def forward(self, ids_and_mask):
        """Return logits for a batch of rows.

        Args:
            ids_and_mask: Non-empty sequence of mappings, each providing
                ``"input_ids"`` and ``"attention_mask"`` tensors for one text
                slot of the batch.

        Raises:
            ValueError: If ``ids_and_mask`` is empty.
        """
        if len(ids_and_mask) == 0:
            raise ValueError("ids_and_mask must contain at least one encoding")

        pool_list = []
        for enc in ids_and_mask:
            encoder_output = self.distilbert_layer(
                input_ids=enc["input_ids"],
                attention_mask=enc["attention_mask"])
            # First output element, sliced at sequence position 0.
            pool_list.append(self.dropout(encoder_output[0][:, 0, :]))

        # Accumulate over ALL texts. The previous loop re-added each branch to
        # a fresh zero vector, so only the last text ever reached the head.
        # Stacking also keeps the result on the encoder outputs' device rather
        # than relying on a module-level ``device`` variable.
        summed = torch.stack(pool_list, dim=0).sum(dim=0)
        hidden = self.fc(summed)
        return self.classifier(self.fc_dropout(F.relu(hidden)))

    def freeze_bert_encoder(self):
        """Stop gradient updates for every encoder parameter."""
        for param in self.distilbert_layer.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradient updates for every encoder parameter."""
        for param in self.distilbert_layer.parameters():
            param.requires_grad = True
            
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
torch.cuda.empty_cache()
# Two-logit variant with dropout 0.1; Module.to() returns the same module object.
model = ReutersLinearClassifierV2(n_classes=2, p=0.1).to(device)
print("Model built!")

Model built!


In [38]:
# Forward every training batch through the model; nothing is accumulated, so
# after the loop `outputs` / `preds` hold the values of the last batch only.
for i, data in enumerate(tqdm(train_dataloader)):
    # Move each text slot's tensors to the model's device (updates the batch in place).
    for d in data["ids_and_mask"]:
        d["input_ids"] = d["input_ids"].to(device)
        d["attention_mask"] = d["attention_mask"].to(device)
    targets = data["target"].to(device)

    outputs = model(data["ids_and_mask"])
    # Normalise across the class axis explicitly; calling softmax without
    # `dim` is deprecated and only guesses the axis.
    outputs = F.softmax(outputs, dim=1)
    _, preds = torch.max(outputs, dim=1)

HBox(children=(FloatProgress(value=0.0, max=1084.0), HTML(value='')))




In [41]:
# Softmax output left in `outputs` by the most recent run of the forward loop.
outputs

tensor([[0.5026, 0.4974]], device='cuda:0', grad_fn=<SoftmaxBackward>)