In [1]:
import warnings
import logging
import joblib
import config
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)


from collections import defaultdict
from model import ReutersClassifier
from transformers import AutoTokenizer
from transformers import AutoModel
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch import nn
from torch.nn import functional as F
from preprocessor import load_data

I0716 14:14:55.108985  4752 file_utils.py:39] PyTorch version 1.2.0 available.
I0716 14:14:59.116026  4752 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at C:\Users\YangWang/.cache\torch\transformers\a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.8949e27aafafa845a18d98a0e3a88bc2d248bbc32a1b75947366664658f23b1c
I0716 14:14:59.118988  4752 configuration_utils.py:321] Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}

I0716 14:15:00.081022  4752 token

In [2]:
train = joblib.load("../data/train.bin")
valid = joblib.load("../data/valid.bin")

In [3]:
train.head()

Unnamed: 0_level_0,Top 1 News,Top 2 News,Top 3 News,Open,High,Low,Close,Volume,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-08-16,life maladroid,google dials pocketing motorola,,1.44373,1.44664,1.4354,1.43804,265049,0
2011-08-26,facebook facing music google calling tune?,,,1.43878,1.45012,1.43303,1.44985,215354,1
2011-09-08,google want zagat?,,,1.40803,1.40887,1.38733,1.38891,231356,0
2011-09-13,tech wrap: intel google launch android partner...,,,1.36381,1.37386,1.35585,1.36819,289282,1
2011-09-14,"facebook google: say circles, say smart lists","connecting facebook, friendship longer required",,1.36819,1.37816,1.35907,1.37478,281384,1


In [4]:
train.label.unique()

array([0, 1], dtype=int64)

In [5]:
class ReutersDataset(Dataset):
    def __init__(self, df, tokenizer, max_len, top_k):
        self.df = df
        self.targets = df.label.values
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.top_k = top_k
        self.len = len(self.df)

    def __len__(self):
        return self.len

    def __getitem__(self, item):
        total_top = self.df.iloc[item, 0:self.top_k].values
        target = self.targets[item]
        
        enc_list = []
        for k in total_top:
            enc = self.tokenizer.encode_plus(
                str(k),
                max_length=self.max_len,
                add_special_tokens=True,
                return_token_type_ids=False,
                pad_to_max_length=True,
                return_attention_mask=True,
                return_tensors="pt")
            enc["input_ids"] = enc["input_ids"].flatten()
            enc["attention_mask"] = enc["attention_mask"].flatten()
            enc_list.append(enc)


        return {
            "ids_and_mask": enc_list,
            "target": torch.tensor(target)
        }

In [6]:
def create_dataloader(df, tokenizer, max_len, top_k, batch_size):
    dataset = ReutersDataset(
        df=df,
        tokenizer=tokenizer,
        max_len=max_len, 
        top_k=top_k)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0)

In [7]:
class ReutersClassifier(nn.Module):

    def __init__(self, n_classes, top_k, p=0.25):
        super(ReutersClassifier, self).__init__()
        self.PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
        self.distilbert_layer = AutoModel.from_pretrained(self.PRE_TRAINED_MODEL_NAME)
        self.dropout = nn.Dropout(p=p)
        self.classifier = nn.Linear(self.distilbert_layer.config.dim*top_k, n_classes)

    def forward(self, ids_and_mask):
        pool_list = []
        for enc in ids_and_mask:
            pooled_output = self.distilbert_layer(
                input_ids=enc["input_ids"],
                attention_mask=enc["attention_mask"])
            branch = self.dropout(pooled_output[0][:, 0, :])
            pool_list.append(branch)
        main = torch.cat([br for br in pool_list], 1)
        return F.softmax(self.classifier(main))

    def freeze_bert_encoder(self):
        for param in self.distilbert_layer.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.distilbert_layer.parameters():
            param.requires_grad = True

In [8]:
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()
model = ReutersClassifier(n_classes=2, top_k=3)
model.to(device)

I0716 14:15:02.702307  4752 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at C:\Users\YangWang/.cache\torch\transformers\a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.8949e27aafafa845a18d98a0e3a88bc2d248bbc32a1b75947366664658f23b1c
I0716 14:15:02.704278  4752 configuration_utils.py:321] Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}

I0716 14:15:03.589309  4752 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface

ReutersClassifier(
  (distilbert_layer): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [9]:
# data_loader = create_dataloader(train, tokenizer, max_len=32, top_k=3, batch_size=8)
# for data in data_loader:
#     for d in data["ids_and_mask"]:
#         d["input_ids"] = d["input_ids"].to(device)
#         d["attention_mask"] = d["attention_mask"].to(device)
#     targets = data["target"].to(device)

#     with torch.no_grad():
#         outputs = model(data["ids_and_mask"])
#     print(outputs)

In [10]:
def matthews_correlation_coefficient(true_pos, true_neg, false_pos, false_neg):
    nominator = (true_pos*true_neg-false_pos*false_neg)
    denominator = np.sqrt((true_pos+false_pos)*(true_pos+false_neg)*(true_neg+false_pos)*(true_neg+false_neg)) + 1e-7
    return (nominator / denominator)

def train_distilbert(model, data_loader, loss_function, optimizer, device, scheduler, n_examples):
    model = model.train()

    losses = []
    correct_predictions = 0
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0

    for data in data_loader:
        for d in data["ids_and_mask"]:
            d["input_ids"] = d["input_ids"].to(device)
            d["attention_mask"] = d["attention_mask"].to(device)
        targets = data["target"].to(device)

        outputs = model(data["ids_and_mask"])
        _, preds = torch.max(outputs, dim=1)
        loss = loss_function(outputs, targets)

        for p, t in zip(preds, targets):
            if p == 1 and t == 1:
                true_pos += 1
            if p == 0 and t == 0:
                true_neg += 1
            if p == 1 and t == 0:
                false_pos += 1
            if p == 0 and t == 1:
                false_neg += 1

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        for d in data["ids_and_mask"]:
            d["input_ids"] = d["input_ids"].to("cpu")
            d["attention_mask"] = d["attention_mask"].to("cpu")
        targets = data["target"].to("cpu")

    recall = float(true_pos) / float(true_pos + false_neg + 1e-7)
    precision = float(true_pos) / float(true_pos + false_pos + 1e-7)
    f1 = 2 * precision * recall / (precision + recall + 1e-7)
    accuracy = (true_pos + true_neg) / float(n_examples)
    mcc = matthews_correlation_coefficient(true_pos, true_neg, false_pos, false_neg)

    return accuracy, f1, mcc, np.mean(losses)


def eval_distilbert(model, data_loader, loss_function, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0

    with torch.no_grad():
        for data in data_loader:
            for d in data["ids_and_mask"]:
                d["input_ids"] = d["input_ids"].to(device)
                d["attention_mask"] = d["attention_mask"].to(device)
            targets = data["target"].to(device)

            outputs = model(data["ids_and_mask"])
            _, preds = torch.max(outputs, dim=1)
            loss = loss_function(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            for p, t in zip(preds, targets):
                if p == 1 and t == 1:
                    true_pos += 1
                if p == 0 and t == 0:
                    true_neg += 1
                if p == 1 and t == 0:
                    false_pos += 1
                if p == 0 and t == 1:
                    false_neg += 1
                    
            for d in data["ids_and_mask"]:
                d["input_ids"] = d["input_ids"].to("cpu")
                d["attention_mask"] = d["attention_mask"].to("cpu")
            targets = data["target"].to("cpu")

    recall = float(true_pos) / float(true_pos + false_neg + 1e-7)
    precision = float(true_pos) / float(true_pos + false_pos + 1e-7)
    f1 = 2 * precision * recall / (precision + recall + 1e-7)
    accuracy = (true_pos + true_neg) / float(n_examples)
    mcc = matthews_correlation_coefficient(true_pos, true_neg, false_pos, false_neg)

    return accuracy, f1, mcc, np.mean(losses)

In [11]:
train_dataloader = create_dataloader(train, tokenizer, max_len=32, top_k=3, batch_size=8)
valid_dataloader = create_dataloader(valid, tokenizer, max_len=32, top_k=3, batch_size=8)

EPOCH = 2
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.417, correct_bias=False)
total_steps = len(train_dataloader) * EPOCH
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)
loss_function = nn.CrossEntropyLoss().to(device)

history = defaultdict(list)
best_f1 = 0

for epoch in range(EPOCH):
    print("=" * 20)
    print("Epoch {}/{}".format(epoch + 1, EPOCH))

    train_acc, train_f1, train_mcc, train_loss = train_distilbert(
        model, train_dataloader, loss_function, optimizer, device, scheduler, len(train))

    print("Train | Loss: {:.4f} | Accuracy: {:.4f} | F1: {:.4f} | MCC: {:.4f}".format(
        train_loss, train_acc, train_f1, train_mcc))

    val_acc, val_f1, val_mcc, val_loss = eval_distilbert(
        model, valid_dataloader, loss_function, device, len(valid))

    print("Valid | Loss: {:.4f} | Accuracy: {:.4f} | F1: {:.4f} | MCC: {:.4f}".format(
        val_loss, val_acc, val_f1, val_mcc))

    history["train_f1"].append(train_f1)
    history["train_mcc"].append(train_mcc)
    history["train_acc"].append(train_acc)
    history["train_loss"].append(train_loss)
    history["val_f1"].append(val_f1)
    history["val_mcc"].append(val_mcc)
    history["val_acc"].append(val_acc)
    history["val_loss"].append(val_loss)

    if val_f1 > best_f1:
        torch.save(model.state_dict(), "../weights/distilbert_test.bin")
        best_f1 = val_f1

Epoch 1/2
Train | Loss: 0.7025 | Accuracy: 0.4963 | F1: 0.5098 | MCC: -0.0063
Valid | Loss: 0.6968 | Accuracy: 0.4967 | F1: 0.0000 | MCC: 0.0000
Epoch 2/2
Train | Loss: 0.6940 | Accuracy: 0.5207 | F1: 0.3808 | MCC: 0.0383
Valid | Loss: 0.6929 | Accuracy: 0.4915 | F1: 0.0251 | MCC: -0.0400


In [18]:
import yfinance as yf
import pandas as pd

def load_stock(ticker_name, start_date):
    ticker = yf.Ticker(ticker_name)
    hist = ticker.history(period="max", start=start_date)
    hist.index = hist.index.set_names(['date'])
    hist = hist.reset_index(drop=False, inplace=False)
    hist["date"] = pd.to_datetime(hist["date"], utc=True)
    hist['date'] = hist['date'].apply(lambda x: x.date())
    hist.sort_values(by='date', inplace=True)
    hist.reset_index(drop=True, inplace=True)
    hist["label"] = hist["Close"].diff(periods=1)
    hist.dropna(inplace=True)
    hist["label"] = hist["label"].map(lambda x: 1 if float(x) >= 0 else 0)
    return hist

In [19]:
hist = load_stock("MSFT", start_date="2012-01-01")
hist

Unnamed: 0,date,Open,High,Low,Close,Volume,Dividends,Stock Splits,label
1,2012-01-04,22.00,22.53,21.97,22.48,80516100,0.0,0,1
2,2012-01-05,22.46,22.75,22.39,22.71,56081400,0.0,0,1
3,2012-01-06,22.58,23.12,22.58,23.06,99455500,0.0,0,1
4,2012-01-09,23.01,23.05,22.74,22.76,59706800,0.0,0,0
5,2012-01-10,22.91,23.09,22.76,22.84,60014400,0.0,0,1
...,...,...,...,...,...,...,...,...,...
2142,2020-07-09,216.33,216.38,211.47,214.32,33121700,0.0,0,1
2143,2020-07-10,213.62,214.08,211.08,213.67,26177600,0.0,0,0
2144,2020-07-13,214.48,215.80,206.50,207.07,38135600,0.0,0,0
2145,2020-07-14,206.13,208.85,202.03,208.35,37591800,0.0,0,1


In [21]:
sp500_file = "../data/sp500tickers.pkl"
with open(sp500_file, 'rb') as f:
    tickers = pickle.load(f)
tickers

['MMM',
 'ABT',
 'ABBV',
 'ABMD',
 'ACN',
 'ATVI',
 'ADBE',
 'AMD',
 'AAP',
 'AES',
 'AFL',
 'A',
 'APD',
 'AKAM',
 'ALK',
 'ALB',
 'ARE',
 'ALXN',
 'ALGN',
 'ALLE',
 'ADS',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'ABC',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'ANTM',
 'AON',
 'AOS',
 'APA',
 'AIV',
 'AAPL',
 'AMAT',
 'APTV',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'BKR',
 'BLL',
 'BAC',
 'BK',
 'BAX',
 'BDX',
 'BRK.B',
 'BBY',
 'BIIB',
 'BLK',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BF.B',
 'CHRW',
 'COG',
 'CDNS',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'CNC',
 'CNP',
 'CTL',
 'CERN',
 'CF',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CTXS',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CMA',
 'CAG',
 'CXO'