In [1]:
import sys
import time
import warnings
import logging
import config
import random
import math
import torch
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModel
from torch import nn
from torch.nn import functional as F
from sklearn.metrics import (accuracy_score, roc_curve, auc)
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

In [2]:
from deep_portfolio import ReutersClassifier

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = ReutersClassifier(n_classes=2, top_k=10, p=0.1, window_size=3, out_channels=64)
model.load_state_dict(torch.load("../weights/2020-08-07_cnn_distilbert.bin"))
model.to(device)

ReutersClassifier(
  (distilbert_layer): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [3]:
from collections import Counter
test_data = joblib.load("../data/test_top10_v2.bin")
print(test_data.index.min(), test_data.index.max())
print(Counter(test_data.label))
test_data.head()

2017-01-03 2020-07-01
Counter({1: 10165, 0: 8777})


Unnamed: 0_level_0,Top 1 News,Top 2 News,Top 3 News,Top 4 News,Top 5 News,Top 6 News,Top 7 News,Top 8 News,Top 9 News,Top 10 News,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-01-04,apple confirms $1 bln investment in softbank t...,update 2-apple confirms $1 bln investment in s...,apple confirms $1 billion investment in softba...,apple pulls new york times app from itunes sto...,,,,,,,110.11,110.73,110.01,110.27,21118100.0,0.0,0.0,AAPL,0
2017-01-05,brief-apple says app store generated over $20 ...,update 1-apple is app store generated $20 bln ...,apple is app store generated $20 billion for d...,rpt-update 3-apple pulls new york times apps i...,update 3-apple pulls new york times apps in ch...,apple pulls new york times apps in china after...,india reluctant to give special tax incentives...,"apple plans first retail store in s.korea, pos...",,,110.17,111.07,110.07,110.83,22193600.0,0.0,0.0,AAPL,1
2017-01-06,brief-apple inc is ceo tim cook is total 2016 ...,canada is competition watchdog closes two-year...,,,,,,,,,110.99,112.3,110.7,112.07,31751900.0,0.0,0.0,AAPL,1
2017-01-10,china is wechat seeks slice of apple is app st...,china is wechat seeks slice of apple is app st...,tesla taps apple engineer for autopilot software,,,,,,,,112.88,113.46,112.44,113.21,24462100.0,0.0,0.0,AAPL,1
2017-01-12,u.s. appeals court revives antitrust lawsuit a...,,,,,,,,,,113.01,113.39,112.35,113.34,27086200.0,0.0,0.0,AAPL,0


In [4]:
from collections import Counter
len(dict(Counter(test_data.ticker)))

99

In [5]:
from deep_portfolio import progressbar
from deep_portfolio import create_dataloader
from transformers import AutoTokenizer


PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
test_dataloader = create_dataloader(test_data, tokenizer, 32, 10, 16, shuffle=False)
model = model.eval()
pred_list = []
with torch.no_grad():
    for data in progressbar(test_dataloader):
        for d in data["ids_and_mask"]:
            d["input_ids"] = d["input_ids"].to(device)
            d["attention_mask"] = d["attention_mask"].to(device)

        outputs = model(data["ids_and_mask"])
        outputs = F.softmax(outputs)
        pred_list.extend(outputs)

[############################################################] 100%


In [6]:
pred_list_cpu = [pred.to("cpu").tolist() for pred in pred_list]
pred_list_final = [pred[1] for pred in pred_list_cpu]
pred_list_final = [pred for pred in pred_list_final]
test_data["pred"] = pred_list_final
final_df = test_data[["ticker", "pred"]]

In [7]:
final_df

Unnamed: 0_level_0,ticker,pred
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-04,AAPL,0.545896
2017-01-05,AAPL,0.541777
2017-01-06,AAPL,0.542884
2017-01-10,AAPL,0.550854
2017-01-12,AAPL,0.546736
...,...,...
2020-03-17,NOW,0.546936
2020-04-21,NOW,0.546802
2020-04-30,NOW,0.546659
2020-05-05,NOW,0.547015


In [8]:
df_total = pd.DataFrame()
for ticker in list(set(final_df.ticker)):
    # Create an empty dataframe from 2017-01-01 to 2020-06-30 (1277*99=126423)
    df_test = final_df[final_df["ticker"] == ticker]
    index = pd.date_range(start="2017-01-01", end="2020-06-30", freq='D')
    df_test = pd.concat(
        [df_test.reindex(index, columns=["ticker"], fill_value=ticker), 
         df_test.reindex(index, columns=["pred"], method=None)], axis=1)
    fill_na = random.uniform(df_test["pred"].mean(), df_test["pred"].median())
    df_test["pred"] = df_test["pred"].fillna(fill_na)
    df_total = pd.concat([df_total, df_test], axis=0)
df_total

Unnamed: 0,ticker,pred
2017-01-01,AAPL,0.543642
2017-01-02,AAPL,0.543642
2017-01-03,AAPL,0.543642
2017-01-04,AAPL,0.545896
2017-01-05,AAPL,0.541777
...,...,...
2020-06-26,WMT,0.545994
2020-06-27,WMT,0.545994
2020-06-28,WMT,0.545994
2020-06-29,WMT,0.547059


In [9]:
joblib.dump(df_total, "../data/ticker_prediction.npy")

['../data/ticker_prediction.npy']

In [10]:
joblib.load("../data/ticker_prediction.npy")

Unnamed: 0,ticker,pred
2017-01-01,AAPL,0.543642
2017-01-02,AAPL,0.543642
2017-01-03,AAPL,0.543642
2017-01-04,AAPL,0.545896
2017-01-05,AAPL,0.541777
...,...,...
2020-06-26,WMT,0.545994
2020-06-27,WMT,0.545994
2020-06-28,WMT,0.545994
2020-06-29,WMT,0.547059
