In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from torchsummary import summary
from transformers import BertModel, BertTokenizer
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from tensorboardX import SummaryWriter
from IPython.display import clear_output


In [8]:
BERT_PATH = 'bert-base-uncased'
MAX_LEN = 64
DEVICE = "cuda"
BZ = 128
WD = 1e-9
LR = 1e-5
EPS = 50
TMAX = 15
COMMENT = f"lr{LR}-B{BZ}-EPS{EPS}-TOU{TMAX}"

In [14]:
merge = pd.read_csv("../data/tmp.csv")
# , index_col=0
merge = shuffle(merge)
ds_size = merge.shape[0]

In [15]:
print(merge)

      symbol   datentime                                            content  \
16192   AMZN  2021-11-03  Lannebo Fonder AB Buys Upland Software Inc, Se...   
4971    AMZN  2022-07-24  ARGI Investment Services LLC Purchases 107 Sha...   
24899   AMZN  2021-03-16  UAVS ALERT: Kessler Topaz Meltzer & Check, LLP...   
13349   NFLX  2022-01-11  Blackstone-backed Candle Media acquires Farawa...   
14239   AMZN  2021-12-18  Amazon collaborated with China propaganda arm:...   
...      ...         ...                                                ...   
14142   AAPL  2021-12-20                         SPY Stock: The S&P 500 ETF   
17680   AAPL  2021-10-07  Discovery : BOOM TIME FOR MOONSHINE! ALL-NEW S...   
8147    NFLX  2022-05-14  Top 5 1st Quarter Trades of Winslow Capital Ma...   
10347   NFLX  2022-03-15  Euronet Stock Offers Undervalued Leverage To N...   
833     META  2022-10-19  Silvergate Reports a Fall After Its Delayed St...   

            match  sentiment      score  preprice  

In [10]:
class Stock(Dataset):
    def __init__(self, df) -> None:
        self.df = df
        self.score = [float(score) for score in df["score"]]
        self.tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
        self.content = [self.tokenizer(text,padding='max_length', 
                       max_length = MAX_LEN, 
                       truncation=True,
                       return_tensors="pt") for text in df["content"]]

    def __getitem__(self, idx):
        return self.score[idx], self.content[idx]

    def __len__(self):
        return self.df.shape[0]

In [11]:
t0, t1, t2 = np.split(merge.sample(frac=1, random_state=42), [int(.8*ds_size), int(.9*ds_size)])
dataset = {x: Stock(s) for x, s in [("train", t0), ("valid", t1), ("test", t2)]}
loader = {x: DataLoader(dataset[x], batch_size=BZ, num_workers=24, shuffle=True) 
                                            for x in ["train", "valid", "test"]}

Downloading: 100%|██████████| 232k/232k [00:00<00:00, 293kB/s]  
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 7.25kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 214kB/s]


In [12]:
for state in ["train", "valid", "test"]:
    sz = 0
    for num, t in loader[state]:
        sz += len(num)
    print(sz)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/lucytuan/anaconda3/envs/IDS/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/lucytuan/anaconda3/envs/IDS/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'Stock' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [7]:
class Bert4price(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        self.fc = nn.Sequential(
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Linear(128, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )
    def forward(self, text, mask):
        _, output = self.bert(input_ids=text, attention_mask=mask,return_dict=False)
        output = self.fc(output)
        return torch.squeeze(output, 1).double()

In [8]:
bert = Bert4price()
bert = bert.to(DEVICE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
crierion = nn.MSELoss()
optimizer = torch.optim.Adam(bert.parameters(), lr = LR, weight_decay=WD)
lr_sch = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TMAX)

In [12]:
writer = SummaryWriter(comment=COMMENT)
min_loss = 1e10
for epoch in range(EPS):
    for state in ["train", "valid"]:
        clear_output(wait=True)
        tqdm_bar = tqdm(loader[state])
        tqdm_bar.set_description(f"[{epoch+1}/{EPS}]")
        loss_list = []
        for value, content in tqdm_bar:
            
            text, mask = content["input_ids"].squeeze(1), content["attention_mask"]
            text, mask = text.to(DEVICE), mask.to(DEVICE)
        
            value = value.to(DEVICE)
            output = bert(text, mask)
            loss = crierion(output, value)
            loss_list.append(loss.item())
            if state == "train":
                optimizer.zero_grad() 
                loss.backward()
                optimizer.step()
                
        avg_loss = sum(loss_list) / (len(loss_list)*BZ)
        if avg_loss < min_loss:
            min_loss = avg_loss
            torch.save(bert.state_dict(), f"./pretrained/{COMMENT}.pt")
        writer.add_scalar(f"{state}-loss", avg_loss, epoch)
    else:
        lr_sch.step()
        vnum = value.detach().cpu().numpy()
        onum = output.detach().cpu().numpy()
        result = np.array([vnum, onum]).T
        valid_result = pd.DataFrame(result, columns = ['ans','output'])
        valid_result.to_csv("../data/result.csv")

[100/100]: 100%|██████████| 20/20 [00:02<00:00,  8.13it/s]
