In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from IPython.display import clear_output
from sklearn.utils import shuffle
from tensorboardX import SummaryWriter
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from tqdm import tqdm
from transformers import BertModel, BertTokenizer


In [None]:
# Model id handed to both BertTokenizer.from_pretrained and BertModel.from_pretrained.
BERT_PATH = 'bert-base-uncased'
# Every text is padded / truncated to this many tokens by the tokenizer.
MAX_LEN = 64
# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BZ = 128    # DataLoader batch size
WD = 1e-9   # weight_decay passed to Adam
LR = 1e-5   # learning rate passed to Adam
EPS = 50    # number of training epochs
TMAX = 15   # T_max of the CosineAnnealingLR schedule
# Run tag: used as the SummaryWriter comment and as the checkpoint file name.
COMMENT = f"lr{LR}-B{BZ}-EPS{EPS}-TOU{TMAX}"

In [None]:
# Load the dataset; later cells read its "content" and "score" columns.
merge = pd.read_csv("../data/tmp.csv")
# NOTE(review): `index_col=0` was left disabled here - confirm the CSV has no index column.
# Seed the shuffle: an unseeded permutation would change the row order on every
# run and thereby undo the fixed random_state used by the train/valid/test split.
merge = shuffle(merge, random_state=42)
ds_size = merge.shape[0]

In [None]:
# Preview the first rows only; as the cell's last expression it is rendered as a rich table.
merge.head()

In [None]:
class Stock(Dataset):
    """Dataset pairing each row's numeric score with its tokenized text.

    All tokenization is done once in the constructor; indexing afterwards is a
    plain list lookup.
    """

    def __init__(self, df) -> None:
        """Convert ``df["score"]`` to floats and tokenize ``df["content"]``.

        Args:
            df: frame providing a "score" column and a "content" column.
        """
        self.df = df
        self.score = list(map(float, df["score"]))
        self.tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

        def encode(text):
            # Fixed-length encoding: pad up to MAX_LEN and cut anything longer.
            return self.tokenizer(
                text,
                padding='max_length',
                max_length=MAX_LEN,
                truncation=True,
                return_tensors="pt",
            )

        self.content = [encode(text) for text in df["content"]]

    def __getitem__(self, idx):
        """Return the ``(score, encoded_text)`` pair stored at position ``idx``."""
        target = self.score[idx]
        encoded = self.content[idx]
        return target, encoded

    def __len__(self):
        """Number of rows in the wrapped frame."""
        n_rows = self.df.shape[0]
        return n_rows

In [None]:
# Seeded 80/10/10 split into train / valid / test.  Positional slicing replaces
# np.split, which goes through DataFrame.swapaxes (deprecated in pandas 2.x).
shuffled = merge.sample(frac=1, random_state=42)
train_end, valid_end = int(.8 * ds_size), int(.9 * ds_size)
t0 = shuffled.iloc[:train_end]
t1 = shuffled.iloc[train_end:valid_end]
t2 = shuffled.iloc[valid_end:]
dataset = {x: Stock(s) for x, s in [("train", t0), ("valid", t1), ("test", t2)]}

# Do not request more worker processes than the machine has cores.
n_workers = min(24, os.cpu_count() or 1)
# Only the training data needs reshuffling each epoch; evaluation order stays fixed.
loader = {
    x: DataLoader(dataset[x], batch_size=BZ, num_workers=n_workers, shuffle=(x == "train"))
    for x in ["train", "valid", "test"]
}

In [None]:
# Sanity check: walk every loader once and print how many samples it yielded.
for state in ("train", "valid", "test"):
    sz = sum(len(scores) for scores, _encoded in loader[state])
    print(sz)

In [None]:
class Bert4price(nn.Module):
    """BERT encoder followed by a small MLP that emits one scalar per input."""

    def __init__(self) -> None:
        """Load the pretrained encoder and build the 768 -> 128 -> 16 -> 1 head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        head_layers = [
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Linear(128, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, text, mask):
        """Predict one value per sequence.

        Args:
            text: token ids, passed to the encoder as ``input_ids``.
            mask: passed to the encoder as ``attention_mask``.

        Returns:
            The head's output with axis 1 squeezed away, cast to float64.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element is fed to the head.
        _first, second = self.bert(input_ids=text, attention_mask=mask, return_dict=False)
        prediction = self.fc(second)
        return prediction.squeeze(1).double()

In [None]:
# Build the regressor and move it to the configured device in one step
# (Module.to returns the module itself).
bert = Bert4price().to(DEVICE)

In [None]:
# Mean-squared-error objective.  The variable keeps its original spelling
# because the training cell refers to it by this name.
crierion = nn.MSELoss()

# Adam over all parameters of the model (encoder and head alike).
optimizer = torch.optim.Adam(
    params=bert.parameters(),
    lr=LR,
    weight_decay=WD,
)

# Cosine-annealed learning rate, stepped once per epoch by the training cell.
lr_sch = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=TMAX)

In [None]:
# Train for EPS epochs; each epoch runs one pass over "train" and one over "valid".
# Per-phase mean losses go to TensorBoard, the weights with the lowest validation
# loss are checkpointed, and the current epoch's validation predictions are
# written to ../data/result.csv.
writer = SummaryWriter(comment=COMMENT)
min_loss = 1e10
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs("./pretrained", exist_ok=True)

for epoch in range(EPS):
    # Targets / predictions of every validation batch of this epoch.
    valid_targets, valid_outputs = [], []

    for state in ["train", "valid"]:
        is_train = state == "train"
        # Dropout must be active while training and disabled while validating.
        bert.train(is_train)

        clear_output(wait=True)
        tqdm_bar = tqdm(loader[state])
        tqdm_bar.set_description(f"[{epoch+1}/{EPS}]")
        loss_list = []
        for value, content in tqdm_bar:
            # Drop the size-1 axis at position 1 from BOTH tensors so the
            # encoder receives matching 2-D ids and mask.
            text = content["input_ids"].squeeze(1).to(DEVICE)
            mask = content["attention_mask"].squeeze(1).to(DEVICE)
            value = value.to(DEVICE)

            # No autograd graph is needed (or wanted) during validation.
            with torch.set_grad_enabled(is_train):
                output = bert(text, mask)
                loss = crierion(output, value)
            loss_list.append(loss.item())

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            else:
                valid_targets.append(value.detach().cpu())
                valid_outputs.append(output.detach().cpu())

        if not loss_list:
            # Empty loader: nothing to average, log or checkpoint for this phase.
            continue

        # MSELoss already averages within a batch, so the phase loss is simply
        # the mean of the per-batch values (no extra division by the batch size).
        avg_loss = sum(loss_list) / len(loss_list)
        # Model selection is driven by the validation loss only.
        if not is_train and avg_loss < min_loss:
            min_loss = avg_loss
            torch.save(bert.state_dict(), f"./pretrained/{COMMENT}.pt")
        writer.add_scalar(f"{state}-loss", avg_loss, epoch)

    # End of epoch: advance the LR schedule and dump this epoch's validation results.
    lr_sch.step()
    if valid_targets:
        vnum = torch.cat(valid_targets).numpy()
        onum = torch.cat(valid_outputs).numpy()
        result = np.array([vnum, onum]).T
        valid_result = pd.DataFrame(result, columns = ['ans','output'])
        valid_result.to_csv("../data/result.csv")

# Flush pending TensorBoard events to disk.
writer.close()