In [2]:
import torch

In [71]:
import torch.nn as nn

In [136]:
from torch.utils.data import Dataset,DataLoader,random_split

In [142]:
GPU = "cuda" if torch.cuda.is_available() else "cpu"

In [54]:
import random
import os
from tqdm import tqdm
from typing import *
from collections import defaultdict
import math
import string
from nltk.stem import PorterStemmer

import numpy as np

from predeal_dataset import *

In [204]:
EMB_DIM = 50

EPOCH = 100
BATCH_SIZE = 10 
LR = 0.002
L2 = 0.0001


In [145]:
def log_text_to_file(text:str)->None:
    with open("./output/log.txt","a") as fout:
        fout.write(text+"\n")

In [3]:
def load_GLoVe_embedding(dim:int=50)->Dict[str,List[float]]:
    res = dict()
    error_cnt = 0
    with open(f"./glove.6B/glove.6B.{dim}d.txt",encoding="utf-8") as fin:
        lines = fin.readlines()
        for line in tqdm(lines):
            elements = line.split()
            if len(elements)!=dim+1:
                error_cnt+=1
                continue
            word = elements[0]
            vector = list(map(float,elements[1:]))
            res[word]=vector
        print(f"{error_cnt} lines are discarded")
    return res

glove_emb_50d = load_GLoVe_embedding(EMB_DIM)

100%|██████████| 400000/400000 [00:10<00:00, 38963.76it/s]


0 lines are discarded


In [40]:
puncts = string.punctuation
stemmer = PorterStemmer()
def tokenize(sentence:str,use_stemmer:bool=True)->List[str]:
    # sentence = sentence.replace("(","( ").replace("[","[ ").replace("{","{ ")
    for p in puncts:
        sentence = sentence.replace(p,p+" ")
    res = list()
    tmp = sentence.split()
    for word in tmp:
        if len(word)==0:
            continue
        if word=="...":
            res.append(word)
            continue        
        if word[-1] in puncts:
            p = word[-1]
            word = word[:-1]
            if len(word)>0:
                word = word.lower()
                if use_stemmer:
                    word = stemmer.stem(word)
                res.append(word)
            res.append(p)
        else:
            word = word.lower()
            if use_stemmer:
                word = stemmer.stem(word)
            res.append(word)
    return res

In [36]:
spoiler_dataset = sample_sub_spoiler_set(SUBSET_SENTENCE_CNT)

In [41]:
word_cnt = defaultdict(int)

for datum in tqdm(spoiler_dataset):
    sentence = datum['review_sentence']
    for word in tokenize(sentence,False):
        word_cnt[word]+=1

100%|██████████| 50000/50000 [00:05<00:00, 8509.67it/s]


In [50]:
DICTIONARY_SIZE = 5000
SEPERATOR_SIGN = "seperatorsign"
UNKNOWN_WORD = "unknownword"

word_with_freq = list(word_cnt.items())
word_with_freq.sort(key=lambda tup:tup[1],reverse=True)
dictionary = word_with_freq[:]
dictionary = list(map(lambda tup:tup[0],dictionary))
dictionary = list(filter(lambda word:word in glove_emb_50d,dictionary))
dictionary = dictionary[:DICTIONARY_SIZE-2]
dictionary.append(SEPERATOR_SIGN)
dictionary.append(UNKNOWN_WORD)

word2id = {word:i for i,word in enumerate(dictionary)}

In [51]:
embedding = list(glove_emb_50d[word] for word in dictionary[:-2])

In [57]:
def norm(vector:List[float])->float:
    sq_sum = 0.0
    for x in vector:
        sq_sum+=x**2
    return sq_sum**0.5

emb_avg_norm = np.average([norm(v) for v in embedding])

In [63]:
def gen_random_emb():    
    res = np.random.randn(EMB_DIM)
    return (emb_avg_norm/norm(res))*res

In [67]:
embedding.append(gen_random_emb())
embedding.append(gen_random_emb())

In [69]:
embedding_tensor = torch.FloatTensor(embedding)

  embedding_tensor = torch.FloatTensor(embedding)


In [74]:
unk_id = word2id[UNKNOWN_WORD]

def parse_id_sequence(sentence):
    res = list()
    words = tokenize(sentence,False)
    for word in words:
        if word not in word2id:
            res.append(unk_id)
        else:
            res.append(word2id[word])
    return res

In [206]:
class BiLSTMClassifierWithPretrainedEmbedding(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim:int=2*EMB_DIM, output_dim:int=2, n_layers:int=2):
        super().__init__()
        embedding_dim = embedding_matrix.size(1)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, 
                            bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2+1, output_dim)
        

    def forward(self, text):
        embedded = self.embedding(text)
        l = len(embedded)
        l_tensor = torch.FloatTensor([l]).to(GPU)
        outputs, (hidden, cell) = self.lstm(embedded)
        outputs = outputs[-1,:]
        outputs_with_l = torch.concat((outputs,l_tensor))
        dense_outputs = self.fc(outputs_with_l)
        return dense_outputs

In [144]:
model = BiLSTMClassifierWithPretrainedEmbedding(embedding_tensor)

In [154]:
model.to("cpu")
input_ids = parse_id_sequence("can can need new new")
input_tensor = torch.LongTensor(input_ids)
input_tensor = input_tensor.reshape((-1,1))
loss_func = nn.CrossEntropyLoss()
output = model(input_tensor)
print(output)
loss_func(output,torch.LongTensor([1])[0])

tensor([ 0.0126, -0.0409], grad_fn=<AddBackward0>)


tensor(0.7203, grad_fn=<NllLossBackward0>)

In [138]:
class ListDataset(Dataset):
    def __init__(self,*lists) -> None:
        super().__init__()
        if len(lists)==0:
            raise ValueError("Expecting at least one list")
        l = len(lists[0])
        for i,li in enumerate(lists):
            if not isinstance(li,(list,tuple,np.ndarray,torch.Tensor)):
                raise ValueError(f"expecting input to be list,tuple,numpy-array or torch's tensor, actually get {type(li)} at {i}-th argument")
            if len(li)!=l:
                raise ValueError(f"length of {i}-th argument is {len(li)}, length of 0-th argument is {l}, they don't match")
        self.lists = lists
        self.l = l
    
    def __len__(self):
        return self.l
    
    def __getitem__(self, index) -> Any:
        return tuple(map(lambda l:l[index],self.lists))

def get_spoiler_dataset(spoiler_dataset_raw:List[dict])->Dataset:
    xs = list()
    ys = list()
    for datum in spoiler_dataset_raw:
        xs.append(datum['review_sentence'])
        ys.append(datum['label'])
    return ListDataset(xs,ys)

spoiler_dataset_processed = get_spoiler_dataset(spoiler_dataset)

In [140]:
train_dataset, valid_dataset, test_dataset = random_split(spoiler_dataset_processed, [TRAIN_SET_CNT, VALID_SET_CNT, TEST_SET_CNT])

In [198]:
def get_performance_info(y_actual,y_predict):
    y_actual = np.array(y_actual)
    y_predict = np.array(y_predict)
    y_actual = y_actual.reshape((-1,))
    y_predict = y_predict.reshape((-1,))
    TP = np.sum((y_actual == 1) & (y_predict == 1))
    FP = np.sum((y_actual == 0) & (y_predict == 1))
    TN = np.sum((y_actual == 0) & (y_predict == 0))
    FN = np.sum((y_actual == 1) & (y_predict == 0))
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    TNR = TN / (TN + FP)
    FNR = FN / (TP + FN)
    BER = 1 - (0.5 * (TPR + TNR))
    accu = np.sum(y_actual==y_predict)/len(y_actual)
    return accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER

e = 2.718281828

def get_best_ths_with_ber(pred_prop_with_label): 
    pred_prop_with_label.sort(reverse=True)
    valid_set_pos_cnt = sum(tup[1] for tup in pred_prop_with_label)
    valid_set_neg_cnt = len(pred_prop_with_label)-valid_set_pos_cnt
    best_ths = 1.0
    best_ber = 0.5
    curr_false_positive = 0
    curr_false_negative = valid_set_pos_cnt
    for (prob,label) in pred_prop_with_label:
        ths = prob-0.00001
        if label==1:
            curr_false_negative-=1
        else:
            curr_false_positive+=1
        ber = 0.5*(curr_false_negative/valid_set_pos_cnt+curr_false_positive/valid_set_neg_cnt)
        if ber<best_ber:
            best_ber = ber
            best_ths = ths
    return best_ths,best_ber

def evaluate_dynamic_prob_ths(model,dataset):
    model.to(GPU)
    e = 2.718281828
    y_pred_logits = list()
    y_label = list()
    with torch.no_grad():
        for b_x,b_y in tqdm(dataset):
            input_ids = torch.LongTensor(parse_id_sequence(b_x)).to(GPU)
            output = model(input_ids).to("cpu").detach()
            y_label.append(b_y)
            y_pred_logits.append(output.numpy().tolist())
    y_pos_prob_pred = list(map(lambda logits:e**logits[1]/(e**logits[0]+e**logits[1]),y_pred_logits))
    pred_prob_with_label = list(zip(y_pos_prob_pred,y_label))
    judging_ths,ber = get_best_ths_with_ber(pred_prob_with_label)
    y_pred = list(int(p>judging_ths) for p in y_pos_prob_pred)
    return *get_performance_info(y_label,y_pred),judging_ths


In [172]:
model = BiLSTMClassifierWithPretrainedEmbedding(embedding_tensor)

# model.fc

evaluate_dynamic_prob_ths(model,valid_dataset)

  0%|          | 0/5000 [00:00<?, ?it/s]

100%|██████████| 5000/5000 [00:19<00:00, 261.96it/s]


(0.8102,
 51,
 696,
 4000,
 253,
 0.16776315789473684,
 0.14821124361158433,
 0.8517887563884157,
 0.8322368421052632,
 0.4902240428584237,
 0.49586474531550173)

In [181]:
class ShuffledDataset(Dataset):
    def __init__(self,dataset) -> None:
        super().__init__()
        self.dataset = dataset
        indices = list(range(len(dataset)))
        random.shuffle(indices)
        self.indices = indices

    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, index) -> Any:
        return self.dataset[self.indices[index]]

In [208]:
model = BiLSTMClassifierWithPretrainedEmbedding(embedding_tensor)
model.to(GPU)

pos_cnt = sum(y for (x,y) in train_dataset)
neg_cnt = len(train_dataset)-pos_cnt
weights = [pos_cnt/len(train_dataset),neg_cnt/len(train_dataset)]
weights = torch.tensor(weights, dtype=torch.float).to(GPU)
loss_func = torch.nn.CrossEntropyLoss(weights)
weight_params = [param for name, param in model.named_parameters() if 'weight' in name]
bias_params = [param for name, param in model.named_parameters() if 'bias' in name]
optimizer = torch.optim.Adam([
    {'params': weight_params, 'weight_decay': L2},
    {'params': bias_params, 'weight_decay': 0.0}  
], lr=LR/BATCH_SIZE) 
# optimizer = torch.optim.Adam(model.parameters(),LR/BATCH_SIZE,weight_decay=L2)

# data_loader = DataLoader(train_dataset,batch_size=1,shuffle=True)
log_text_to_file("start training bilstm model , dataset is review_text only")
for e in range(EPOCH):
    for step,(b_x,b_y) in enumerate(tqdm(ShuffledDataset(train_dataset))):
        input_ids = torch.LongTensor(parse_id_sequence(b_x)).to(GPU)
        output = model(input_ids)
        b_y = torch.LongTensor([b_y])[0].to(GPU)
        loss = loss_func(output,b_y)
        if (step+1)%BATCH_SIZE==0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER,ths = evaluate_dynamic_prob_ths(model,valid_dataset)
    print(f"ber after epoch {e+1}: {BER}")
    msg = "    %7s%7s%7s%7s%7s%7s%7s\n     %.4f %.4f %.4f %.4f %.4f %.4f %.4f"%("accu","ber","tpr","fpr","tnr","fnr","ths",accu,BER,TPR,FPR,TNR,FNR,ths)
    print(msg)
    save_path = f"./output/review_only_lstm_e{e}_b{BATCH_SIZE}_lr{LR}_l2{L2}_ber{BER:.4f}"
    torch.save(model.state_dict(),save_path)
    log_text_to_file(f"    ber after epoch {e+1}: {BER}, model params saved to {save_path}, other performance infos:{accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER}")
    log_text_to_file(msg)

100%|██████████| 40000/40000 [03:02<00:00, 219.35it/s]
100%|██████████| 5000/5000 [00:19<00:00, 257.01it/s]


ber after epoch 1: 0.36216152604680363
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6260 0.3622 0.6513 0.3756 0.6244 0.3487 0.0438


100%|██████████| 40000/40000 [03:03<00:00, 217.54it/s]
100%|██████████| 5000/5000 [00:19<00:00, 256.75it/s]


ber after epoch 2: 0.3543161032905945
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.5136 0.3543 0.7961 0.5047 0.4953 0.2039 0.0510


100%|██████████| 40000/40000 [03:03<00:00, 217.88it/s]
100%|██████████| 5000/5000 [00:19<00:00, 254.91it/s]


ber after epoch 3: 0.36173563166860934
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6268 0.3617 0.6513 0.3748 0.6252 0.3487 0.0634


100%|██████████| 40000/40000 [03:01<00:00, 220.54it/s]
100%|██████████| 5000/5000 [00:19<00:00, 258.89it/s]


ber after epoch 4: 0.36184210526315796
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6266 0.3618 0.6513 0.3750 0.6250 0.3487 0.0377


100%|██████████| 40000/40000 [03:01<00:00, 220.15it/s]
100%|██████████| 5000/5000 [00:19<00:00, 257.25it/s]


ber after epoch 5: 0.36198500403478884
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.5252 0.3620 0.7664 0.4904 0.5096 0.2336 0.0269


100%|██████████| 40000/40000 [03:01<00:00, 220.97it/s]
100%|██████████| 5000/5000 [00:19<00:00, 259.34it/s]


ber after epoch 6: 0.36129852954362063
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6334 0.3613 0.6447 0.3673 0.6327 0.3553 0.0476


100%|██████████| 40000/40000 [02:58<00:00, 224.58it/s]
100%|██████████| 5000/5000 [00:19<00:00, 251.77it/s]


ber after epoch 7: 0.360020846409038
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6358 0.3600 0.6447 0.3648 0.6352 0.3553 0.0545


100%|██████████| 40000/40000 [02:57<00:00, 224.90it/s]
100%|██████████| 5000/5000 [00:19<00:00, 260.09it/s]


ber after epoch 8: 0.359973213485161
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6330 0.3600 0.6480 0.3680 0.6320 0.3520 0.0712


100%|██████████| 40000/40000 [03:03<00:00, 217.48it/s]
100%|██████████| 5000/5000 [00:19<00:00, 251.64it/s]


ber after epoch 9: 0.3594408455124182
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6340 0.3594 0.6480 0.3669 0.6331 0.3520 0.0424


100%|██████████| 40000/40000 [03:01<00:00, 220.15it/s]
100%|██████████| 5000/5000 [00:19<00:00, 256.38it/s]


ber after epoch 10: 0.360233793598135
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6354 0.3602 0.6447 0.3652 0.6348 0.3553 0.0628


100%|██████████| 40000/40000 [03:01<00:00, 220.86it/s]
100%|██████████| 5000/5000 [00:19<00:00, 256.06it/s]


ber after epoch 11: 0.3606120550524523
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6318 0.3606 0.6480 0.3693 0.6307 0.3520 0.1554


100%|██████████| 40000/40000 [03:01<00:00, 220.90it/s]
100%|██████████| 5000/5000 [00:19<00:00, 254.62it/s]


ber after epoch 12: 0.359227898323321
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6344 0.3592 0.6480 0.3665 0.6335 0.3520 0.0350


100%|██████████| 40000/40000 [03:00<00:00, 222.19it/s]
100%|██████████| 5000/5000 [00:18<00:00, 263.33it/s]


ber after epoch 13: 0.36034026719268364
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6352 0.3603 0.6447 0.3654 0.6346 0.3553 0.0956


100%|██████████| 40000/40000 [02:56<00:00, 227.24it/s]
100%|██████████| 5000/5000 [00:19<00:00, 262.58it/s]


ber after epoch 14: 0.3607185286470008
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6316 0.3607 0.6480 0.3695 0.6305 0.3520 0.0597


100%|██████████| 40000/40000 [02:55<00:00, 227.67it/s]
100%|██████████| 5000/5000 [00:18<00:00, 264.22it/s]


ber after epoch 15: 0.3607661615708777
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6344 0.3608 0.6447 0.3663 0.6337 0.3553 0.1100


100%|██████████| 40000/40000 [02:55<00:00, 227.67it/s]
100%|██████████| 5000/5000 [00:19<00:00, 261.43it/s]


ber after epoch 16: 0.3596425849547207
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6394 0.3596 0.6414 0.3607 0.6393 0.3586 0.0731


100%|██████████| 40000/40000 [02:56<00:00, 227.05it/s]
100%|██████████| 5000/5000 [00:19<00:00, 262.48it/s]


ber after epoch 17: 0.35843214830090553
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.4972 0.3584 0.8059 0.5228 0.4772 0.1941 0.0440


100%|██████████| 40000/40000 [02:55<00:00, 227.81it/s]
100%|██████████| 5000/5000 [00:19<00:00, 262.20it/s]


ber after epoch 18: 0.359743454675872
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.4774 0.3597 0.8257 0.5451 0.4549 0.1743 0.0582


100%|██████████| 40000/40000 [02:56<00:00, 226.42it/s]
100%|██████████| 5000/5000 [00:19<00:00, 262.16it/s]


ber after epoch 19: 0.3600796870797095
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6328 0.3601 0.6480 0.3682 0.6318 0.3520 0.0870


100%|██████████| 40000/40000 [02:56<00:00, 226.05it/s]
100%|██████████| 5000/5000 [00:19<00:00, 259.99it/s]


ber after epoch 20: 0.3605728279386712
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.4874 0.3606 0.8125 0.5336 0.4664 0.1875 0.0375


100%|██████████| 40000/40000 [02:56<00:00, 227.08it/s]
100%|██████████| 5000/5000 [00:19<00:00, 262.48it/s]


ber after epoch 21: 0.35529678113512064
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.5002 0.3553 0.8092 0.5198 0.4802 0.1908 0.0352


100%|██████████| 40000/40000 [02:57<00:00, 225.50it/s]
100%|██████████| 5000/5000 [00:19<00:00, 261.80it/s]


ber after epoch 22: 0.35957814041065184
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.4806 0.3596 0.8224 0.5415 0.4585 0.1776 0.0317


100%|██████████| 40000/40000 [02:56<00:00, 226.78it/s]
100%|██████████| 5000/5000 [00:19<00:00, 262.30it/s]


ber after epoch 23: 0.35747108401327
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.5828 0.3575 0.7105 0.4255 0.5745 0.2895 0.0747


100%|██████████| 40000/40000 [02:56<00:00, 227.00it/s]
100%|██████████| 5000/5000 [00:18<00:00, 264.74it/s]


ber after epoch 24: 0.3474457545055142
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.5554 0.3474 0.7632 0.4580 0.5420 0.2368 0.0269


100%|██████████| 40000/40000 [02:56<00:00, 227.04it/s]
100%|██████████| 5000/5000 [00:19<00:00, 260.77it/s]


ber after epoch 25: 0.35121155742849464
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.5830 0.3512 0.7237 0.4261 0.5739 0.2763 0.0472


  6%|▌         | 2223/40000 [00:10<03:03, 205.37it/s]


KeyboardInterrupt: 