In [1]:
import sys 
sys.path.append("./src")

In [2]:
import collections
import logging
import os
import torch
import MeCab
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, WordpieceTokenizer
from pytorch_pretrained_bert.tokenization import load_vocab
from pytorch_transformers import BertPreTrainedModel, BertModel, BertForSequenceClassification, BertConfig
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import pandas as pd
import numpy as np
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
from collections import defaultdict
import sklearn
import sklearn.metrics
import  matplotlib.pyplot as plt
import time
from sklearn.metrics import classification_report
import copy
from multiprocessing import Pool, cpu_count
import requests
import ml_metrics 
import pickle
import sys
import datetime
from dateutil.relativedelta import relativedelta
from torch import functional as F
import copy

In [3]:
from bert_ja import *
from evaluation import *

In [3]:
logging.basicConfig(level=logging.INFO)

In [4]:
tokenizer = BertMeCabTokenizer.from_pretrained('./models/PyTorchPretrainendModel/vocab.txt')

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file ./models/PyTorchPretrainendModel/vocab.txt


In [5]:
vocab_list = list(tokenizer.vocab.keys())

## Load Dataset

In [6]:
train_data_size = 2200
valid_size = 316
test_size = 500
gradient_accumulation_steps = 1
max_epoch = 5
max_seq_length = 512
batch_size = 8

In [7]:
df_data = pd.read_csv("dataset/ticker_list.csv", index_col=0)
df_data_text = pd.read_csv("dataset/text_data.csv")
sector2id = pickle.load(open("dataset/sector2id_dict.pkl", "rb"))
industry2id = pickle.load(open("dataset/industry2id_dict.pkl","rb"))
id2sector = dict(zip(list(sector2id.values()), list(sector2id.keys())))
id2industry = dict(zip(list(industry2id.values()), list(industry2id.keys())))
revised_month_stock_df_dict = pickle.load(open("dataset/revised_month_stock_df_dict.pkl", "rb"))
sec_code2companyname = dict(zip(df_data["ticker"], df_data["company_nm"] ))
use_sec_code_list_rev = np.sort(df_data["ticker"])

In [8]:
use_text_data = [np.array(text_id[1:-1].split(", ")).astype(np.int32) for text_id in df_data_text["text_id"]]
use_label_data = [sector2id[label] for label in df_data["sector17"]]
use_industry_data = [industry2id[label] for label in df_data["sector33"]]

In [9]:
seed = 0
np.random.seed(seed)
random_perm_all = np.random.permutation(len(use_text_data))

In [10]:
use_sec_code_list_rev = np.sort(df_data["ticker"])

In [11]:
similarity_mat_df = pd.read_csv("stock_data/all_similarity_mat.csv",  index_col=0)
similarity_mat_df = similarity_mat_df.T[df_data["ticker"]].T[[str(val) for val in df_data["ticker"]]]

In [12]:
label_num = max(sector2id.values())+ 1
industry_num = max(industry2id.values())+ 1

In [13]:
similarity_inner_mat = np.array(similarity_mat_df)

In [14]:
def out_actual_predicted_list(sentence_representation_all, theme_word_list, pooled_output, use_sec_code_list_eval, th_val = 0):
    actual_list = []
    predicted_list = []
    for word, theme_vector_gpu in zip(theme_word_list, pooled_output):
        theme_vector = theme_vector_gpu.detach().to("cpu").numpy()
        theme_code_set = set(pd.read_csv("stock_themes/" + word + ".tsv", sep = "\t", index_col = 0)["コード"])
        if np.mean(
        [(sec_code in theme_code_set) for sec_code in use_sec_code_list_eval]) <= th_val:
            continue
        #print (word)
        doc2theme_similarity_mat = sklearn.metrics.pairwise.cosine_similarity(sentence_representation_all, theme_vector.reshape(1, -1))
        actual = np.array(range(len(use_sec_code_list_eval)))[[(sec_code in theme_code_set) for sec_code in use_sec_code_list_eval]]
        predicted = np.argsort(doc2theme_similarity_mat[:,0])[-1::-1]
        actual_list.append(list(actual))
        predicted_list.append(list(predicted))
    return actual_list, predicted_list 

In [15]:
train_data_x = np.array(use_text_data)[random_perm_all][0:train_data_size]
train_data_y = np.array(use_label_data)[random_perm_all][0:train_data_size]
train_data_industry = np.array(use_industry_data)[random_perm_all][0:train_data_size]
train_inner_stock_mat = similarity_inner_mat[random_perm_all].T[random_perm_all][:train_data_size, :train_data_size]

In [16]:
valid_data_x = np.array(use_text_data)[random_perm_all][train_data_size: train_data_size+ valid_size]
valid_data_y = np.array(use_label_data)[random_perm_all][train_data_size: train_data_size+ valid_size]
valid_data_industry = np.array(use_industry_data)[random_perm_all][train_data_size: train_data_size+ valid_size]
valid_inner_stock_mat = similarity_inner_mat[random_perm_all].T[random_perm_all][
    train_data_size: train_data_size+ valid_size, train_data_size: train_data_size+ valid_size]

In [17]:
test_data_x = np.array(use_text_data)[random_perm_all][-test_size:]
test_data_y = np.array(use_label_data)[random_perm_all][-test_size:]
test_data_industry = np.array(use_industry_data)[random_perm_all][-test_size:]
test_inner_stock_mat = similarity_inner_mat[random_perm_all].T[random_perm_all][-test_size:, -test_size:]

In [18]:
total_train_examples = len(train_data_x)
test_data_size = len(test_data_y)

In [19]:
write_file_name = "models/save"

In [20]:
bert_config = BertConfig.from_pretrained('./models/finetuned_lm/')
model_state_dict = torch.load('./models/finetuned_lm/pytorch_model.bin')
model = BertForSequenceMeanVec(bert_config, label_num, industry_num, state_dict=model_state_dict)
device = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
model.to(device)

INFO:pytorch_transformers.modeling_utils:loading configuration file ./models/finetuned_lm/config.json
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 32005
}

INFO:pytorch_transformers.modeling_utils:loading configuration file ./models/PyTorchPretrainendModel/config.json
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inter

BertForSequenceMeanVec(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
          

In [21]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        #{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         #'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

num_train_optimization_steps = int(
        total_train_examples / batch_size / gradient_accumulation_steps)
warmup_steps = 0

In [22]:
n_gpu = 2
model.to(device)
if -1 != -1:
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError(
            "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
    model = DDP(model)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [23]:
optimizer = AdamW(optimizer_grouped_parameters)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

In [24]:
loss_fct = CrossEntropyLoss(ignore_index = -1)
loss_mse = MSELoss()

In [25]:
sentence_representation_all = evaluate_model_with_cosine_similarity(
    model, test_data_x, test_data_y, test_data_industry, test_data_size, batch_size)

sector
same:  0.6717599
other:  0.65358084
industry
same:  0.68009657
other:  0.6538535


In [26]:
doc2soc_similarity_mat = sklearn.metrics.pairwise.cosine_similarity(sentence_representation_all)
sort_values = np.argsort(doc2soc_similarity_mat, axis = 1)[:,-1::-1]
    
print ("sector")
for top_n in [1,5,10]:
    print (top_n , ":", np.array([np.array(test_data_y)[sort_values[index]][1:top_n+1 ] ==  test_data_y[index] 
                for index in range(len(test_data_y))]).mean())

print ("industry")
for top_n in [1,5,10]:
    print (top_n , ":", np.array([np.array(test_data_industry)[sort_values[index]][1:top_n+1 ] ==  test_data_industry[index] 
    for index in range(len(test_data_industry))]).mean())

sector
1 : 0.352
5 : 0.2896
10 : 0.2498
industry
1 : 0.276
5 : 0.21
10 : 0.1732


In [27]:
train_data_y_mask = np.array(copy.deepcopy(train_data_y))
train_data_industry_mask = np.array(train_data_industry)

In [28]:
#mask_volume = 1
#train_data_y_mask = np.array(copy.deepcopy(train_data_y))
#train_data_y_mask[:int(len(train_data_y_mask) * mask_volume)] = -1
#train_data_industry_mask = np.array(train_data_industry)
#train_data_industry_mask[:int(len(train_data_y_mask) * mask_volume)] = -1

In [29]:
#train_data_y_mask = np.array(copy.deepcopy(train_data_y))
#train_data_y_mask[(train_data_y_mask % 2) != 0] = -1
#train_data_industry_mask = np.array(train_data_industry)
#train_data_industry_mask[(train_data_y_mask % 2) != 0] = -1

In [30]:
#train_data_y_mask = np.array(copy.deepcopy(train_data_y))
#train_data_y_mask[(train_data_y_mask % 8) != 0] = -1
#train_data_industry_mask = np.array(train_data_industry)
#train_data_industry_mask[(train_data_y_mask % 8) != 0] = -1

In [31]:
set(train_data_y_mask)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}

In [32]:
cos = nn.CosineSimilarity(dim=2, eps=1e-6)

In [33]:
use_sec_code_list_eval = np.array(use_sec_code_list_rev)[random_perm_all][-test_size:]
theme_word_list = [word.replace(".tsv", "") for word in os.listdir("dataset/stock_themes")]

In [34]:
theme_token_list = []
for word in theme_word_list:
    text = "[CLS] " + word +  " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    tokenized_id = tokenizer.convert_tokens_to_ids(tokenized_text)
    theme_token_list.append(tokenized_id)

max_seq_length_company = 8
input_ids_list = []
for tokenized_id in theme_token_list:
    input_array = np.zeros(max_seq_length_company, dtype=np.int)
    input_array[:min(max_seq_length_company, len(tokenized_id))] = tokenized_id[:min(max_seq_length_company, len(tokenized_id))]
    input_ids_list.append(input_array)

In [35]:
sector_token_list = []
for sector_id in np.sort(list(set(train_data_y_mask)))[1:]:
    text = "[CLS] " + id2sector[sector_id] +  " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    tokenized_id = tokenizer.convert_tokens_to_ids(tokenized_text)
    sector_token_list.append(tokenized_id)

max_seq_length_company = 8
input_sector_ids_list = []
for tokenized_id in sector_token_list:
    input_array = np.zeros(max_seq_length_company, dtype=np.int)
    input_array[:min(max_seq_length_company, len(tokenized_id))] = tokenized_id[:min(max_seq_length_company, len(tokenized_id))]
    input_sector_ids_list.append(input_array)

In [36]:
model.module.to("cuda")

BertForSequenceMeanVec(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
          

In [37]:
def train(max_epoch, write_file_name, use_loss_sector = True, use_loss_stock = True, use_sector_nm_loss = True, use_loss_industry = False):
    stock_loss_scale = 0
    max_valid_f1_score = 0
    for epoch in range(max_epoch):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        global_step = 0
        first_time = time.time()
        model.train()
        random_perm = np.random.permutation(train_data_size)
        train_data_x_rand = np.array(train_data_x[0:train_data_size])[random_perm]
        train_data_y_rand = np.array(train_data_y_mask[:train_data_size])[random_perm]
        train_data_y_rand_industry = np.array(train_data_industry_mask[0:train_data_size])[random_perm]
        stock_similarity_random = train_inner_stock_mat[random_perm].T[random_perm]
        for data_index in range(0, len(train_data_x_rand), batch_size):
            loss = 0
            data_batch = train_data_x_rand[data_index:data_index+batch_size]
            doc_batch = [doc for doc in data_batch]
            logits = 0
            stock_logits = 0
            label_batch = np.array(train_data_y_rand[data_index:data_index+batch_size])
            label_batch_industry = train_data_y_rand_industry[data_index:data_index+batch_size]
            input_array_doc = []
            for doc_batch_index, input_ids in enumerate(doc_batch):
                    input_array = np.zeros(max_seq_length, dtype=np.int)
                    input_array[:min(max_seq_length, len(input_ids))] = input_ids[:min(max_seq_length, len(input_ids))]
                    input_array_doc.append(input_array)
            input_ids = torch.LongTensor(np.array(input_array_doc).astype(np.int32))
            label_logits, pooled_output, industry_logits = model(input_ids, 
                                                                                       labels= torch.LongTensor(label_batch), 
                                                                                       label_industry = label_batch_industry,                                                   
                                                                                       stock_vol = 0)


            logits  = label_logits
            industry_logits = industry_logits

            stock_similarity = torch.Tensor(stock_similarity_random[data_index:data_index+batch_size, data_index:data_index+batch_size])
            pooled_output_norm = torch.nn.functional.normalize(pooled_output, p=2, dim=1)
            doc2doc_similarity =  torch.matmul(pooled_output_norm, pooled_output_norm.T)

            loss_stock = loss_mse(doc2doc_similarity[stock_similarity != 0], stock_similarity[stock_similarity != 0].to("cuda"))
            loss_sector = loss_fct(logits,  torch.LongTensor(np.array(train_data_y_rand[data_index:data_index+batch_size])).to("cuda"))
            loss_industry = loss_fct(industry_logits, torch.LongTensor(np.array(train_data_y_rand_industry[data_index:data_index+batch_size])).to("cuda"))

            tr_loss += loss_sector.detach().to("cpu").item()
            if use_loss_sector:
                loss += loss_sector /(32/batch_size)
            if use_loss_industry:
                loss += loss_industry/(32/batch_size)
            if use_loss_stock:
                loss += (loss_stock/(32/batch_size))

            if use_sector_nm_loss:
                if ((((data_index + batch_size) % 32) == 0) or (data_index == (len(train_data_x_rand)-1))):
                    input_ids = torch.LongTensor(np.array(input_sector_ids_list).astype(np.int32))
                    label_sector_logits, pooled_sector_output = model(input_ids, labels= torch.LongTensor(np.sort(list(set(train_data_y_mask)))[1:]),  
                                                            label_industry = None,
                                                            labels_stock =  None, stock_vol = 0)

                    loss_regularize = loss_fct(label_sector_logits,  torch.LongTensor(np.array(np.sort(list(set(train_data_y_mask)))[1:])).to("cuda"))
                    loss += 0.01 * loss_regularize
            

            loss.backward()
            
            if ((((data_index + batch_size) % 32) == 0) or (data_index == (len(train_data_x_rand)-1))):
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1


        pred_label_valid, answer_label_valid, pred_industry_valid, answer_indesutry_valid, tr_loss_valid = evaluate_model(
        model, valid_data_x, valid_data_y, valid_data_industry, valid_size)
        valid_f1_score = sklearn.metrics.f1_score(answer_label_valid, pred_label_valid, average = "macro")
        if valid_f1_score > max_valid_f1_score:
            print ("epoch" + str(epoch ) +": " + str(tr_loss_valid/valid_size), valid_f1_score, " time:", str(time.time()-first_time))
            max_valid_f1_score = valid_f1_score
            #torch.save(model.module.state_dict(), "best_single_save_model")
            #model.module.to("cpu").save_pretrained(write_file_name)
            model.module.to("cuda")
        else:
            continue

In [None]:
train(max_epoch, write_file_name, use_loss_sector = True, use_loss_stock = True, use_sector_nm_loss = True)

epoch0: 0.2906120811836629 0.08699129720418267  time: 222.31745028495789
epoch1: 0.23491379015053374 0.2119137608149115  time: 342.1294662952423
epoch2: 0.19400462885446187 0.39992805313441177  time: 336.18289494514465
