In [None]:
# Add ./src to the module search path before the project-local imports below
# (presumably where the bert_ja and evaluation modules live - confirm).
import sys 
sys.path.append("./src")

In [1]:
# Standard library
import collections
import copy
import datetime
import logging
import os
import pickle
import sys
import time
from collections import defaultdict
from multiprocessing import Pool, cpu_count

# Third-party (kept in their original relative order)
import torch
import MeCab
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, WordpieceTokenizer
from pytorch_pretrained_bert.tokenization import load_vocab
# Must stay after the pytorch_pretrained_bert import so that BertModel keeps
# resolving to the pytorch_transformers class, as in the original import order.
from pytorch_transformers import BertPreTrainedModel, BertModel, BertForSequenceClassification, BertConfig
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import pandas as pd
import numpy as np
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
import sklearn
import sklearn.metrics
import  matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import requests
import ml_metrics 
from torch import functional as F
from dateutil.relativedelta import relativedelta

In [2]:
from bert_ja import *
from evaluation import *

In [3]:
# Emit INFO-level log records so library loading messages show up in the cell output.
logging.basicConfig(level=logging.INFO)

In [4]:
# Build the tokenizer from the pretrained model's vocabulary file.
# BertMeCabTokenizer is not defined in this notebook; presumably it is provided
# by the `from bert_ja import *` star import - confirm.
tokenizer = BertMeCabTokenizer.from_pretrained('./models/PyTorchPretrainendModel/vocab.txt')

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file ./models/PyTorchPretrainendModel/vocab.txt


In [5]:
# All vocabulary tokens, in the iteration order of the tokenizer's vocab mapping.
vocab_list = [token for token in tokenizer.vocab]

## Load Dataset

In [6]:
# Split sizes applied to the shuffled sample order:
# the first `train_data_size` samples are used for training,
train_data_size = 2200
# the next `valid_size` samples for validation,
valid_size = 316
# and the last `test_size` samples for testing.
test_size = 500
# The remaining settings are not referenced in the visible part of this notebook;
# presumably they are kept for parity with the training script - confirm.
gradient_accumulation_steps = 1
max_epoch = 5
max_seq_length = 512
batch_size = 8

In [7]:
def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code - only load files produced by
    # this project, never files from an untrusted source.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Ticker master table (one row per company) and the pre-tokenized text ids.
df_data = pd.read_csv("dataset/ticker_list.csv", index_col=0)
df_data_text = pd.read_csv("dataset/text_data.csv")

# Label name -> integer id mappings and their inverses.
sector2id = _load_pickle("dataset/sector2id_dict.pkl")
industry2id = _load_pickle("dataset/industry2id_dict.pkl")
id2sector = {sector_id: sector for sector, sector_id in sector2id.items()}
id2industry = {industry_id: industry for industry, industry_id in industry2id.items()}

revised_month_stock_df_dict = _load_pickle("revised_month_stock_df_dict.pkl")
sec_code2companyname = dict(zip(df_data["ticker"], df_data["company_nm"]))
# Tickers in ascending order.
use_sec_code_list_rev = np.sort(df_data["ticker"])

In [8]:
# Each "text_id" cell holds a stringified id list; drop the first and last
# character (presumably the surrounding brackets) and parse the rest as int32.
use_text_data = []
for raw_ids in df_data_text["text_id"]:
    id_strings = raw_ids[1:-1].split(", ")
    use_text_data.append(np.array(id_strings).astype(np.int32))

# Map the sector / industry names of every company to their integer ids.
use_label_data = []
for sector_name in df_data["sector17"]:
    use_label_data.append(sector2id[sector_name])

use_industry_data = []
for industry_name in df_data["sector33"]:
    use_industry_data.append(industry2id[industry_name])

In [9]:
# Seed NumPy's global RNG so the shuffle below is the same on every run.
seed = 0
np.random.seed(seed)
# A single permutation over all samples; the train/valid/test splits index with it.
random_perm_all = np.random.permutation(len(use_text_data))

## Load Stock Data

In [10]:
use_sec_code_list_rev = np.sort(df_data["ticker"])

# Restrict the similarity matrix to the tickers of df_data, in df_data's order:
# rows are looked up by the ticker value, columns by its string form.
ticker_rows = df_data["ticker"]
ticker_cols = [str(ticker) for ticker in ticker_rows]
similarity_mat_df = pd.read_csv("stock_data/all_similarity_mat.csv",  index_col=0)
similarity_mat_df = similarity_mat_df.loc[ticker_rows, ticker_cols]

In [11]:
# Ids are used as 0-based class indices, so the class count is the largest id plus one.
label_num = 1 + max(sector2id.values())
industry_num = 1 + max(industry2id.values())
# Plain ndarray copy of the ticker-by-ticker similarity table.
similarity_inner_mat = np.array(similarity_mat_df)

In [12]:
def out_actual_predicted_list(sentence_representation_all, theme_word_list, pooled_output, use_sec_code_list_eval, th_val = 0,
                              theme_dir = "./dataset/stock_themes"):
    """Build ground-truth and ranked-prediction index lists for every stock theme.

    For each theme word, `<theme_dir>/<word>.tsv` is read and its "コード" column
    is taken as the set of security codes belonging to the theme. Themes whose
    share of member codes among `use_sec_code_list_eval` is <= `th_val` are skipped.

    Args:
        sentence_representation_all: 2-D array-like with one vector per entry of
            `use_sec_code_list_eval` (passed as-is to sklearn's cosine_similarity).
        theme_word_list: theme names; each must have a matching TSV in `theme_dir`.
        pooled_output: iterable of torch tensors, one theme vector per theme word.
        use_sec_code_list_eval: security codes, aligned with the rows of
            `sentence_representation_all`.
        th_val: minimum member ratio (exclusive) a theme needs to be evaluated.
        theme_dir: directory holding the per-theme TSV files.

    Returns:
        (actual_list, predicted_list): for every kept theme, the row indices of
        its member codes, and all row indices ordered by descending cosine
        similarity to the theme vector.
    """
    actual_list = []
    predicted_list = []
    for word, theme_vector_gpu in zip(theme_word_list, pooled_output):
        theme_df = pd.read_csv(os.path.join(theme_dir, word + ".tsv"), sep = "\t", index_col = 0)
        theme_code_set = set(theme_df["コード"])
        # Membership mask, computed once and reused for both the filter and the labels.
        is_member = np.array([(sec_code in theme_code_set) for sec_code in use_sec_code_list_eval], dtype=bool)
        if np.mean(is_member) <= th_val:
            continue
        # Only themes that pass the filter need their vector moved to the CPU.
        theme_vector = theme_vector_gpu.detach().to("cpu").numpy()
        doc2theme_similarity_mat = sklearn.metrics.pairwise.cosine_similarity(
            sentence_representation_all, theme_vector.reshape(1, -1))
        actual = np.flatnonzero(is_member)
        # argsort is ascending; reverse it to rank the most similar document first.
        predicted = np.argsort(doc2theme_similarity_mat[:, 0])[::-1]
        actual_list.append(list(actual))
        predicted_list.append(list(predicted))
    return actual_list, predicted_list

In [13]:
def _write_ticker_vectors(path, tickers, vectors):
    """Write one `ticker<TAB>v0<TAB>v1...` line per (ticker, vector) pair to `path`."""
    with open(path, "w") as f:
        for ticker, vector in zip(tickers, vectors):
            f.write("\t".join([str(ticker)] + [str(num) for num in vector]) + "\n")


def save_vectors(folda_name, use_ticker_list_train, sentence_representation_all_train, 
                use_ticker_list_valid, sentence_representation_all_valid, 
                use_ticker_list_test, sentence_representation_all_test):
    """Dump the train/valid/test ticker vectors as tab-separated text files.

    Creates `folda_name` if it does not exist and writes
    `train_ticker_vectors.txt`, `valid_ticker_vectors.txt` and
    `test_ticker_vectors.txt` into it. Each line holds a ticker followed by the
    components of its vector, separated by tabs.

    Args:
        folda_name: output directory.
        use_ticker_list_*: tickers of the split.
        sentence_representation_all_*: vectors of the split, aligned with the tickers.
    """
    if folda_name:
        # Avoid a FileNotFoundError on the first save into a fresh directory.
        os.makedirs(folda_name, exist_ok=True)
    splits = (
        ("train", use_ticker_list_train, sentence_representation_all_train),
        ("valid", use_ticker_list_valid, sentence_representation_all_valid),
        ("test", use_ticker_list_test, sentence_representation_all_test),
    )
    for split_name, tickers, vectors in splits:
        _write_ticker_vectors(
            os.path.join(folda_name, split_name + "_ticker_vectors.txt"), tickers, vectors)

In [14]:
# Training split: the first `train_data_size` positions of the shuffled order.
train_idx = random_perm_all[:train_data_size]
train_data_x = np.array(use_text_data)[train_idx]
train_data_y = np.array(use_label_data)[train_idx]
train_data_industry = np.array(use_industry_data)[train_idx]
# Entry [k, i] is the similarity stored at (train_idx[i], train_idx[k]).
train_inner_stock_mat = similarity_inner_mat[np.ix_(train_idx, train_idx)].T

In [15]:
# Validation split: the `valid_size` positions right after the training block.
valid_idx = random_perm_all[train_data_size: train_data_size+ valid_size]
valid_data_x = np.array(use_text_data)[valid_idx]
valid_data_y = np.array(use_label_data)[valid_idx]
valid_data_industry = np.array(use_industry_data)[valid_idx]
# Entry [k, i] is the similarity stored at (valid_idx[i], valid_idx[k]).
valid_inner_stock_mat = similarity_inner_mat[np.ix_(valid_idx, valid_idx)].T

In [16]:
# Test split: the last `test_size` positions of the shuffled order.
test_idx = random_perm_all[-test_size:]
test_data_x = np.array(use_text_data)[test_idx]
test_data_y = np.array(use_label_data)[test_idx]
test_data_industry = np.array(use_industry_data)[test_idx]
# Entry [k, i] is the similarity stored at (test_idx[i], test_idx[k]).
test_inner_stock_mat = similarity_inner_mat[np.ix_(test_idx, test_idx)].T

In [17]:
# Actual number of samples that ended up in the train and test splits.
total_train_examples = train_data_x.shape[0]
test_data_size = test_data_y.shape[0]

In [18]:
# Directory of a locally saved model. In the visible cells it is only referenced
# from a commented-out torch.load call.
write_file_name = "models/save"

In [19]:
# Fine-tuned checkpoints that can be evaluated with this notebook.
# Exactly one of them is selected below via SELECTED_MODEL, instead of relying on
# "the last assignment wins".
MODEL_FILE_CANDIDATES = {
    # all labels
    'Sector_Stock_Name_1/001': './models/BertModel_Mean/Sector_Stock_Name_1/001/pytorch_model.bin',
    'Sector_Name_1': './models/BertModel_Mean/Sector_Name_1/pytorch_model.bin',
    'Sector_Stock_1': './models/BertModel_Mean/Sector_Stock_1/pytorch_model.bin',
    'Sector_1': './models/BertModel_Mean/Sector_1/pytorch_model.bin',
    'OnlyStock': './models/BertModel_Mean/OnlyStock/pytorch_model.bin',
    # 2 labels
    'Sector_Name_2labels/001': './models/BertModel_Mean/Sector_Name_2labels/001/pytorch_model.bin',
    'Stock_Sector_Name_2labels/001': './models/BertModel_Mean/Stock_Sector_Name_2labels/001/pytorch_model.bin',
    # 5 labels
    'Sector_Name_3/001': './models/BertModel_Mean/Sector_Name_3/001/pytorch_model.bin',
    'Sector_Stock_Name_3/001': './models/BertModel_Mean/Sector_Stock_Name_3/001/pytorch_model.bin',
}

# Same checkpoint the original cell ended up with (its final assignment).
SELECTED_MODEL = 'Sector_Stock_Name_3/001'
model_file_name = MODEL_FILE_CANDIDATES[SELECTED_MODEL]

In [43]:
bert_config = BertConfig.from_pretrained('./models/finetuned_lm/')
device = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
#model_state_dict = torch.load(write_file_name + '/pytorch_model.bin')
# Load the weights onto the CPU first: a checkpoint saved from a GPU run would
# otherwise fail to deserialize on a CPU-only host, even though `device`
# already falls back to "cpu". The model is moved to `device` afterwards.
model_state_dict = torch.load(model_file_name, map_location="cpu")
# BertForSequenceMeanVec is not defined in this notebook; presumably it comes
# from the bert_ja star import.
model = BertForSequenceMeanVec(bert_config, label_num, industry_num, state_dict=model_state_dict)
model.to(device)

INFO:pytorch_transformers.modeling_utils:loading configuration file ./models/finetuned_lm/config.json
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 32005
}

INFO:pytorch_transformers.modeling_utils:loading configuration file ./models/PyTorchPretrainendModel/config.json
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inter

BertForSequenceMeanVec(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
          

In [44]:
# Number of GPUs the model is replicated across (hard-coded for the evaluation host).
n_gpu = 2
model.to(device)
# The former apex DistributedDataParallel branch was guarded by `-1 != -1`,
# which is never true, so it has been removed as unreachable code.
# The isinstance check keeps the cell idempotent: re-running it no longer
# wraps an already wrapped model a second time.
if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model)

In [45]:
# Loss objects; neither is referenced in the visible cells of this notebook.
# Cross-entropy that ignores targets equal to -1.
loss_fct = CrossEntropyLoss(ignore_index = -1)
# Mean-squared-error loss.
loss_mse = MSELoss()

In [46]:
# Run the model on the test split. evaluate_model_with_cosine_similarity is not
# defined in this notebook (presumably from the evaluation star import); its
# return value is used below as the per-document vectors for cosine similarity.
sentence_representation_all_test = evaluate_model_with_cosine_similarity(
    model, test_data_x, test_data_y, test_data_industry, test_data_size)

sector
same:  0.4040666
other:  0.2505557
industry
same:  0.45016152
other:  0.25452477


## MAP@K (mean average precision at K)

In [47]:
# Pairwise cosine similarity between all test document vectors.
doc2soc_similarity_mat = sklearn.metrics.pairwise.cosine_similarity(sentence_representation_all_test)
# Per row: column indices ordered from most to least similar.
ascending_order = np.argsort(doc2soc_similarity_mat, axis = 1)
sort_values = ascending_order[:, ::-1]

In [48]:
def _ranked_without_self(sorted_indices):
    """Return, per row, the ranked column indices with the row's own index removed.

    The query document is dropped explicitly instead of assuming it is always
    ranked first: with tied similarities another document can take rank 0, and
    slicing off the first entry would then discard a real neighbour while
    keeping the query itself in the ranking.
    """
    return [[col for col in row if col != row_index]
            for row_index, row in enumerate(sorted_indices)]


def _same_label_neighbours(labels):
    """Return, per sample, the indices of all *other* samples sharing its label."""
    labels = np.asarray(labels)
    positions = np.arange(len(labels))
    return [list(positions[(labels == labels[index]) & (positions != index)])
            for index in range(len(labels))]


predicted_list = _ranked_without_self(sort_values)

# Same evaluation for both label granularities: 17 sectors and 33 industries.
for title, labels in (("17 sector", test_data_y), ("33 sector", test_data_industry)):
    actual_list_rev = _same_label_neighbours(labels)
    print (title)
    for top_n in [5,10, 50]:
        print (top_n , ":",  ml_metrics.mapk(actual_list_rev, predicted_list, top_n))

17 sector
5 : 0.4723583333333333
10 : 0.40502214285714283
50 : 0.3247384807553308
33 sector
5 : 0.39632333333333325
10 : 0.33754092813051145
50 : 0.27227105958480513


## Evaluation for Theme Extraction

In [49]:
cos = nn.CosineSimilarity(dim=2, eps=1e-6)
# NOTE(review): use_sec_code_list_rev is the *sorted* ticker list, while the text
# and label arrays follow df_data's row order. This indexing only lines up with
# the test vectors if df_data is already sorted by ticker - confirm.
use_sec_code_list_eval = np.array(use_sec_code_list_rev)[random_perm_all][-test_size:]
# One theme per "<theme>.tsv" file. Other directory entries are ignored, only the
# trailing extension is stripped, and the names are sorted because os.listdir
# returns entries in arbitrary order.
theme_word_list = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir("./dataset/stock_themes")
    if file_name.endswith(".tsv")
)

In [50]:
# Tokenize every theme word as a one-segment BERT input: [CLS] word [SEP].
theme_token_list = []
for word in theme_word_list:
    text = "[CLS] " + word +  " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    tokenized_id = tokenizer.convert_tokens_to_ids(tokenized_text)
    theme_token_list.append(tokenized_id)

# Fixed input length for theme words: shorter sequences are right-padded with 0,
# longer ones are cut off after `max_seq_length_company` ids (which also drops
# their trailing [SEP]).
max_seq_length_company = 8
input_ids_list = []
for tokenized_id in theme_token_list:
    # np.int was removed in NumPy 1.24; use an explicit integer dtype instead.
    input_array = np.zeros(max_seq_length_company, dtype=np.int64)
    clipped_ids = tokenized_id[:max_seq_length_company]
    input_array[:len(clipped_ids)] = clipped_ids
    input_ids_list.append(input_array)

In [51]:
# Theme token ids as an int64 tensor on the same device as the model.
input_ids = torch.tensor(np.array(input_ids_list), dtype=torch.long).to(device)

# Inference only: switch off dropout so the theme vectors are deterministic, and
# skip autograd bookkeeping (the outputs are detached downstream anyway).
model.eval()
with torch.no_grad():
    label_logits, pooled_output = model(input_ids, labels= None,  
                                                    label_industry = None,
                                                    labels_stock =  None, stock_vol = 0)

In [53]:
# Rank the test documents against every theme vector; themes without any member
# among the evaluated tickers (member ratio <= 0) are skipped.
theme_min_ratio = 0.00
actual_list, predicted_list = out_actual_predicted_list(
    sentence_representation_all_test, theme_word_list, pooled_output,
    use_sec_code_list_eval, theme_min_ratio)

print ("StockTheme")
# Number of themes that were actually evaluated.
print (len(actual_list))
for k in (5, 10, 50):
    theme_map_at_k = ml_metrics.mapk(actual_list, predicted_list, k)
    print (k, ":", theme_map_at_k)

StockTheme
274
5 : 0.17610097323600973
10 : 0.16069969843844537
50 : 0.14372232991205527
