In [2]:
import json
from collections import defaultdict
from mongo_handler import MongoHandler

def get_percentage_changes_training_corpus(k_fold=1):
    """Collect raw and scaled percentage-change labels from the ``input_data`` collection.

    Reads every document with ``is_used == True`` whose ``year`` is not 2021 and
    gathers its ``_id``, its raw ``percentage_change`` and the min-max, standard
    and robust scaled values stored under the key ``str(k_fold)``.

    Args:
        k_fold: Fold identifier; converted to ``str`` to index the per-fold
            scaled values of each document.

    Returns:
        A ``defaultdict(list)`` of parallel lists keyed by ``"_id"``,
        ``"percentage_change"``, ``"percentage_change_min_max"``,
        ``"percentage_change_standard"`` and ``"percentage_change_robust"``.
        No key is created when the query matches nothing.
    """
    handler = MongoHandler()
    handler.connect_to_mongo()
    database = handler.get_database()

    # Materialise the query result up front so it is fully read before processing.
    documents = list(database["input_data"].find({"is_used": True, "year": {"$ne": 2021}}))

    fold_key = str(k_fold)
    # (output column, document field indexed by the fold key)
    scaled_columns = (
        ("percentage_change_min_max", "percentage_change_scaled_min_max"),
        ("percentage_change_standard", "percentage_change_scaled_standard"),
        ("percentage_change_robust", "percentage_change_scaled_robust"),
    )

    dict_of_percentage_changes = defaultdict(list)
    for document in documents:
        dict_of_percentage_changes["_id"].append(document["_id"])
        dict_of_percentage_changes["percentage_change"].append(document["percentage_change"])
        for column, field in scaled_columns:
            dict_of_percentage_changes[column].append(document[field][fold_key])

    return dict_of_percentage_changes
    

In [3]:
# Load the label lists once with the default fold (k_fold=1); the cells below reuse this dict.
dict_of_percentage_changes_train = get_percentage_changes_training_corpus()

In [4]:
import plotly.figure_factory as ff
import numpy as np

# Overlay the distribution of the raw labels and of their three scaled variants.
group_labels = ['percentage_change', "percentage_change_standard", "percentage_change_min_max", "percentage_change_robust"]

# One series per label, in the same order as group_labels.
hist_data = [dict_of_percentage_changes_train[series_name] for series_name in group_labels]

# Create the distplot (no bin_size argument is passed, so the library default applies).
fig = ff.create_distplot(hist_data, group_labels)
fig.show()

In [36]:
import numpy as np

# Raw labels as an array so they can be filtered with boolean masks.
labels = np.array(dict_of_percentage_changes_train["percentage_change"])

mean = labels.mean()
standard_deviation = labels.std()
distance_from_mean = np.abs(labels - mean)

# Keep only the labels strictly closer to the mean than k standard deviations,
# for k = 2 (sigma 2) and k = 3 (sigma 3).
kept_by_sigma = {}
for max_deviations in (2, 3):
    not_outlier = distance_from_mean < max_deviations * standard_deviation
    kept_by_sigma[max_deviations] = labels[not_outlier]

no_outliers2 = kept_by_sigma[2]
no_outliers3 = kept_by_sigma[3]

import plotly.figure_factory as ff
import numpy as np

# Compare the label distributions that remain after the 2-sigma and 3-sigma filters.
series_by_label = {
    'percentage_change_removed_outliers_sigma2': no_outliers2,
    'percentage_change_removed_outliers_sigma3': no_outliers3,
}
group_labels = list(series_by_label)
hist_data = list(series_by_label.values())

# Create the distplot (no bin_size argument is passed, so the library default applies).
fig = ff.create_distplot(hist_data, group_labels)
fig.show()

In [37]:
# Report how many samples survive each outlier filter.
sample_count_lines = (
    f"Samples before removing outliers: {len(labels)}",
    f"Samples after removing outliers - sigma 2: {len(no_outliers2)}",
    f"Samples after removing outliers - sigma 3: {len(no_outliers3)}",
)
print(*sample_count_lines, sep="\n")

Samples before removing outliers: 2166
Samples after removing outliers - sigma 2: 2103
Samples after removing outliers - sigma 3: 2131


In [38]:
# For sigma 2 and 3 
list_of_ids_to_remove_sigma2=[]
list_of_ids_to_remove_sigma3=[]
for item in labels:
    if item not in no_outliers2:
        idx_of_id = dict_of_percentage_changes_train["percentage_change"].index(item)
        list_of_ids_to_remove_sigma2.append(str(dict_of_percentage_changes_train["_id"][idx_of_id]))
    if item not in no_outliers3:
        idx_of_id = dict_of_percentage_changes_train["percentage_change"].index(item)
        list_of_ids_to_remove_sigma3.append(str(dict_of_percentage_changes_train["_id"][idx_of_id]))

In [39]:
# Display the ids of the samples dropped by the 3-sigma filter.
list_of_ids_to_remove_sigma3

['628b9aa8d6c2de67e0c1a806',
 '628b9aa9cd3956a068c1a809',
 '628b9aaa800f2af4c0c1a809',
 '628b9abf3ea0e0086dc1a809',
 '628b9ac3d5ad578a08c1a809',
 '628b9ac40b1c4bfcd8c1a809',
 '628b9ac44745f6cc25c1a809',
 '628b9ac610db4138fdc1a809',
 '628b9aca308235fd01c1a809',
 '628b9ad500f4640003c1a809',
 '628b9ada6ae9b88e64c1a809',
 '628b9ae2f1b00d1f16c1a809',
 '628b9ae284b11b075cc1a807',
 '628b9ae284b11b075cc1a809',
 '628b9aea55569bea2cc1a808',
 '628b9af0270a8c5bc0c1a807',
 '628b9af684c8ce6b05c1a809',
 '628b9afba50aac7853c1a809',
 '628b9b00d872e5ece9c1a7fc',
 '628b9b0894a2c7834ec1a802',
 '628b9b0894a2c7834ec1a805',
 '628b9b091a9c0bbb15c1a809',
 '628b9b138c2aeac1cdc1a806',
 '628b9b143a0de5b1eac1a806',
 '628b9b161380c07e33c1a809',
 '628b9b1cad481bd293c1a809',
 '628b9b23d7066f2587c1a806',
 '628b9b267070d04c7ec1a809',
 '628b9b26aedd086a82c1a801',
 '628b9b2fba911aa031c1a802',
 '628b9b395f51f73a68c1a807',
 '628b9b395f51f73a68c1a809',
 '628b9b3cc18689d98bc1a809',
 '628b9b3dedbb1a2982c1a809',
 '628b9b3f9d89

## Pre-training Adapter results

In [1]:
import pandas as pd
import os
from collections import defaultdict

# Group the exported CSV files by the part of the file name that follows "tag-"
# (e.g. "loss.csv"), collecting one DataFrame per file under that tag.
dict_of_dfs_list = defaultdict(list)

adapter_results_dir = "data/results/pre-train-adapter"
for df_path in os.listdir(adapter_results_dir):
    frames_for_tag = dict_of_dfs_list[df_path.split("tag-")[1]]
    frames_for_tag.append(pd.read_csv(os.path.join(adapter_results_dir, df_path)))

In [9]:
# Placeholders; nothing in this cell assigns them a DataFrame.
concat_eval_loss_df = None
concat_train_loss_df = None
concat_micro_f1_df = None
concat_macro_f1_df = None

# Replace each tag's list of loaded frames with one frame holding the mean per Step.
for k, v in dict_of_dfs_list.items():
    if isinstance(v, pd.DataFrame):
        # Already averaged by an earlier run of this cell. The cell overwrites its
        # own input, and pd.concat rejects a bare DataFrame, so skip instead of failing.
        continue
    concated_df = pd.concat(v)
    grouped_step_df = concated_df.groupby("Step")
    dict_of_dfs_list[k] = grouped_step_df.mean()

In [20]:
import plotly.express as px

# Join the averaged training and validation loss curves on Step.
train_loss_avg = dict_of_dfs_list["loss.csv"]
val_loss_avg = dict_of_dfs_list["eval_loss.csv"]
loss_df_merged = train_loss_avg.merge(val_loss_avg, on='Step', how='inner', suffixes=("_train", "_val"))
# Move the index into regular columns so "Step" can be used as the x axis.
loss_df_merged = loss_df_merged.reset_index()

fig = px.line(
    loss_df_merged,
    x="Step",
    y=["Value_train", "Value_val"],
    title=f"Pre-training Adapter train and validation loss average for all k-folds.",
)
fig.show()

In [19]:
import plotly.express as px

# Join the averaged macro and micro F1 curves of the validation set on Step.
macro_f1_avg = dict_of_dfs_list["eval_macro_F1.csv"]
micro_f1_avg = dict_of_dfs_list["eval_micro_F1.csv"]
macro_micro_df_merged = macro_f1_avg.merge(micro_f1_avg, on='Step', how='inner', suffixes=("_macro_F1", "_micro_F1"))
# Move the index into regular columns so "Step" can be used as the x axis.
macro_micro_df_merged = macro_micro_df_merged.reset_index()

fig = px.line(
    macro_micro_df_merged,
    x="Step",
    y=["Value_macro_F1", "Value_micro_F1"],
    title=f"Pre-training Adapter macro and micro F1 scores on the validation set average for all k-folds.",
)
fig.show()

## Try KPI model

In [7]:
import pandas as pd
import os
from collections import defaultdict

# Group the exported KPI-model CSV files by the part of the file name that
# follows "tag-", collecting one DataFrame per file under that tag.
dict_of_dfs_list_kpi = defaultdict(list)

kpi_results_dir = "data/results/kpi_model"
for df_path in os.listdir(kpi_results_dir):
    frames_for_tag = dict_of_dfs_list_kpi[df_path.split("tag-")[1]]
    frames_for_tag.append(pd.read_csv(os.path.join(kpi_results_dir, df_path)))

In [9]:
# Replace each tag's list of loaded frames with one frame holding the mean per Step.
for k, v in dict_of_dfs_list_kpi.items():
    if isinstance(v, pd.DataFrame):
        # Already averaged by an earlier run of this cell. The cell overwrites its
        # own input, and pd.concat rejects a bare DataFrame, so skip instead of failing.
        continue
    concated_df = pd.concat(v)
    grouped_step_df = concated_df.groupby("Step")
    dict_of_dfs_list_kpi[k] = grouped_step_df.mean()

In [27]:
import plotly.express as px

# Join the averaged KPI-model training and evaluation loss curves on Step.
# Note: this rebinds loss_df_merged, which the adapter section also uses.
train_loss_avg = dict_of_dfs_list_kpi["loss.csv"]
eval_loss_avg = dict_of_dfs_list_kpi["eval_loss.csv"]
loss_df_merged = train_loss_avg.merge(eval_loss_avg, on='Step', how='inner', suffixes=("_train", "_eval"))
# Move the index into regular columns so "Step" can be used as the x axis.
loss_df_merged = loss_df_merged.reset_index()

# Plot from the 151st merged row onwards.
# NOTE(review): presumably skips the very large early losses to keep the y axis readable - confirm.
fig = px.line(loss_df_merged.iloc[150:], x="Step", y=["Value_train", "Value_eval"], title=f"")
fig.show()

In [14]:
# Show the first ten merged rows.
loss_df_merged.head(10)

Unnamed: 0,Step,Wall time_train,Value_train,Wall time_eval,Value_eval
0,5,1653320000.0,11041386.0,1653320000.0,48648.453125
1,15,1653320000.0,26225686.0,1653320000.0,15470.553711
2,18,1653320000.0,50683136.0,1653320000.0,13541.374023
3,24,1653320000.0,27561696.0,1653320000.0,8825.703125
4,40,1653320000.0,18896124.0,1653320000.0,3080.79541
5,42,1653320000.0,85481616.0,1653320000.0,1924.031006
6,44,1653320000.0,1217748.5,1653320000.0,2487.415527
7,45,1653320000.0,26653106.0,1653320000.0,3228.400391
8,47,1653320000.0,3989005.5,1653320000.0,2671.712646
9,53,1653320000.0,4578338.5,1653320000.0,2983.029541


In [15]:
from torch import nn

class KPIModel(nn.Module):
    """Small feed-forward regressor over a flat feature vector.

    With the default arguments the architecture is
    ``Linear(116, 64) -> Dropout(0.2) -> ReLU -> Linear(64, 1)``.

    Args:
        input_size: Number of input features.
        num_classes: Number of output units.
        hidden_size: Width of the hidden layer (used when ``hidden_layers == 1``).
        hidden_layers: ``0`` for a single linear layer, ``1`` for one hidden layer.

    Raises:
        ValueError: If ``hidden_layers`` is neither 0 nor 1.
    """

    def __init__(self, input_size=116, num_classes=1, hidden_size=64, hidden_layers=1):
        super().__init__()
        self.input_size = input_size
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers

        if self.hidden_layers == 0:
            self.layers = nn.Linear(self.input_size, self.num_classes)
        elif self.hidden_layers == 1:
            # Layer order is kept as-is so the parameter names ("layers.0", "layers.3")
            # stay the same for state dicts.
            self.layers = nn.Sequential(
                nn.Linear(self.input_size, self.hidden_size),
                nn.Dropout(0.2),
                nn.ReLU(),
                nn.Linear(self.hidden_size, self.num_classes),
            )
        else:
            # Fail here instead of leaving self.layers undefined until forward().
            raise ValueError(
                f"hidden_layers must be 0 or 1, got {self.hidden_layers!r}"
            )

    def forward(self, x, labels=None):
        """Compute predictions and, when ``labels`` is given, the MSE loss.

        Args:
            x: Input tensor whose last dimension is ``input_size``.
            labels: Optional target tensor with as many elements as the output.

        Returns:
            The output tensor when ``labels`` is ``None``; otherwise the tuple
            ``(loss, outputs)``.
        """
        outputs = self.layers(x)

        if labels is not None:
            # MSELoss broadcasts mismatched shapes, e.g. (N, 1) outputs against (N,)
            # labels become an (N, N) comparison. Align the labels when they hold
            # the same number of elements so the loss is element-wise.
            if labels.shape != outputs.shape and labels.numel() == outputs.numel():
                labels = labels.reshape(outputs.shape)
            loss_fct = nn.MSELoss()
            loss = loss_fct(outputs, labels)
            outputs = (loss, outputs)

        return outputs

In [16]:
import torch
# Restore the trained KPI model weights on CPU and switch to inference mode.
# NOTE(review): absolute local path - this cell only runs on a machine with this layout.
kpi_checkpoint_path = "D:/PythonProjects/K-Adapter/output/kpi_symbolic_kpi-symbolic_percentage_change_robust_kfold-1_batch-64_lr-5e-05_warmup-0_epoch-2000.0_comment-/kpi_pytorch_model.bin"

model = KPIModel()
kpi_state_dict = torch.load(kpi_checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(kpi_state_dict)
model.eval()

KPIModel(
  (layers): Sequential(
    (0): Linear(in_features=116, out_features=64, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [17]:
import pickle
from collections import defaultdict
from mongo_handler import MongoHandler

# Keys of the per-sample feature dictionaries. The order matters:
# run_kpi_model_val_data_per_k_fold concatenates their values in this order
# to build the model input.
list_of_features_dicts = [
    "fundamental_data_imputed_full",
    "fundamental_data_diff_self_t_1",
    "fundamental_data_diff_self_t_2",
    "fundamental_data_diff_industry_t",
    "fundamental_data_diff_industry_t_1",
    "fundamental_data_diff_industry_t_2",
]

def run_kpi_model_val_data_per_k_fold(model, k_fold=1, perc_change_type="standard"):
    """Run ``model`` on every validation sample of one fold and collect per-sample results.

    Validation samples are the ``input_data`` documents with ``is_used == True``
    and ``k_fold_config.<k_fold> == "val"``. For each sample the input vector is
    the concatenated values of the dictionaries named in
    ``list_of_features_dicts``, followed by ``[0, 1]`` when ``is_filing_on_time``
    is truthy and ``[1, 0]`` otherwise. The prediction is mapped back with the
    scaler stored for ``perc_change_type`` / ``k_fold``.

    Args:
        model: Callable as ``model(features, labels)`` returning ``(loss, output)``,
            where ``output`` holds a single element.
        k_fold: Fold identifier used for the query, the scaler lookup and (as
            ``str``) the key of the scaled target.
        perc_change_type: Scaler name; also selects the
            ``percentage_change_scaled_<perc_change_type>`` target field.

    Returns:
        A ``defaultdict(dict)`` keyed by the sample id as ``str``. Each value holds
        ``loss``, ``actual``, ``predicted_inv``, ``act_pred_diff_inv``,
        ``actual_scaled``, ``predicted_scaled`` and ``act_pred_diff_scaled``.
        ``loss`` is the value returned by the model; ``predicted_inv`` and
        ``act_pred_diff_inv`` keep the array shape returned by
        ``scaler.inverse_transform``.

    Raises:
        ValueError: If no scaler document exists for ``perc_change_type`` / ``k_fold``.
    """
    model.eval()
    mongo_handler_obj = MongoHandler()
    mongo_handler_obj.connect_to_mongo()
    db = mongo_handler_obj.get_database()
    data = db["input_data"].find({"is_used": True, f"k_fold_config.{k_fold}": "val"})

    scaler_doc = db["storage"].find_one({"name": perc_change_type, "k_fold": k_fold})
    if scaler_doc is None:
        # Without this check the failure is an opaque "'NoneType' object is not subscriptable".
        raise ValueError(
            f"No scaler stored for name={perc_change_type!r}, k_fold={k_fold!r}"
        )
    # NOTE(security): pickle.loads can execute arbitrary code; this is only safe
    # while the ``storage`` collection contains trusted data.
    scaler = pickle.loads(scaler_doc["dumped_object"])

    fold_key = str(k_fold)
    scaled_field = f"percentage_change_scaled_{perc_change_type}"

    dict_of_results = defaultdict(dict)
    for input_data in data:
        # The feature order follows list_of_features_dicts and each dictionary's stored key order.
        # NOTE(review): presumably this must match the order used during training - confirm.
        list_features = []
        for features_dict in list_of_features_dicts:
            list_features.extend(input_data[features_dict].values())
        list_features += [0, 1] if input_data["is_filing_on_time"] else [1, 0]

        actual_scaled = input_data[scaled_field][fold_key]
        with torch.no_grad():
            curr_loss, curr_output = model(
                torch.FloatTensor(list_features),
                torch.FloatTensor([actual_scaled]),
            )

        # inverse_transform is given a 2-D (1, n) input, hence the reshape.
        inversed_output = scaler.inverse_transform(curr_output.reshape(1, -1))
        actual = input_data["percentage_change"]
        predicted_scaled = curr_output.item()

        dict_of_results[str(input_data["_id"])] = {
            'loss': curr_loss,
            'actual': actual,
            'predicted_inv': inversed_output,
            'act_pred_diff_inv': abs(inversed_output - actual),
            'actual_scaled': actual_scaled,
            'predicted_scaled': predicted_scaled,
            'act_pred_diff_scaled': abs(predicted_scaled - actual_scaled),
        }

    return dict_of_results

In [18]:
import numpy as np

# Evaluate the loaded model on the fold-1 validation samples against the "robust" scaled targets.
dict_of_results = run_kpi_model_val_data_per_k_fold(model, 1, "robust")

In [22]:
# Display the per-sample results.
# NOTE(review): this dumps the whole dict; consider showing only a few entries.
dict_of_results

defaultdict(dict,
            {'62795155199c6e379f89949c': {'loss': tensor(0.8987),
              'actual': -23.060480262395664,
              'predicted_inv': array([[0.85530616]]),
              'act_pred_diff_inv': array([[23.91578642]]),
              'actual_scaled': -0.9583676195443673,
              'predicted_scaled': -0.010371387004852295,
              'act_pred_diff_scaled': 0.947996232539515},
             '62795156199c6e379f8994a1': {'loss': tensor(0.5261),
              'actual': -16.504501348086634,
              'predicted_inv': array([[1.7944044]]),
              'act_pred_diff_inv': array([[18.29890575]]),
              'actual_scaled': -0.6984956168912404,
              'predicted_scaled': 0.026853464543819427,
              'act_pred_diff_scaled': 0.7253490814350598},
             '62795156199c6e379f8994a6': {'loss': tensor(13.0538),
              'actual': 98.70129870129867,
              'predicted_inv': array([[7.55337211]]),
              'act_pred_diff_inv': ar

In [20]:
# Testing: load the fold-1 "robust" scaler directly from the storage collection.
mongo_handler_obj = MongoHandler()
mongo_handler_obj.connect_to_mongo()
db = mongo_handler_obj.get_database()

robust_scaler_query = {"name": "robust", "k_fold": 1}
# The stored document keeps the pickled scaler under "dumped_object".
robust_scaler = pickle.loads(db["storage"].find_one(robust_scaler_query)["dumped_object"])

In [21]:
# Sanity check: invert a single scaled value with the fold-1 robust scaler.
robust_scaler.inverse_transform([[-0.4139048159122467]])

array([[-9.32492344]])

In [23]:
from math import pow

# Squared error of two candidate predictions against one scaled target.
# The note originally attached to this cell reads: "Using minmax".
# NOTE(review): `pred` and `real` equal the 'predicted_scaled' and 'actual_scaled'
# values of the first sample in the robust-scaled results shown earlier in this
# notebook - confirm which scaler is meant.

# Possible predictions
pred_low = 0.21
pred = -0.010371387004852295

# Possible real
real = -0.9583676195443673

for candidate_prediction in (pred_low, pred):
    print(pow(candidate_prediction - real, 2))

1.3650828943997715
0.8986968569091143
