## Getting input data from the DB (via the local REST API)

In [1]:
import json
import requests

# Years and quarters to request from the input-data API.
list_of_years = [2017, 2018, 2019, 2020, 2021]
list_of_qs = [1, 2, 3, 4]

# Seconds to wait on the API before giving up; without a timeout a stalled
# server would block the notebook indefinitely.
REQUEST_TIMEOUT_S = 60

# Maps "<year>_<quarter>" to the "data" payload returned for that period.
dict_of_res = {}
for year in list_of_years:
    for q in list_of_qs:
        # Exclude 2017 quarter 1:
        if year == 2017 and q == 1:
            continue

        http_resp = requests.get(
            f"http://localhost:8000/api/v1/input_data/{year}/{q}",
            timeout=REQUEST_TIMEOUT_S,
        )
        # The JSON body carries its own "code" field; only periods answered
        # with code 200 are kept, every other period is silently skipped.
        resp = http_resp.json()
        if resp["code"] == 200:
            dict_of_res[f"{year}_{q}"] = resp["data"]

## Get the sectors and industries of the input data

In [7]:
from collections import defaultdict

# Tally input records per industry: once across every period and once per
# "<year>_<quarter>" key of dict_of_res.
dict_of_industry_counts = defaultdict(int)
dict_of_industry_counts_per_year_q = defaultdict(lambda: defaultdict(int))
for period_key, period_rows in dict_of_res.items():
    for row in period_rows:
        industry_name = row["industry"]
        dict_of_industry_counts[industry_name] += 1
        dict_of_industry_counts_per_year_q[period_key][industry_name] += 1

In [58]:
# Display the per-period tallies: period key -> industry -> record count.
dict_of_industry_counts_per_year_q

defaultdict(<function __main__.<lambda>()>,
            {'2017_2': defaultdict(int,
                         {'TRUCKING (NO LOCAL)': 12,
                          'DRILLING OIL & GAS WELLS': 3,
                          'MINING & QUARRYING OF NONMETALLIC MINERALS (NO FUELS)': 9,
                          'CRUDE PETROLEUM & NATURAL GAS': 38,
                          'OIL & GAS FIELD SERVICES, NEC': 10,
                          'ARRANGEMENT OF TRANSPORTATION OF FREIGHT & CARGO': 3,
                          'NATURAL GAS DISTRIBUTION': 3,
                          'RAILROADS, LINE-HAUL OPERATING': 4,
                          'BITUMINOUS COAL & LIGNITE SURFACE MINING': 2,
                          'METAL MINING': 4,
                          'AIR TRANSPORTATION, SCHEDULED': 9,
                          'HAZARDOUS WASTE MANAGEMENT': 3,
                          'REFUSE SYSTEMS': 4,
                          'NATURAL GAS TRANSMISSION': 7,
                          'TRANSPORTATION SERVICES

In [None]:
# Display the overall tallies: industry -> record count across all periods.
dict_of_industry_counts

## Check how useful the past-data approach is for the companies

In [62]:
from collections import defaultdict

# For every company (keyed by CIK), count in how many periods each KPI was
# flagged as imputed in "fundamental_data_imputed".
dict_of_cik_imputation_roes_count = defaultdict(lambda: defaultdict(int))
for year_q_k, inputs_data in dict_of_res.items():
    for input_data in inputs_data:
        curr_cik = input_data["cik"]
        if curr_cik not in dict_of_cik_imputation_roes_count:
            # First time this CIK is seen: start every KPI listed in its
            # fundamental_data at zero so never-imputed KPIs still show up.
            list_of_roes = list(input_data["fundamental_data"].keys())
            dict_of_cik_imputation_roes_count[curr_cik] = dict.fromkeys(list_of_roes, 0)

        curr_counts = dict_of_cik_imputation_roes_count[curr_cik]
        for roe, val in input_data["fundamental_data_imputed"].items():
            if val:
                # .get() so a KPI that only appears in the imputed-flags dict
                # is counted instead of raising KeyError on the plain dict.
                curr_counts[roe] = curr_counts.get(roe, 0) + 1

dict_of_cik_imputation_roes_count
        

defaultdict(<function __main__.<lambda>()>,
            {799233: {'roe': 18,
              'roa': 18,
              'roce': 18,
              'gross_margin': 18,
              'operating_profit_margin': 18,
              'net_profit_margin': 18,
              'debt_to_equity': 18,
              'equity': 18,
              'debt': 8,
              'current': 18,
              'quick': 18,
              'cash': 18,
              'times_interest_earned': 17,
              'capex_to_operating_cash': 18,
              'operating_cash_flow': 18,
              'price_to_earnings': 18,
              'ev_ebitda': 8,
              'ev_ebit': 8,
              'ev_revenue': 8},
             1537028: {'roe': 18,
              'roa': 18,
              'roce': 18,
              'gross_margin': 18,
              'operating_profit_margin': 18,
              'net_profit_margin': 18,
              'debt_to_equity': 18,
              'equity': 18,
              'debt': 18,
              'current': 18,
   

In [61]:
# list_of_roes is (re)assigned inside the imputation-count loop each time a CIK
# is seen for the first time, so this shows the fundamental_data keys of the
# most recently first-seen company.
# NOTE(review): presumably every company shares the same KPI keys - confirm.
list_of_roes

['roe',
 'roa',
 'roce',
 'gross_margin',
 'operating_profit_margin',
 'net_profit_margin',
 'debt_to_equity',
 'equity',
 'debt',
 'current',
 'quick',
 'cash',
 'times_interest_earned',
 'capex_to_operating_cash',
 'operating_cash_flow',
 'price_to_earnings',
 'ev_ebitda',
 'ev_ebit',
 'ev_revenue']

## Create a POC model for finding the most similar company based on KPIs

In [53]:
from collections import defaultdict

# Group the fetched rows by period: each "<year>_<quarter>" key maps to a list
# of (cik, industry, fundamental_data) tuples.
dict_of_year_q_fund_data = defaultdict(list)

for period_key, period_rows in dict_of_res.items():
    for row in period_rows:
        bucket = dict_of_year_q_fund_data[period_key]
        bucket.append((row["cik"], row["industry"], row["fundamental_data"]))

In [54]:
from sklearn.impute import KNNImputer

# Fit a 1-nearest-neighbour imputer on the 2017 Q2 KPI vectors.
imputer_test = KNNImputer(n_neighbors=1)

rows_2017_2 = dict_of_year_q_fund_data["2017_2"]
# cik -> KPI values, in the order they appear in the fundamental_data dict.
dict_of_2017_2_data = {cik: list(kpis.values()) for cik, _, kpis in rows_2017_2}
# cik -> industry label.
dict_of_cik_industry = {cik: industry for cik, industry, _ in rows_2017_2}
imputer_test.fit(list(dict_of_2017_2_data.values()))

KNNImputer(n_neighbors=1)

In [55]:
# Number of 2017 Q2 companies whose KPI vector contains at least one None.
count = sum(
    1 for kpi_values in dict_of_2017_2_data.values() if None in kpi_values
)

print(count)

53


In [57]:
# For every 2017 Q2 company with missing KPIs, fill the gaps with the fitted
# imputer and look for the complete company whose values were copied in.
for to_be_imputed_cik, to_be_imputed_data in dict_of_2017_2_data.items():
    if None not in to_be_imputed_data:
        continue

    curr_res = imputer_test.transform([to_be_imputed_data])[0]
    # Position -> imputed value, only for positions that were really missing.
    # `is None` (not truthiness) so a genuine 0 / 0.0 KPI is not mistaken for a
    # gap, which would wrongly add it to the set of values a donor must match.
    # NOTE(review): assumes transform() keeps all columns so indices line up
    # with the input; confirm no KPI column was entirely missing at fit time.
    dict_of_none_idx_imputed_val = {
        idx: curr_res[idx]
        for idx, item in enumerate(to_be_imputed_data)
        if item is None
    }
    # Loop-invariant for the donor search below.
    desired_matched = len(dict_of_none_idx_imputed_val)

    # Find the one that imputation is done from. Only companies without any
    # missing KPI are considered, and the first one whose values equal every
    # imputed value (exact equality) is reported.
    for cik, data in dict_of_2017_2_data.items():
        if None in data:
            continue

        count_matches = sum(
            1
            for idx, item in enumerate(data)
            if dict_of_none_idx_imputed_val.get(idx, None) == item
        )
        if count_matches == desired_matched:
            print(f"Imputed {dict_of_cik_industry[to_be_imputed_cik]} {to_be_imputed_cik} - {curr_res.tolist()}")
            print(f"Original {dict_of_cik_industry[cik]} {cik} - {data}")
            print('------------------------------------')
            break
        

Imputed CRUDE PETROLEUM & NATURAL GAS 1602065 - [33.49399573891149, 0.029818507342006095, 0.031009746746506524, 0.8443386080811185, 0.3295437087110155, 0.31728376094638194, 0.0012606293316062326, 0.998740957853853, 0.00042271977905846217, 51.5355504587156, 43.325688073394495, 32.75573394495413, 35.049019607843135, 5.088005595057699, 50.05733944954128, 61.0980352280074, 9.797949410908199, 15.812036005839062, 5.86761095692794]
Original TRANSPORTATION SERVICES 40211 - [24.090434782608696, 0.008102129098620525, 0.01653533317357166, 0.39006643467257196, 0.37108509965200887, 0.18190446061372983, 4.123375685821542, 0.19518381265059392, 0.00042271977905846217, 74.06666666666666, 74.06666666666666, 51.733333333333334, 2.99234693877551, 0.4007751937984496, 17.233333333333334, 34.90350997365082, 9.797949410908199, 15.812036005839062, 5.86761095692794]
------------------------------------
Imputed ELECTRIC SERVICES 1296445 - [31.421009262942547, 0.014078230704735573, 0.02666248472668565, 0.39184193

## Get the fundamental data average dict for all years and all quarters

In [2]:
# Period key -> the "fundamental_data_avg" dict taken from that period's first
# record. Periods with an empty record list are skipped rather than raising
# IndexError on [0].
# NOTE(review): presumably the averages are identical on every record of a
# period, which is why only the first one is read - confirm.
dict_fund_data_avg_per_year_q = {
    year_q: input_data_list[0]["fundamental_data_avg"]
    for year_q, input_data_list in dict_of_res.items()
    if input_data_list
}

In [3]:
import pandas as pd

# Flatten the nested {period: {filer type: {KPI: counts}}} structure into one
# row per (period, filer type, KPI), holding the share of values that were
# used (count_used / count_all, rounded to 2 decimals).
list_of_dicts_for_df = []
for year_q, dict_info in dict_fund_data_avg_per_year_q.items():
    for filer_k, dict_kpis in dict_info.items():
        for kpi_k, dict_counts in dict_kpis.items():
            count_all = dict_counts["count_all"]
            # A KPI with no values at all has no meaningful ratio; record NaN
            # instead of failing with ZeroDivisionError.
            if count_all:
                used_to_all = round(dict_counts["count_used"] / count_all, 2)
            else:
                used_to_all = float("nan")

            list_of_dicts_for_df.append(
                {
                    "year_q": year_q,
                    "filer_type": filer_k,
                    "kpi": kpi_k,
                    "used_to_all": used_to_all,
                }
            )

# Explicit columns keep the frame's schema stable even when no rows were built.
df_for_plot = pd.DataFrame(
    list_of_dicts_for_df,
    columns=["year_q", "filer_type", "kpi", "used_to_all"],
)

In [8]:
# Show the used-to-all ratios of non-accelerated filers for 2019 Q3.
is_2019_q3 = df_for_plot["year_q"] == "2019_3"
is_non_accelerated = df_for_plot["filer_type"] == "non_accelerated_filer"
df_for_plot[is_2019_q3 & is_non_accelerated]

Unnamed: 0,year_q,filer_type,kpi,used_to_all
551,2019_3,non_accelerated_filer,roe,0.8
552,2019_3,non_accelerated_filer,roa,0.8
553,2019_3,non_accelerated_filer,roce,0.7
554,2019_3,non_accelerated_filer,gross_margin,0.8
555,2019_3,non_accelerated_filer,operating_profit_margin,0.8
556,2019_3,non_accelerated_filer,net_profit_margin,0.8
557,2019_3,non_accelerated_filer,debt_to_equity,0.7
558,2019_3,non_accelerated_filer,equity,0.8
559,2019_3,non_accelerated_filer,debt,0.6
560,2019_3,non_accelerated_filer,current,0.7


In [33]:
import plotly.express as px

# Distribution of the used-to-all ratio, one colour per KPI.
hist_title = "Percentage used KPIs from all for calculating mean/median | color: KPI"
fig = px.histogram(
    df_for_plot,
    x="used_to_all",
    color="kpi",
    marginal="rug",
    hover_data=df_for_plot.columns,
    title=hist_title,
)
fig.show()

In [35]:
import plotly.express as px

# Distribution of the used-to-all ratio, one colour per "<year>_<quarter>".
hist_title = "Percentage used KPIs from all for calculating mean/median | color: year Q"
fig = px.histogram(
    df_for_plot,
    x="used_to_all",
    color="year_q",
    marginal="rug",
    hover_data=df_for_plot.columns,
    title=hist_title,
)
fig.show()

In [36]:
import plotly.express as px

# Distribution of the used-to-all ratio, one colour per filer type.
hist_title = "Percentage used KPIs from all for calculating mean/median | color: Filer type"
fig = px.histogram(
    df_for_plot,
    x="used_to_all",
    color="filer_type",
    marginal="rug",
    hover_data=df_for_plot.columns,
    title=hist_title,
)
fig.show()