## Getting input data from DB

In [25]:
import json
import requests

# Reporting periods to request: every quarter of the listed years.
list_of_years = [2017, 2018, 2019, 2020, 2021]
list_of_qs = [1, 2, 3, 4]

# Without a timeout a single stalled request would block the cell forever.
request_timeout_s = 300

# Maps "<year>_<quarter>" to the "data" payload returned for that period.
dict_of_res = {}
for year in list_of_years:
    for q in list_of_qs:
        response = requests.get(
            f"http://localhost:8000/api/v1/input_data/{year}/{q}",
            timeout=request_timeout_s,
        )
        resp = response.json()
        if resp["code"] == 200:
            dict_of_res[f"{year}_{q}"] = resp["data"]
        else:
            # Report the periods the API did not serve instead of storing them.
            print(year, q, resp["code"])

2017 1 404
2021 4 404


In [26]:
# Distinct 'cik' values across every fetched period, and the total number of
# records fetched.
set_ciks = {
    data['cik']
    for list_of_data in dict_of_res.values()
    for data in list_of_data
}
num_samples = sum(len(list_of_data) for list_of_data in dict_of_res.values())

print(len(set_ciks))
print(num_samples)

198
3184


## Get the sectors and industries of the input data

In [27]:
from collections import defaultdict

# Tally records per industry twice: once over all periods and once per
# "<year>_<quarter>" key of dict_of_res.
dict_of_industry_counts = defaultdict(int)
dict_of_industry_counts_per_year_q = defaultdict(lambda: defaultdict(int))
for period_key, records in dict_of_res.items():
    for industry in (record["industry"] for record in records):
        dict_of_industry_counts[industry] += 1
        dict_of_industry_counts_per_year_q[period_key][industry] += 1

In [28]:
# Display the per-period industry tallies (period key -> industry -> record count).
dict_of_industry_counts_per_year_q

defaultdict(<function __main__.<lambda>()>,
            {'2017_2': defaultdict(int,
                         {'DRILLING OIL & GAS WELLS': 4,
                          'MINING & QUARRYING OF NONMETALLIC MINERALS (NO FUELS)': 9,
                          'OIL & GAS FIELD SERVICES, NEC': 13,
                          'CRUDE PETROLEUM & NATURAL GAS': 38,
                          'NATURAL GAS TRANSMISSION': 9,
                          'AIR TRANSPORTATION, SCHEDULED': 9,
                          'REFUSE SYSTEMS': 6,
                          'PETROLEUM REFINING': 12,
                          'ARRANGEMENT OF TRANSPORTATION OF FREIGHT & CARGO': 4,
                          'TRUCKING (NO LOCAL)': 12,
                          'TRANSPORTATION SERVICES': 6,
                          'PIPE LINES (NO NATURAL GAS)': 7,
                          'ELECTRIC SERVICES': 9,
                          'WATER SUPPLY': 7,
                          'DEEP SEA FOREIGN TRANSPORTATION OF FREIGHT': 4,
         

In [None]:
# Display the industry tallies accumulated over all fetched periods.
dict_of_industry_counts

In [None]:
from collections import defaultdict

# For each industry, gather the distinct per-period record counts. A set with
# more than one element means the industry's count differs between periods.
dict_of_industries_check = defaultdict(set)
for industry_counts in dict_of_industry_counts_per_year_q.values():
    for industry_name, n_records in industry_counts.items():
        dict_of_industries_check[industry_name].add(n_records)

dict_of_industries_check

## Check how useful the past-data approach is for the companies

In [9]:
from collections import defaultdict

# cik -> kpi -> list of entries, in the iteration order of dict_of_res.
# A reported value is stored as-is. A missing value is stored as the tuple
# (None, value_from_"fundamental_data_imputed_past"_or_None) so that missing
# entries stay distinguishable from reported ones.
dict_of_cik_imputation_kpis_values = defaultdict(lambda: defaultdict(list))
for year_q_k, inputs_data in dict_of_res.items():
    for input_data in inputs_data:
        kpis_for_cik = dict_of_cik_imputation_kpis_values[input_data['cik']]
        for kpi, val in input_data["fundamental_data"].items():
            # Test for None explicitly: a KPI that is legitimately 0 is a
            # reported value, not a missing one.
            if val is None:
                past_val = input_data["fundamental_data_imputed_past"].get(kpi, None)
                kpis_for_cik[kpi].append((val, past_val))
            else:
                kpis_for_cik[kpi].append(val)

In [None]:
# Display the collected entries (cik -> kpi -> list of values / missing-value tuples).
dict_of_cik_imputation_kpis_values

In [11]:
# For every CIK and KPI, record the lengths of the runs of consecutive missing
# entries. A KPI whose very first entry is missing is stored under the key
# "<kpi>_BEGIN_MISSING" (all of its runs go under that key).
dict_of_cik_consecutive_dicts = {}
for cik, dict_of_list in dict_of_cik_imputation_kpis_values.items():
    dict_of_kpi_count_consecutive_missing = defaultdict(list)
    for kpi, list_of_values in dict_of_list.items():
        run_key = kpi
        in_run = False
        for position, entry in enumerate(list_of_values):
            # Only a tuple with a falsy first element counts as missing;
            # anything else ends the current run.
            if type(entry) is not tuple or entry[0]:
                in_run = False
                continue
            if in_run:
                # The open run is always the last one appended for this key.
                dict_of_kpi_count_consecutive_missing[run_key][-1] += 1
                continue
            if position == 0:
                run_key = run_key + '_' + "BEGIN_MISSING"
            dict_of_kpi_count_consecutive_missing[run_key].append(1)
            in_run = True

    dict_of_cik_consecutive_dicts[cik] = dict_of_kpi_count_consecutive_missing
             

In [None]:
# Display the run lengths of consecutive missing entries (cik -> kpi key -> list of run lengths).
dict_of_cik_consecutive_dicts

In [13]:
import json

# Keep only the CIKs that have at least one recorded run, then save as JSON.
dict_of_cik_consecutive_dicts_copy = {
    cik: consecutive_dicts
    for cik, consecutive_dicts in dict_of_cik_consecutive_dicts.items()
    if consecutive_dicts
}

with open('./data/json_cik_consecutives.json', 'w') as out_file:
    json.dump(dict_of_cik_consecutive_dicts_copy, out_file)

In [20]:
# Created here but never filled by this cell.
dict_of_kpi_count = defaultdict(int)

# Print, per CIK, every KPI key that has at least one run of 3 or more
# consecutive missing entries, followed by a separator line.
for cik, consecutive_dicts in dict_of_cik_consecutive_dicts.items():
    long_gap_kpis = [
        (kpi_k, list_counts)
        for kpi_k, list_counts in consecutive_dicts.items()
        if any(run_length >= 3 for run_length in list_counts)
    ]
    if not long_gap_kpis:
        continue

    print(cik)
    for kpi_k, list_counts in long_gap_kpis:
        print(kpi_k, list_counts)
    print('--------------------')

799233
debt_BEGIN_MISSING [11, 4, 3]
times_interest_earned_BEGIN_MISSING [2, 4, 4]
ev_ebitda_BEGIN_MISSING [11, 4, 3]
ev_ebit_BEGIN_MISSING [11, 4, 3]
ev_revenue_BEGIN_MISSING [11, 4, 3]
--------------------
746515
debt_BEGIN_MISSING [9]
times_interest_earned_BEGIN_MISSING [17]
ev_ebitda_BEGIN_MISSING [9]
ev_ebit_BEGIN_MISSING [9]
ev_revenue_BEGIN_MISSING [9]
--------------------
1134115
gross_margin_BEGIN_MISSING [4, 1, 1, 2, 1, 1, 3]
operating_profit_margin_BEGIN_MISSING [4, 1, 1, 2, 1, 1, 3]
net_profit_margin_BEGIN_MISSING [4, 1, 1, 2, 1, 1, 3]
debt_BEGIN_MISSING [20]
quick_BEGIN_MISSING [20]
times_interest_earned_BEGIN_MISSING [1, 5]
capex_to_operating_cash_BEGIN_MISSING [4, 8]
ev_ebitda_BEGIN_MISSING [20]
ev_ebit_BEGIN_MISSING [20]
ev_revenue_BEGIN_MISSING [20]
--------------------
1308208
quick_BEGIN_MISSING [3, 3, 3]
capex_to_operating_cash [3]
operating_cash_flow_BEGIN_MISSING [1, 3]
--------------------
1512499
quick_BEGIN_MISSING [20]
--------------------
1283140
capex_to_ope

In [15]:
# Display the KPI counter. In the visible notebook it is only ever created as an
# empty defaultdict(int) and nothing adds to it, so an empty result is expected.
dict_of_kpi_count

defaultdict(int, {})

## Create a POC model for finding the most similar company based on KPIs

In [4]:
from collections import defaultdict

# period key -> list of (cik, industry, fundamental_data) tuples, one per record.
dict_of_year_q_fund_data = defaultdict(list)

for period_key, records in dict_of_res.items():
    for record in records:
        row = (record["cik"], record["industry"], record['fundamental_data'])
        dict_of_year_q_fund_data[period_key].append(row)

In [6]:
from sklearn.impute import KNNImputer

imputer_test = KNNImputer(n_neighbors=1)

# cik -> list of KPI values, and cik -> industry, for the "2017_2" period only.
dict_of_2017_2_data = {
    cik: list(fun_data_dict.values())
    for cik, _industry, fun_data_dict in dict_of_year_q_fund_data["2017_2"]
}
dict_of_cik_industry = {
    cik: industry
    for cik, industry, _fun_data_dict in dict_of_year_q_fund_data["2017_2"]
}
# Fit on every company of that period; the fitted imputer is the cell's output.
imputer_test.fit(list(dict_of_2017_2_data.values()))

KNNImputer(n_neighbors=1)

In [7]:
# Number of companies in the "2017_2" data with at least one missing (None) KPI.
count = sum(
    1
    for to_be_imputed_data in dict_of_2017_2_data.values()
    if None in to_be_imputed_data
)

print(count)

50


In [None]:
# For each company with missing KPIs, impute them with the fitted 1-NN imputer
# and then look for a complete row whose values at the imputed positions equal
# the imputed values, i.e. a candidate for the row the values were taken from.
for to_be_imputed_cik, to_be_imputed_data in dict_of_2017_2_data.items():
    if None not in to_be_imputed_data:
        continue

    curr_res = imputer_test.transform([to_be_imputed_data])[0]
    # Missing position -> imputed value. Test for None explicitly (matching the
    # guard above) so that a KPI that is legitimately 0 is not treated as missing.
    dict_of_none_idx_imputed_val = {
        idx: curr_res[idx]
        for idx, item in enumerate(to_be_imputed_data)
        if item is None
    }
    desired_matched = len(dict_of_none_idx_imputed_val)

    # Find the one that imputation is done from
    for cik, data in dict_of_2017_2_data.items():
        if None in data:
            continue
        count_matches = sum(
            1
            for idx, item in enumerate(data)
            if dict_of_none_idx_imputed_val.get(idx, None) == item
        )
        if count_matches == desired_matched:
            print(f"Imputed {dict_of_cik_industry[to_be_imputed_cik]} {to_be_imputed_cik} - {curr_res.tolist()}")
            print(f"Original {dict_of_cik_industry[cik]} {cik} - {data}")
            print('------------------------------------')
            break
        

## Create a POC model for finding the most similar company by industry

In [20]:
# Take into account only companies on time
# Build industry -> cik -> [(is_filing_on_time, [(kpi, imputed_value), ...])].
# Only the first period of dict_of_res is processed: the loop breaks after one
# iteration. The on-time flag is stored with each entry, not used to filter here.
for year_q_k, list_of_data in dict_of_res.items():
    dict_of_industry_cik_list_values = defaultdict(lambda: defaultdict(list))
    for record in list_of_data:
        kpi_value_pairs = [
            (kpi, record['fundamental_data_imputed'].get(kpi, None))
            for kpi in record["fundamental_data"]
        ]
        entries_for_cik = dict_of_industry_cik_list_values[record['industry']][record['cik']]
        entries_for_cik.append((record["is_filing_on_time"], kpi_value_pairs))

    break
    

In [21]:
# Display the structure: industry -> cik -> list of (is_filing_on_time, [(kpi, value), ...]).
dict_of_industry_cik_list_values

defaultdict(<function __main__.<lambda>()>,
            {'DRILLING OIL & GAS WELLS': defaultdict(list,
                         {1537028: [(True,
                            [('roe', -40.12059339607593),
                             ('roa', -0.020523884510998562),
                             ('roce', -0.019348522482754805),
                             ('gross_margin', -0.2763886143506622),
                             ('operating_profit_margin', -0.2763886143506622),
                             ('net_profit_margin', -0.30979442577584504),
                             ('debt_to_equity', 0.2144316862545524),
                             ('equity', 0.823430425373794),
                             ('debt', 0.12314985480391162),
                             ('current', 1.8327534639565404),
                             ('quick', 1.1941036440212416),
                             ('cash', 0.4448513703228957),
                             ('times_interest_earned', -44.744),
                 

In [13]:
from sklearn.impute import KNNImputer

# Per industry: fit a 1-nearest-neighbour imputer on the rows of companies that
# filed on time and use it to fill the missing (None) KPI values of every row
# in that industry.
for industry_key, cik_dict_list_values in dict_of_industry_cik_list_values.items():
    list_for_industry_imputer = []  # rows used to fit the imputer (on-time filers only)
    list_of_all_for_industry = []   # rows of every company, on time or not
    initial_kpi_idx_mappper = {}    # kpi name -> column index, from the first on-time row
    create_imputer_for_industry = False
    # Each cik maps to a LIST of (is_filing_on_time, [(kpi, value), ...]) entries,
    # so the entries must be unpacked in a nested loop rather than from .items().
    for cik_k, list_of_entries in cik_dict_list_values.items():
        for on_time, tuple_kpi_value in list_of_entries:
            curr_cik_list = [value for _kpi, value in tuple_kpi_value]

            # Fill initial kpi idx mapper on first possible data
            if on_time and not initial_kpi_idx_mappper:
                initial_kpi_idx_mappper = {
                    kpi: idx for idx, (kpi, _value) in enumerate(tuple_kpi_value)
                }

            # Test for None explicitly so a legitimate 0 does not trigger imputation.
            if None in curr_cik_list:
                create_imputer_for_industry = True

            if on_time:
                list_for_industry_imputer.append(curr_cik_list)
            list_of_all_for_industry.append(curr_cik_list)

    # If there is None, create KNNImputer with the on-time rows. Skip industries
    # without any on-time row: there is nothing to fit the imputer on.
    if create_imputer_for_industry and list_for_industry_imputer:
        curr_imputer = KNNImputer(n_neighbors=1)
        curr_imputer.fit(list_for_industry_imputer)

        for list_of_values_for_cik in list_of_all_for_industry:
            if None in list_of_values_for_cik:
                # transform() expects a 2-D input: wrap the single row, then unwrap it.
                # NOTE(review): columns that are entirely missing in the fit data may be
                # dropped by the imputer, which would shorten the result - confirm.
                imputed_list_of_values_for_cik = curr_imputer.transform([list_of_values_for_cik])[0].tolist()
                
                
            

## Get the fundamental data avg dict for all years and all quarters

In [2]:
# period key -> the "fundamental_data_avg" dict, read from the first record of each period.
dict_fund_data_avg_per_year_q = {
    period_key: records[0]["fundamental_data_avg"]
    for period_key, records in dict_of_res.items()
}

In [3]:
import pandas as pd

# One row per (period, filer type, KPI) holding the ratio
# count_used / count_all, rounded to two decimals.
list_of_dicts_for_df = [
    {
        "year_q": year_q,
        "filer_type": filer_k,
        "kpi": kpi_k,
        "used_to_all": round(dict_counts['count_used'] / dict_counts['count_all'], 2),
    }
    for year_q, dict_info in dict_fund_data_avg_per_year_q.items()
    for filer_k, dict_kpis in dict_info.items()
    for kpi_k, dict_counts in dict_kpis.items()
]

df_for_plot = pd.DataFrame(list_of_dicts_for_df)

In [8]:
# Rows for period "2019_3" restricted to the "non_accelerated_filer" filer type.
df_for_plot.query("year_q == '2019_3' and filer_type == 'non_accelerated_filer'")

Unnamed: 0,year_q,filer_type,kpi,used_to_all
551,2019_3,non_accelerated_filer,roe,0.8
552,2019_3,non_accelerated_filer,roa,0.8
553,2019_3,non_accelerated_filer,roce,0.7
554,2019_3,non_accelerated_filer,gross_margin,0.8
555,2019_3,non_accelerated_filer,operating_profit_margin,0.8
556,2019_3,non_accelerated_filer,net_profit_margin,0.8
557,2019_3,non_accelerated_filer,debt_to_equity,0.7
558,2019_3,non_accelerated_filer,equity,0.8
559,2019_3,non_accelerated_filer,debt,0.6
560,2019_3,non_accelerated_filer,current,0.7


In [33]:
import plotly.express as px

# Histogram of the used-to-all ratio with one colour per KPI and a rug plot
# in the margin.
fig = px.histogram(
    df_for_plot,
    x="used_to_all",
    color="kpi",
    marginal="rug",
    hover_data=df_for_plot.columns,
    title="Percentage used KPIs from all for calculating mean/median | color: KPI",
)
fig.show()

In [35]:
import plotly.express as px

# Histogram of the used-to-all ratio with one colour per period and a rug plot
# in the margin.
fig = px.histogram(
    df_for_plot,
    x="used_to_all",
    color="year_q",
    marginal="rug",
    hover_data=df_for_plot.columns,
    title="Percentage used KPIs from all for calculating mean/median | color: year Q",
)
fig.show()

In [36]:
import plotly.express as px

# Histogram of the used-to-all ratio with one colour per filer type and a rug
# plot in the margin.
fig = px.histogram(
    df_for_plot,
    x="used_to_all",
    color="filer_type",
    marginal="rug",
    hover_data=df_for_plot.columns,
    title="Percentage used KPIs from all for calculating mean/median | color: Filer type",
)
fig.show()