# Import Modules

In [18]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


# Connect to Local Database

In [19]:
# Create connection to local duckdb database
con = duckdb.connect('../db/duck.db')
%sql con --alias duckdb

# Fix NaN and Inf Values

Features with NaNs

In [None]:
%%sql
with count_nan as (
    select id, count(*) as count_nan_rows from features_raw where isnan(value) = true group by id
),
count_all as (
    select id, count(*) as count_all_rows from features_raw group by id
)
select *, round(count_nan_rows/count_all_rows, 4) as percentage_nan from count_nan join count_all using(id) order by count_nan desc;

id,count_nan_rows,count_all_rows,percentage_nan
wr,17770,19777042,0.0009


Features with Infs

In [None]:
%%sql
with count_inf as (
    select id, count(*) as count_inf_rows from features_raw where isinf(value) = true group by id
),
count_all as (
    select id, count(*) as count_all_rows from features_raw group by id
)
select *, round(count_inf_rows/count_all_rows, 4) as percentage_inf from count_inf join count_all using(id) order by count_inf desc;

id,count_inf_rows,count_all_rows,percentage_inf
volume_adi,314481,19312694,0.0163
volatility_kcp,5250,19772677,0.0003
trend_vortex_ind_pos,13,19613197,0.0
trend_vortex_ind_neg,59,19613243,0.0
momentum_wr,4,19624666,0.0
momentum_uo,1,19449230,0.0
momentum_stoch,4,19624666,0.0


Calculate the median for each feature ID

In [None]:
%%sql
select
    id, 
    median(value) as median
from features_raw
where isnan(value) = false and isinf(value) = false
group by id

id,median
volume_em,0.0
volatility_bbhi,0.0
momentum_12m,0.0168195718654433
trend_ichimoku_base,25.295
trend_sma_slow,25.275769230769228
momentum_ao,0.0251764705882351
volume_vwap,25.231799843847995
others_cr,4.290617848970246
log_price,1.401228167498113
trend_visual_ichimoku_a,25.315


Replace NaNs and Infs with the median for the corresponding feature ID

In [None]:
%%sql
with median_values as (
    select 
        id, 
        median(value) as median
    from features_raw 
    where isnan(value) = false and isinf(value) = false group by id
)
select 
    date,
    symbol,
    id,
    case
        when isnan(value) = true then median
        when isinf(value) = true then median
        else value
    end as value
from features_raw 
join median_values using(id)

date,symbol,id,value
2023-04-06,GSEW,volatility_bbhi,0.0
2023-04-11,GSEW,trend_sma_slow,58.77923076923078
2023-04-11,GSEW,momentum_roc,4.262692774215909
2023-04-21,GSEW,volatility_kch,60.687333333333335
2023-04-24,GSEW,momentum_stoch,67.7018633540372
2023-04-25,GSEW,momentum_stoch,0.0
2023-04-28,GSEW,volatility_dcp,0.6410256410256401
2023-05-02,GSEW,high,59.71
2023-05-04,GSEW,trend_kst_diff,-8.518817136667053
2023-05-05,GSEW,volatility_kcp,0.4192580469176245


# Create Table

In [None]:
con.sql("""
create or replace table features_cleaned as (
    with median_values as (
    select 
        id, 
        median(value) as median
    from features_raw 
    where isnan(value) = false and isinf(value) = false group by id
    )
    select 
        date,
        symbol,
        id,
        case
            when isnan(value) = true then median
            when isinf(value) = true then median
            else value
        end as value
    from features 
    join median_values using(id)
)
""")

# Data Checks

Check row counts

In [None]:
%%sql
select count(*) from features_cleaned
union all
select count(*) from features_raw

count_star()
2299537684
2299537684


Check for NaNs

In [9]:
%%sql
select id, count(*) as count_nan_rows from features_cleaned where isnan(value) = true group by id


id,count_nan_rows


Check for Infs

In [10]:
%%sql
select id, count(*) as count_inf_rows from features_cleaned where isinf(value) = true group by id

id,count_inf_rows


# Close Database Connection

In [17]:
con.close()