# Import Modules

In [14]:
import duckdb
import pandas as pd
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


# Connect to Local Database

In [15]:

# Create connection to local duckdb database
con = duckdb.connect('../db/duck.db')
%sql con --alias duckdb

In [3]:
con.sql(" select name from (show all tables) where name like 'feature%' or name like '%tmp%' ")

┌────────────────────────┐
│          name          │
│        varchar         │
├────────────────────────┤
│ features_cleaned       │
│ features_final         │
│ features_iqr_id        │
│ features_normalized    │
│ features_raw           │
│ features_selected      │
│ features_v_mo          │
│ features_v_res         │
│ features_v_ta          │
│ features_winsorized_id │
│ tmp_feat_ta_lib        │
│ tmp_feat_technicals    │
├────────────────────────┤
│        12 rows         │
└────────────────────────┘

# Winsorization

99-1 percentile values per feature

In [4]:
con.sql("""
SELECT 
    id,
    PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY value) AS p1, 
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY value) AS p99  
FROM features_cleaned
GROUP BY id
""")

┌───────────────────────────┬──────────────────────┬────────────────────┐
│            id             │          p1          │        p99         │
│          varchar          │        double        │       double       │
├───────────────────────────┼──────────────────────┼────────────────────┤
│ trend_kst_sig             │   -319.4090756687673 │ 448.44804289257803 │
│ momentum_pvo_signal       │    -45.4794400972397 │  31.41600886408869 │
│ momentum_kama             │   0.7605025855930496 │  698.7334517042663 │
│ trend_psar_down_indicator │                  0.0 │                1.0 │
│ volume_mfi                │    9.857041832342173 │  92.96242121857581 │
│ volatility_bbl            │    0.550716019084631 │  611.1072691939922 │
│ momentum_ppo_signal       │  -12.368686195013446 │  8.068686328965741 │
│ price_to_52w_close_high   │  0.11311529716150169 │                1.0 │
│ dayofmonth                │                  1.0 │               31.0 │
│ amihud                    │ 0.009717

99-1 percentile values per feature by date

In [5]:
con.sql("""
SELECT 
    date,
    id,
    PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY value) AS p1, 
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY value) AS p99  
FROM features_cleaned
where date = '2024-03-18'
GROUP BY id, date
""")

┌────────────┬──────────────────┬─────────────────────┬────────────────────┐
│    date    │        id        │         p1          │        p99         │
│    date    │     varchar      │       double        │       double       │
├────────────┼──────────────────┼─────────────────────┼────────────────────┤
│ 2024-03-18 │ trend_aroon_up   │                 0.0 │              100.0 │
│ 2024-03-18 │ volatility_bbl   │  0.3335671390145624 │  397.7753967821445 │
│ 2024-03-18 │ momentum_1m      │ -0.4378862359550562 │ 0.5568725536992857 │
│ 2024-03-18 │ dayofyear        │                78.0 │               78.0 │
│ 2024-03-18 │ volatility_bbli  │                 0.0 │                1.0 │
│ 2024-03-18 │ fwd_return_3m    │ -0.6624252720252719 │ 0.8029753581661885 │
│ 2024-03-18 │ momentum_kama    │  0.5375052489974704 │  410.4634556386863 │
│ 2024-03-18 │ volatility_bbm   │            0.566075 │ 412.14152499999994 │
│ 2024-03-18 │ volatility_bbhi  │                 0.0 │                1.0 │

Create table: winsorize by id

In [None]:
con.sql("""
CREATE TABLE features_winsorized_id AS
WITH feature_percentiles AS (
    SELECT 
        id,
        PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY value) AS p1, 
        PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY value) AS p99  
    FROM features_cleaned
    GROUP BY id
)
SELECT 
    f.date,
    f.symbol,
    f.id,
    CASE 
        WHEN f.value < fp.p1 THEN fp.p1
        WHEN f.value > fp.p99 THEN fp.p99
        ELSE f.value
    END AS value
FROM features_cleaned f
JOIN feature_percentiles fp ON f.id = fp.id;
""")

# IQR * 1.5

In [4]:
con.sql("""
CREATE TABLE features_iqr_id AS
WITH feature_iqr AS (
    SELECT
        id,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY value) AS q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY value) AS q3,
        (PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY value) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY value)) * 1.5 AS iqr_1_5,
    FROM features_cleaned
    GROUP BY id
)
SELECT
    f.date,
    f.symbol,
    f.id,
    value
FROM features_cleaned f
JOIN feature_iqr fi ON f.id = fi.id
WHERE f.value >= (fi.q1 - fi.iqr_1_5) AND f.value <= (fi.q3 + fi.iqr_1_5);
""")

Create table: winsorize by date & id

In [6]:
con.sql("""
CREATE TABLE features_winsorized_id_date AS
WITH feature_percentiles AS (
    SELECT
        date,
        id,
        PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY value) AS p1, 
        PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY value) AS p99  
    FROM features_cleaned
    GROUP BY date, id
)
SELECT 
    f.date,
    f.symbol,
    f.id,
    CASE 
        WHEN f.value < fp.p1 THEN fp.p1
        WHEN f.value > fp.p99 THEN fp.p99
        ELSE f.value
    END AS value
FROM features_cleaned f
JOIN feature_percentiles fp ON f.id = fp.id AND f.date = fp.date;
;
""")

# Z-Score Normalization by Date and ID (Mean and Standard Deviation)

In [None]:
con.sql("""
CREATE OR REPLACE TABLE features_normalized2 AS
WITH feature_stats AS (
    SELECT
        date,
        id,
        AVG(value) AS mean,
        STDDEV_POP(value) AS stddev
    FROM features_winsorized_id_date
    GROUP BY date, id
)
SELECT
    f.date,
    f.symbol,
    f.id,
    case
        when f.id in ('dayofweek', 'dayofmonth', 'month', 'dayofyear', 'trend_psar_up_indicator', 'trend_psar_down_indicator') then f.value
        when fs.stddev = 0 then 0
        else (f.value - fs.mean) / fs.stddev
    end AS value
FROM features_winsorized_id_date f
JOIN feature_stats fs ON f.date = fs.date AND f.id = fs.id;
""")

# Z-Score Normalization by Date and ID (Median and IQR)

In [None]:
# TODO
con.sql("""
WITH feature_stats AS (
    -- Calculate the median and IQR for each feature by date
    SELECT 
        date,
        id AS feature_id,
        PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY value) AS median_value,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY value) 
            - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY value) AS iqr_value
    FROM features
    GROUP BY date, id
),
normalized_features AS (
    -- Normalize each feature value using median and IQR
    SELECT 
        f.date,
        f.symbol,
        f.id AS feature_id,
        (f.value - s.median_value) / NULLIF(s.iqr_value, 0) AS normalized_value
    FROM features f
    JOIN feature_stats s 
        ON f.date = s.date AND f.id = s.feature_id
)
-- Select the final normalized dataset
SELECT * FROM normalized_features;

"""))

# Distribution of Features

In [None]:
columns = con.execute("SELECT distinct id FROM features_winsorized_id order by id desc").fetchall()
columns = [col[0] for col in columns]
columns

['adtv_30d',
 'amihud',
 'dayofmonth',
 'dayofweek',
 'dayofyear',
 'fwd_return_1m',
 'fwd_return_1w',
 'fwd_return_3m',
 'fwd_return_6m',
 'log_price',
 'log_volume',
 'lottery',
 'momentum_10y',
 'momentum_12m',
 'momentum_12m_1m',
 'momentum_1m',
 'momentum_3m',
 'momentum_5y',
 'momentum_6m',
 'momentum_9m',
 'momentum_ao',
 'momentum_kama',
 'momentum_ppo',
 'momentum_ppo_hist',
 'momentum_ppo_signal',
 'momentum_pvo',
 'momentum_pvo_hist',
 'momentum_pvo_signal',
 'momentum_roc',
 'momentum_rsi',
 'momentum_stoch',
 'momentum_stoch_rsi',
 'momentum_stoch_rsi_d',
 'momentum_stoch_rsi_k',
 'momentum_stoch_signal',
 'momentum_tsi',
 'momentum_uo',
 'momentum_wr',
 'month',
 'others_cr',
 'others_dlr',
 'others_dr',
 'price_to_52w_close_high',
 'price_to_52w_close_low',
 'price_to_52w_high',
 'price_to_52w_low',
 'trend_adx',
 'trend_adx_neg',
 'trend_adx_pos',
 'trend_aroon_down',
 'trend_aroon_ind',
 'trend_aroon_up',
 'trend_cci',
 'trend_dpo',
 'trend_ema_fast',
 'trend_ema_slow'

In [4]:
import duckdb
import matplotlib.pyplot as plt
import pandas as pd
import os

# Create the img directory if it doesn't exist
if not os.path.exists('./img'):
    os.makedirs('./img')

# Query to get all column names
columns = con.execute("SELECT distinct id FROM features_winsorized_id order by id desc").fetchall()
columns = [col[0] for col in columns]

# Plot distributions
import numpy as np

for col in columns:
    df = con.execute(f"SELECT value FROM features_winsorized_id where id = '{col}'").fetchdf()
    data = df['value']

    # Calculate the IQR (Interquartile Range)
    iqr = np.percentile(data, 75) - np.percentile(data, 25)

    # Calculate the bin width using the Freedman-Diaconis rule
    bin_width = 2 * iqr / (len(data) ** (1/3))

    # Calculate the number of bins
    num_bins = int((np.max(data) - np.min(data)) / bin_width)

    # Plot the histogram
    plt.figure(figsize=(8, 4))
    plt.hist(data, bins=num_bins, edgecolor='black')
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.savefig(f'./img/{col}_distribution.png')
    plt.close()


  num_bins = int((np.max(data) - np.min(data)) / bin_width)


OverflowError: cannot convert float infinity to integer

# Close Database Connection

In [17]:
con.close()