# Import Modules

In [1]:
import duckdb
import pandas as pd
%load_ext sql

# Connect to Local Database

In [2]:

# Create connection to local duckdb database
con = duckdb.connect('../db/duck.db')
%sql con --alias duckdb

# Create Features

In [None]:
# Build the per-symbol technical feature set from `pricing` and store it in the
# session-scoped temp table `tmp_feat_technicals` (one row per symbol and day).
#
# Notes on the SQL below:
#   * Every divisor is wrapped in NULLIF(..., 0), as the price_to_52w_* ratios
#     already were, so a zero denominator yields NULL regardless of how the
#     engine is configured to treat division by zero. This matters in practice:
#     on the first row of each symbol the Williams %R range is max - min = 0,
#     and a zero-volume day makes the Amihud denominator zero.
#   * log() arguments are guarded the same way, so a zero volume or close
#     produces NULL instead of an invalid logarithm.
#   * Repeated window specifications are declared once in a WINDOW clause.
#   * The former pass-through `price` CTE (SELECT * ... ORDER BY) was dropped:
#     each window function carries its own ORDER BY, so the CTE-level sort had
#     no effect on the results.
#   * LAG offsets count rows, while the RANGE frames count calendar days.
# NOTE(review): the calendar windows use 29 days (vol_vo_p_1m, adtv_30d),
# 30 days (lottery, amihud) and 21 days (wr, VPT) preceding the current row;
# they are kept exactly as before - confirm the differing lengths are intended.
query = """
CREATE OR REPLACE TEMP TABLE tmp_feat_technicals AS (
    WITH technicals AS (
        SELECT
            date_trunc('day', date) AS date,
            act_symbol,
            volume,

            -- Price Momentum Features (offsets are rows, i.e. trading days)
            (close - LAG(close, 21) OVER by_symbol) / NULLIF(LAG(close, 21) OVER by_symbol, 0) AS momentum_1m,
            (close - LAG(close, 63) OVER by_symbol) / NULLIF(LAG(close, 63) OVER by_symbol, 0) AS momentum_3m,
            (close - LAG(close, 126) OVER by_symbol) / NULLIF(LAG(close, 126) OVER by_symbol, 0) AS momentum_6m,
            (close - LAG(close, 189) OVER by_symbol) / NULLIF(LAG(close, 189) OVER by_symbol, 0) AS momentum_9m,
            (close - LAG(close, 252) OVER by_symbol) / NULLIF(LAG(close, 252) OVER by_symbol, 0) AS momentum_12m,
            momentum_12m - momentum_1m AS momentum_12m_1m,
            (close - LAG(close, 1260) OVER by_symbol) / NULLIF(LAG(close, 1260) OVER by_symbol, 0) AS momentum_5y,
            (close - LAG(close, 2520) OVER by_symbol) / NULLIF(LAG(close, 2520) OVER by_symbol, 0) AS momentum_10y,

            -- Volatility of volume/price
            STDDEV(LOG(NULLIF(volume, 0) / NULLIF(close, 0))) OVER trailing_364d AS vol_vo_p_12m,
            STDDEV(LOG(NULLIF(volume, 0) / NULLIF(close, 0))) OVER trailing_29d AS vol_vo_p_1m,

            -- Volume Price Trend (VPT): daily building block, aggregated below
            (close / NULLIF(LAG(close, 1) OVER by_symbol, 0) - 1) * volume AS daily_component,

            -- Williams %R (NULL when the window's high equals its low)
            (MAX(close) OVER trailing_21d - close)
                / NULLIF(MAX(close) OVER trailing_21d - MIN(close) OVER trailing_21d, 0) AS wr,

            -- Volume Indicators
            AVG(volume) OVER trailing_29d AS adtv_30d,

            -- Lottery: daily building block, aggregated below
            (close - LAG(close, 1) OVER by_symbol) / NULLIF(LAG(close, 1) OVER by_symbol, 0) AS daily_return,

            -- Amihud illiquidity: daily building block, aggregated below
            ABS((close - LAG(close, 1) OVER by_symbol) / NULLIF(LAG(close, 1) OVER by_symbol, 0))
                * 1e9 / NULLIF(close * volume, 0) AS amihud_component,

            -- Price to 52w High/Low
            close / NULLIF(MAX(high) OVER trailing_364d, 0) AS price_to_52w_high,
            close / NULLIF(MAX(close) OVER trailing_364d, 0) AS price_to_52w_close_high,
            close / NULLIF(MIN(low) OVER trailing_364d, 0) AS price_to_52w_low,
            close / NULLIF(MIN(close) OVER trailing_364d, 0) AS price_to_52w_close_low,

            -- Log Indicators
            log(NULLIF(close, 0)) AS log_price,
            log(NULLIF(volume, 0)) AS log_volume,

            -- Date Indicators
            dayofweek(date) AS dayofweek,
            dayofmonth(date) AS dayofmonth,
            dayofyear(date) AS dayofyear,
            month(date) AS month
        FROM pricing
        WINDOW
            by_symbol AS (PARTITION BY act_symbol ORDER BY date),
            trailing_21d AS (PARTITION BY act_symbol ORDER BY date RANGE BETWEEN INTERVAL '21' DAY PRECEDING AND CURRENT ROW),
            trailing_29d AS (PARTITION BY act_symbol ORDER BY date RANGE BETWEEN INTERVAL '29' DAY PRECEDING AND CURRENT ROW),
            trailing_364d AS (PARTITION BY act_symbol ORDER BY date RANGE BETWEEN INTERVAL '364' DAY PRECEDING AND CURRENT ROW)
    ),
    lottery AS (
        SELECT
            *,
            MAX(daily_return) OVER (PARTITION BY act_symbol ORDER BY date RANGE BETWEEN INTERVAL '30' DAY PRECEDING AND CURRENT ROW) AS lottery
        FROM technicals
    ),
    amihud AS (
        SELECT
            *,
            AVG(amihud_component) OVER (PARTITION BY act_symbol ORDER BY date RANGE BETWEEN INTERVAL '30' DAY PRECEDING AND CURRENT ROW) AS amihud
        FROM lottery
    ),
    vpt1 AS (
        SELECT
            *,
            SUM(daily_component) OVER trailing_21d AS three_week_sum,
            AVG(volume) OVER trailing_21d AS avg_volume_3weeks
        FROM amihud
        WINDOW
            trailing_21d AS (PARTITION BY act_symbol ORDER BY date RANGE BETWEEN INTERVAL '21' DAY PRECEDING AND CURRENT ROW)
    ),
    vpt2 AS (
        SELECT
            *,
            AVG(three_week_sum) OVER (PARTITION BY act_symbol ORDER BY date RANGE BETWEEN INTERVAL '14' DAY PRECEDING AND CURRENT ROW) AS ema_approx
        FROM vpt1
    ),
    vpt3 AS (
        SELECT
            *,
            ema_approx / NULLIF(avg_volume_3weeks, 0) AS vpt
        FROM vpt2
    )
    -- Drop the raw volume and the intermediate building blocks
    SELECT * EXCLUDE (volume, daily_return, amihud_component, daily_component, three_week_sum, avg_volume_3weeks, ema_approx)
    FROM vpt3
);
"""

con.sql(query)

In [5]:
# Show the freshly built temp table; the relation is the cell's last
# expression, so the notebook renders it.
preview = con.sql("select * from tmp_feat_technicals")
preview

┌────────────┬────────────┬───────────────────────┬──────────────────────┬──────────────────────┬───────────────────────┬──────────────────────┬─────────────────────┬────────────────────┬──────────────┬─────────────────────┬─────────────────────┬────────────────────┬────────────────────┬────────────────────┬─────────────────────────┬────────────────────┬────────────────────────┬────────────────────┬────────────────────┬───────────┬────────────┬───────────┬───────┬──────────────────────┬────────────────────┬───────────────────────┐
│    date    │ act_symbol │      momentum_1m      │     momentum_3m      │     momentum_6m      │      momentum_9m      │     momentum_12m     │   momentum_12m_1m   │    momentum_5y     │ momentum_10y │    vol_vo_p_12m     │     vol_vo_p_1m     │         wr         │      adtv_30d      │ price_to_52w_high  │ price_to_52w_close_high │  price_to_52w_low  │ price_to_52w_close_low │     log_price      │     log_volume     │ dayofweek │ dayofmonth │ dayofyear │ mo

Unpivot the technical features (wide -> long) and store them in the `features_v_mo` table

In [7]:
# Reshape tmp_feat_technicals from one column per feature into long format:
# every column except date and act_symbol becomes an (id, value) pair, and
# act_symbol is exposed as `symbol`. The result replaces table features_v_mo.
unpivot_sql = """
CREATE OR REPLACE TABLE features_v_mo AS (
    with cte as (
        UNPIVOT tmp_feat_technicals
        ON COLUMNS(* EXCLUDE (date, act_symbol))
        INTO
            NAME id
            VALUE value
    )
    select 
        date_trunc('day',date) as date, 
        act_symbol as symbol,
        id,
        value
    from cte
)
"""
con.sql(unpivot_sql)


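Append the new features to the existing `features_raw` table (the table must already exist; the next section creates it from scratch)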

In [None]:
# Append the long-format technical features to the existing features_raw table.
# Only rows not already present are inserted (EXCEPT compares whole rows and
# matches columns by position), so re-running this cell no longer duplicates
# data - consistent with the de-duplicating UNION used when features_raw is
# built from scratch. EXCEPT also collapses exact duplicate rows inside
# features_v_mo itself.
# NOTE(review): features_raw must already exist for this INSERT to succeed.
con.sql("""
INSERT INTO features_raw
    SELECT * FROM features_v_mo
    EXCEPT
    SELECT * FROM features_raw
""")

Create (or rebuild) the `features_raw` table from `features_v_ta` and `features_v_mo`

In [None]:
# Rebuild features_raw as the de-duplicated combination of the two long-format
# feature tables. UNION matches columns by position and drops duplicate rows.
# NOTE(review): features_v_ta is not created in this notebook - presumably it
# is produced elsewhere; confirm it exists before running this cell.
rebuild_sql = """
CREATE OR REPLACE TABLE features_raw AS (
    select * from features_v_ta 
    union
    select * from features_v_mo
)
"""
con.sql(rebuild_sql)

# Close Database Connection

In [11]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()