# Import Modules

In [1]:
import duckdb
import pandas as pd
from ta import add_all_ta_features
from ta.utils import dropna
%load_ext sql

# Connect to Local Database

In [2]:

# Create connection to local duckdb database
con = duckdb.connect('../db/duck.db')
%sql con --alias duckdb

# Create Features: Single Example

In [None]:
# Preview the NVDA rows of `pricing`, ordered by date
nvda_preview_sql = "select * FROM pricing where act_symbol = 'NVDA' order by date"
con.sql(nvda_preview_sql)

In [5]:
# Step 1: load the NVDA rows of `pricing` (ordered by date) into a pandas DataFrame
df = con.sql(
    "SELECT * FROM pricing where act_symbol = 'NVDA' order by date"
).fetchdf()
df

Unnamed: 0,date,act_symbol,open,high,low,close,volume
0,2011-01-03,NVDA,0.38800,0.39925,0.38750,0.39550,8.175126e+08
1,2011-01-04,NVDA,0.39625,0.39800,0.38550,0.39425,6.514246e+08
2,2011-01-05,NVDA,0.40000,0.42500,0.39750,0.42450,1.428213e+09
3,2011-01-06,NVDA,0.43500,0.48350,0.43425,0.48325,3.493309e+09
4,2011-01-07,NVDA,0.47750,0.49825,0.46700,0.49675,2.580377e+09
...,...,...,...,...,...,...,...
3659,2025-01-10,NVDA,137.45000,139.92000,134.22000,135.91000,2.042641e+08
3660,2025-01-13,NVDA,129.99000,133.49000,129.51000,133.23000,2.010247e+08
3661,2025-01-14,NVDA,136.05000,136.38000,130.05000,131.76000,1.922436e+08
3662,2025-01-15,NVDA,133.65000,136.45000,131.29000,136.24000,1.817240e+08


In [25]:
# Deprecated: the pricing table is already cleaned, so the NaN-dropping steps below are no longer needed.
# # show rows in df but not in dropna(df)
# df[~df.index.isin(dropna(df).index)]

# # Clean NaN values
# df = dropna(df)

In [6]:
# Compute every `ta` indicator for the single-symbol frame.
# fillna=False leaves the indicators' NaN values in place instead of filling them.
# add_all_ta_features adds its columns onto the frame it receives, so a copy is
# passed to keep the `df` displayed in the previous cell unchanged.
df_feat_single = add_all_ta_features(
    df.copy(), open="open", high="high", low="low", close="close", volume="volume", fillna=False)

# Make sure `date` is a pandas datetime; the SQL cells truncate it to day precision
df_feat_single['date'] = pd.to_datetime(df_feat_single['date'])

# Drop the raw price/volume inputs so only the derived features remain
df_feat_single = df_feat_single.drop(columns=['open', 'high', 'low', 'close', 'volume'])

df_feat_single

Unnamed: 0,date,act_symbol,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,volume_sma_em,volume_vpt,volume_vwap,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
0,2011-01-03,NVDA,2.956961e+08,8.175126e+08,,,,,,,...,,,,,,,,,,0.000000
1,2011-01-04,NVDA,5.562659e+08,1.660881e+08,,,-0.000003,,-2.058864e+06,,...,,,,,,,,-0.316056,-0.316556,-0.316056
2,2011-01-05,NVDA,1.932543e+09,1.594301e+09,,,0.000038,,1.075250e+08,,...,,,,,,,,7.672796,7.392678,7.332491
3,2011-01-06,NVDA,5.390388e+09,5.087610e+09,,,0.000067,,5.909924e+08,,...,,,,,,,,13.839812,12.962211,22.187105
4,2011-01-07,NVDA,7.723049e+09,7.667987e+09,,,0.000029,,6.630774e+08,,...,,,,,,,,2.793585,2.755276,25.600506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3659,2025-01-10,NVDA,8.020879e+10,1.691969e+11,-0.064124,-1.960743e+08,-10.283011,-1.060271,5.453850e+09,140.549006,...,0.382349,0.280138,0.102211,4.145783,0.606459,3.539324,138.114431,-2.997645,-3.043493,34264.096081
3660,2025-01-13,NVDA,8.038355e+10,1.689959e+11,-0.009488,-2.450274e+08,-11.027800,-2.477331,5.449886e+09,139.937854,...,0.007934,0.225697,-0.217763,3.274851,1.140137,2.134713,138.020354,-1.971893,-1.991594,33586.472819
3661,2025-01-14,NVDA,8.029517e+10,1.688036e+11,-0.063105,-2.503946e+08,5.646976,-2.073975,5.447765e+09,139.435660,...,-0.372472,0.106063,-0.478535,2.220783,1.356266,0.864517,137.943292,-1.103355,-1.109487,33214.791403
3662,2025-01-15,NVDA,8.046210e+10,1.689854e+11,0.036810,-9.832059e+07,1.859853,-1.748892,5.453943e+09,139.132983,...,-0.407339,0.003383,-0.410722,0.956553,1.276324,-0.319771,137.927593,3.400121,3.343595,34347.534766


Convert to vertical (long) format: unpivot the wide feature columns into (date, symbol, id, value) rows

In [10]:
%%sql
WITH long_feat AS (
    -- turn every feature column into an (id, value) pair
    UNPIVOT df_feat_single
    ON COLUMNS(* EXCLUDE (date, act_symbol))
    INTO
        NAME id
        VALUE value
)
SELECT
    date_trunc('day', date) AS date,
    act_symbol AS symbol,
    id,
    value
FROM long_feat


date,symbol,id,value
2011-01-03,NVDA,volume_adi,295696061.27659315
2011-01-03,NVDA,volume_obv,817512640.0
2011-01-03,NVDA,volume_nvi,1000.0
2011-01-03,NVDA,volatility_bbhi,0.0
2011-01-03,NVDA,volatility_bbli,0.0
2011-01-03,NVDA,volatility_kch,0.4058333333333334
2011-01-03,NVDA,volatility_kcl,0.3823333333333333
2011-01-03,NVDA,volatility_kcp,0.5602836879432603
2011-01-03,NVDA,volatility_kchi,0.0
2011-01-03,NVDA,volatility_kcli,0.0


Merge into features table

In [11]:
%%sql
WITH long_feat AS (
    -- turn every feature column into an (id, value) pair
    UNPIVOT df_feat_single
    ON COLUMNS(* EXCLUDE (date, act_symbol))
    INTO
        NAME id
        VALUE value
)
INSERT INTO features (date, symbol, id, value)
SELECT
    date_trunc('day', date) AS date,
    act_symbol AS symbol,
    id,
    value
FROM long_feat
-- rows whose (date, symbol, id) already exist get their value overwritten
ON CONFLICT (date, symbol, id) DO UPDATE SET
    value = EXCLUDED.value;


Count
310105


# Create Features: Bulk

In [None]:
# Select every row of the `pricing` table, sorted by symbol and then by date
all_pricing_sql = """
SELECT * FROM pricing 
order by act_symbol, date
"""

# Run the query and materialise the result as a pandas DataFrame
pricing_rel = con.sql(all_pricing_sql)
df = pricing_rel.fetchdf()

def process_symbol_group(group):
    """Compute all `ta` indicators for one symbol's price history.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single symbol, containing the ``open``, ``high``, ``low``,
        ``close`` and ``volume`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the indicator columns added. NaN values are
        left unfilled (``fillna=False``).
    """
    # groupby.apply does not support functions that mutate the group they are
    # given, and add_all_ta_features writes its columns onto its input frame,
    # so work on a copy of the group.
    return add_all_ta_features(
        group.copy(),
        open="open",
        high="high",
        low="low",
        close="close",
        volume="volume",
        fillna=False
    )

# Compute the indicators symbol by symbol and stitch the results back together.
# NOTE(review): this step was previously described as running only on groups
# with at least 50 rows, but no such filter is applied - every symbol is processed.
df_feat = (
    df.groupby("act_symbol", group_keys=False)
    .apply(process_symbol_group, include_groups=True)
    .reset_index(drop=True)
)

# save df_feat to csv
df_feat.to_csv('df_feat.csv', index=False)

# Preview the result; it has to be the cell's last expression to be displayed
df_feat.head()



  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return b

Store TA features in a temporary table (`tmp_feat_ta_lib`)

In [None]:
# Materialise the processed DataFrame as the TEMP table tmp_feat_ta_lib
create_tmp_feat_sql = "CREATE OR REPLACE TEMP TABLE tmp_feat_ta_lib AS SELECT * FROM df_feat"
con.sql(create_tmp_feat_sql)

In [None]:
# con.sql("CREATE INDEX idx_date ON features(date)");
# con.sql("CREATE INDEX idx_symbol ON features(symbol)");
# con.sql("CREATE INDEX idx_id ON features(id)");

Unpivot TA features (wide -> long) and store them in the `features_v_ta` table

In [None]:
# Unpivot tmp_feat_ta_lib into long (date, symbol, id, value) rows, leaving out
# the raw open/high/low/close/volume columns, and store the result as features_v_ta.
create_features_v_ta_sql = """
CREATE OR REPLACE TABLE features_v_ta AS (
    with cte as (
    UNPIVOT tmp_feat_ta_lib
    ON COLUMNS(* EXCLUDE (date, act_symbol))
    INTO
        NAME id
        VALUE value
    )
    select 
        date_trunc('day',date) as date, 
        act_symbol as symbol,
        id,
        value
        --value::DECIMAL(10, 4) as value
    from cte
    where id not in ('open', 'high', 'low', 'close', 'volume')
)
"""
con.sql(create_features_v_ta_sql)

Append into the `features_raw` table

In [None]:
# Append every row of features_v_ta to features_raw
append_features_sql = """
INSERT INTO features_raw
    SELECT * from features_v_ta
"""
con.sql(append_features_sql)


NOT WORKING (fails with the out-of-memory error shown below):
Upsert into the `features` table

In [None]:
%%sql
WITH cte AS (
    UNPIVOT tmp_feat_ta_lib
    ON COLUMNS(* EXCLUDE (date, act_symbol))
    INTO
        NAME id
        VALUE value
)
INSERT INTO features (date, id, symbol, value)
SELECT
    date_trunc('day', date) AS date,
    id,
    act_symbol AS symbol,
    value
FROM cte
-- tmp_feat_ta_lib still carries the raw price/volume columns; keep them out of
-- features, using the same filter as the features_v_ta build
WHERE id NOT IN ('open', 'high', 'low', 'close', 'volume')
-- rows whose (date, symbol, id) already exist get their value overwritten
ON CONFLICT (date, symbol, id) DO UPDATE SET
    value = EXCLUDED.value;


OutOfMemoryException: Out of Memory Error: could not allocate block of size 256.0 KiB (51.1 GiB/51.1 GiB used)

Unique symbols

In [None]:
# Number of distinct symbols present in features_raw
count_symbols_sql = "with cte as ( select distinct symbol from features_raw ) select count(*) from cte"
con.sql(count_symbols_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        11547 │
└──────────────┘

Unique TA features

In [None]:
# Number of distinct feature ids present in features_raw
count_feature_ids_sql = "with cte as ( select distinct id from features_raw ) select count(*) from cte"
con.sql(count_feature_ids_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│           91 │
└──────────────┘

# Close Database Connection

In [4]:
# Close the DuckDB connection opened at the top of the notebook
con.close()