# Import Modules

In [1]:
%load_ext sql

In [2]:
%config SqlMagic.displaylimit = 50

In [3]:
import duckdb
import polars as pl
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import joblib 

# Connect to Local Database

In [4]:

# Create connection to local duckdb database
con = duckdb.connect('../db/duck.db')
%sql con --alias duckdb

# Load Model

In [5]:
# Load model
model_loaded = joblib.load("model_final1.pkl")

In [6]:
# Ensure it's fitted
if not hasattr(model_loaded, "feature_importances_"):
    raise ValueError("Model is not fitted. Ensure you loaded the correct model.")

# Backtest

In [7]:
# Step 1: Load Data from DuckDB
start_date = '2024-01-01'
end_date = '2024-01-01'
query = "SELECT * FROM features_final WHERE date BETWEEN ? AND ?"
df = con.execute(query, [start_date, end_date]).fetchdf()
    
df

Unnamed: 0,date,symbol,adtv_30d,amihud,fwd_return_3m,log_volume,lottery,momentum_10y,momentum_12m,momentum_12m_1m,...,price_to_52w_high,vol_vo_p_12m,volatility_bbw,volatility_dcw,volatility_kcw,volatility_ui,volume_adi,volume_nvi,volume_obv,volume_vpt
0,2024-01-01,NEOV,-0.391740,0.418367,-1.248475,-0.635041,0.116286,,-0.937286,-0.596747,...,-1.664681,0.102616,0.290209,0.809562,1.482162,0.947509,-0.218304,-0.578907,-0.418923,-0.270512
1,2024-01-01,ZTEK,-0.396434,1.224232,0.516254,-0.444988,0.230973,,-0.635664,-0.195280,...,-0.727692,0.052033,-0.212964,0.567094,1.007433,1.120739,-0.227693,-0.452275,-0.428160,-0.275635
2,2024-01-01,MLAB,-0.381657,-0.258114,0.083046,-0.365434,0.018362,-0.235769,-1.042500,-1.333866,...,-1.093566,-0.704015,-0.052110,0.420954,0.097023,0.628883,-0.209371,0.058751,-0.410130,-0.275762
3,2024-01-01,AVDE,-0.297523,-0.261826,0.114986,0.049658,-0.648199,,0.121809,0.184980,...,0.837750,-0.871159,-0.683002,-0.732420,-0.748378,-0.543094,-0.133743,0.098885,-0.418289,-0.277138
4,2024-01-01,EEMS,-0.387685,-0.257295,-0.043091,0.052033,-0.639424,-0.267030,0.270560,0.400822,...,0.800540,0.158409,-0.773943,-0.716705,-0.776195,-0.341425,-0.219224,0.770908,-0.404816,-0.278529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7163,2024-01-01,IMPP,-0.060585,-0.241275,0.677227,0.962893,1.023570,,-0.823268,-1.992487,...,-1.270159,0.071573,2.853921,1.913339,1.635103,-0.107444,-0.567669,-0.695129,-0.032346,1.594709
7164,2024-01-01,NR,-0.006563,-0.259281,0.254976,0.953286,-0.369390,-0.547271,0.775584,1.123474,...,0.343625,-0.725477,-0.343700,-0.174039,-0.157155,0.260823,-0.253229,-0.623262,0.390144,-0.091145
7165,2024-01-01,ITRI,-0.275809,-0.261776,0.847672,0.251768,-0.257367,0.080574,0.664662,0.602512,...,0.635960,-0.870535,-0.236788,-0.406949,-0.443903,-0.302742,0.015202,-0.142990,-0.277563,-0.296717
7166,2024-01-01,CSTR,-0.354907,-0.255837,0.191472,-0.218170,-0.055178,,0.046516,-0.014765,...,0.709350,-0.135787,-0.278562,-0.353265,-0.501186,-0.402758,-0.203515,-0.388791,-0.392208,-0.272426


In [None]:
def quarterly_backtest(model, con, start_date, end_date):
    """
    Backtest a quarterly rebalancing strategy based on predicted 3M forward returns.

    Parameters:
    - model: Trained model to generate predictions.
    - con: DuckDB connection to fetch data.
    - start_date (str): Backtest start date (YYYY-MM-DD).
    - end_date (str): Backtest end date (YYYY-MM-DD).

    Returns:
    - DataFrame with ordered quarterly portfolio returns and cumulative return.
    - Sharpe Ratio of the strategy.
    """

    # Step 1: Load Data from DuckDB
    query = """
        SELECT a.*, b.value as actual_return, c.value as actual_adtv_30d
        FROM features_final a
            join features_cleaned b on a.symbol = b.symbol and a.date = b.date
            join features_cleaned c on a.symbol = c.symbol and a.date = c.date
        WHERE a.date BETWEEN ? AND ?
        AND b.id = 'fwd_return_3m'
        -- Minimum 30-day average daily volume of 500,000 shares --
        AND c.id = 'adtv_30d' and c.value >= 500000 
        """
    df = con.execute(query, [start_date, end_date]).fetchdf()

    # Step 2: Define Features and Target
    cols_to_remove = ['date', 'symbol', 'fwd_return_3m', 'actual_return', 'actual_adtv_30d']
    features = [col for col in df.columns if col not in cols_to_remove]

    # Step 3: Convert dates to quarterly periods
    df['date'] = pd.to_datetime(df['date'])
    df['quarter'] = df['date'].dt.to_period('Q')  # Convert to quarters

    # Step 4: Get all quarterly rebalancing dates (start of each quarter) and sort them
    quarterly_rebalancing_dates = sorted(df['date'].drop_duplicates().dt.to_period('Q').unique().tolist())

    # Step 5: Backtest loop
    quarterly_returns = []

    for quarter in quarterly_rebalancing_dates:
        # Step 5.1: Get data available **at the start of the quarter**
        rebalance_date = df[df['quarter'] == quarter]['date'].min()
        df_rebalance = df[df['date'] == rebalance_date].copy()

        # Step 5.2: Generate predictions for the quarter
        X_quarter = df_rebalance[features]
        df_rebalance['pred_return_3m'] = model.predict(X_quarter)

        # Step 5.3: Rank stocks and form long-short portfolio
        df_rebalance['rank'] = df_rebalance['pred_return_3m'].rank(pct=True)
        df_rebalance['long'] = df_rebalance['rank'] >= 0.996  # Top 10%
        df_rebalance['short'] = df_rebalance['rank'] <= 0.004 # Bottom 10%

        # Step 5.4: Compute portfolio return using fwd_return_3m
        long_returns = df_rebalance[df_rebalance['long']]['actual_return'].mean()
        short_returns = df_rebalance[df_rebalance['short']]['actual_return'].mean()
        portfolio_return = long_returns #- short_returns

        # Store the return for this quarter
        quarterly_returns.append({'quarter': quarter, 'return': portfolio_return})

        print(f"Quarter: {quarter}, Portfolio Return: {portfolio_return:.2f} | Longs: {long_returns:.2f} | Shorts: {short_returns:.2f} | Universe: {df_rebalance['rank'].count()} | L_Portfolio Size: {df_rebalance[df_rebalance['long']]['actual_return'].count()} | S_Portfolio Size: {df_rebalance[df_rebalance['short']]['actual_return'].count()}")

    # Step 6: Convert to DataFrame and **SORT BY QUARTER**
    portfolio_returns = pd.DataFrame(quarterly_returns).sort_values(by='quarter')

    # Step 7: Compute Cumulative Returns **In Correct Order**
    portfolio_returns['cumulative_return'] = ((1 + portfolio_returns['return']).cumprod() - 1).round(3)*100
    # Convert to rounded decimals (6 decimal places)
    # portfolio_returns['cumulative_return'] = portfolio_returns['cumulative_return'].round(6)

    # Step 8: Compute Sharpe Ratio
    # sharpe_ratio = portfolio_returns['return'].mean() / portfolio_returns['return'].std()
    periods_per_year = 4  # Quarterly rebalancing
    sharpe_ratio = (portfolio_returns['return'].mean() * periods_per_year) / (portfolio_returns['return'].std() * np.sqrt(periods_per_year))

    # Display results
    print(f"Sharpe Ratio: {sharpe_ratio:.2f}")

    return portfolio_returns


# Example Usage:
backtest_results = quarterly_backtest(model_loaded, con, '2024-01-01', '2024-12-31')
backtest_results


Quarter: 2024Q1, Portfolio Return: 0.12 | Longs: 0.12 | Shorts: -0.50 | Universe: 2381 | L_Portfolio Size: 10 | S_Portfolio Size: 9
Quarter: 2024Q2, Portfolio Return: -0.11 | Longs: -0.11 | Shorts: 0.37 | Universe: 2361 | L_Portfolio Size: 10 | S_Portfolio Size: 9
Quarter: 2024Q3, Portfolio Return: 0.24 | Longs: 0.24 | Shorts: 0.33 | Universe: 1912 | L_Portfolio Size: 8 | S_Portfolio Size: 7
Quarter: 2024Q4, Portfolio Return: 0.24 | Longs: 0.24 | Shorts: -0.05 | Universe: 2403 | L_Portfolio Size: 10 | S_Portfolio Size: 9
Sharpe Ratio: 0.74


Unnamed: 0,quarter,return,cumulative_return
0,2024Q1,0.12368,12.4
1,2024Q2,-0.111512,-0.2
2,2024Q3,0.238647,23.7
3,2024Q4,0.240993,53.5


In [22]:
query_test ="""select * from features_final where date = '2024-01-01'"""
# Load data into pandas dataframe
df_test = con.execute(query_test).fetchdf()

# Specify features and target
columns = df_test.columns.tolist()
cols_to_remove = ['date', 'symbol', 'fwd_return_3m']
features = [x for i, x in enumerate(columns) if x not in cols_to_remove] 
target = "fwd_return_3m"

df_test_copy = df_test.copy()

# filter for a single date to predict
# df_test_copy = df_test_copy[df_test_copy['date'] == '2024-06-04']

# Extract features only (exclude 'symbol' and 'date')
X_future = df_test_copy[features]  # Ensure feature set matches training

# Predict 3-month forward returns
predicted_returns = model_loaded.predict(X_future)

# Reattach symbol & date for ranking and analysis
df_test_copy['pred_return_3m'] = predicted_returns

# Display ranked results
ranked_stocks = df_test_copy[['date', 'symbol', 'pred_return_3m', 'fwd_return_3m']].sort_values(by='pred_return_3m', ascending=False)

con.sql("""
with cte as (
    select a.date, a.symbol, a.value as fwd_return_3m, b.pred_return_3m, c.value as adtv_30d
    from features_cleaned a 
    JOIN ranked_stocks b on a.symbol = b.symbol and a.date = b.date
    join features_cleaned c on a.symbol = c.symbol and a.date = c.date
    where a.date = b.date and a.id = 'fwd_return_3m' and c.id = 'adtv_30d' and c.value >= 500000 --1000000
    order by b.pred_return_3m desc
    limit 24
)
select * from cte
--select mean(fwd_return_3m) as avg_fwd_return_3m from cte
""")

┌────────────┬─────────┬──────────────────────┬────────────────┬────────────────────┐
│    date    │ symbol  │    fwd_return_3m     │ pred_return_3m │      adtv_30d      │
│    date    │ varchar │        double        │     float      │       double       │
├────────────┼─────────┼──────────────────────┼────────────────┼────────────────────┤
│ 2024-01-01 │ GORO    │  0.13157894736842102 │      0.8582997 │  700650.6666666666 │
│ 2024-01-01 │ NVDA    │   0.8245628205645973 │     0.53321457 │  365922355.2380952 │
│ 2024-01-01 │ AKBA    │   0.4758064516129033 │      0.5133724 │ 1437664.9523809524 │
│ 2024-01-01 │ SMH     │    0.286612912449248 │     0.42112744 │  6420282.380952381 │
│ 2024-01-01 │ KOLD    │    0.625350067420392 │      0.4187115 │  2791461.523809524 │
│ 2024-01-01 │ ESPR    │  -0.1036789297658863 │      0.4138413 │  6381032.380952381 │
│ 2024-01-01 │ AMZN    │  0.18717914966434118 │     0.40252605 │  45747993.23809524 │
│ 2024-01-01 │ ALLK    │  -0.5384615384615384 │     0.

In [135]:
print(ranked_stocks.head(10)['fwd_return_3m'].mean())
ranked_stocks.head(10)

-0.836387393840484


Unnamed: 0,date,symbol,pred_return_3m,fwd_return_3m
7999,2024-10-16,SLDB,1.157893,-2.023374
647,2024-10-16,JSPR,1.073208,-2.226284
2994,2024-10-16,INMB,0.90385,1.073613
6096,2024-10-16,NKTR,0.881827,-1.139597
5980,2024-10-16,BHAT,0.850366,-2.226284
5540,2024-10-16,AFRM,0.826801,0.618056
136,2024-10-16,VKTX,0.818007,-1.555718
7785,2024-10-16,QRTEA,0.780452,-1.686447
243,2024-10-16,TMF,0.728992,-1.121058
6456,2024-10-16,HOOD,0.724249,1.92322


In [140]:
%%sql
with cte as (
    select a.*, b.pred_return_3m from features_cleaned a 
    JOIN ranked_stocks b on a.symbol = b.symbol and a.date = b.date
    where a.date = b.date and a.id = 'fwd_return_3m'
    order by b.pred_return_3m desc
    limit 10
)
select * from cte
--select mean(value) from cte

date,symbol,id,value,pred_return_3m
2024-10-16,SLDB,fwd_return_3m,-0.5278174037089871,1.157893180847168
2024-10-16,JSPR,fwd_return_3m,-0.632013201320132,1.0732077360153198
2024-10-16,INMB,fwd_return_3m,0.2552783109404991,0.9038504362106324
2024-10-16,NKTR,fwd_return_3m,-0.3043478260869565,0.8818271160125732
2024-10-16,BHAT,fwd_return_3m,-0.7857142857142857,0.8503660559654236
2024-10-16,AFRM,fwd_return_3m,0.140087554721701,0.8268014192581177
2024-10-16,VKTX,fwd_return_3m,-0.4095671981776764,0.8180065751075745
2024-10-16,QRTEA,fwd_return_3m,-0.442622950819672,0.7804520130157471
2024-10-16,TMF,fwd_return_3m,-0.2996601208459214,0.7289919853210449
2024-10-16,HOOD,fwd_return_3m,0.4701076865948757,0.7242491841316223


In [80]:
ranked_stocks.tail(11)

Unnamed: 0,date,symbol,pred_return_3m,fwd_return_3m
4831,2024-03-01,NUTX,-1.4868,-2.612419
4250,2024-03-01,AGRI,-1.563337,-1.207246
5367,2024-03-01,WHLR,-1.56791,-2.212786
3697,2024-03-01,MNTS,-1.600049,-1.159813
5181,2024-03-01,JTAI,-1.617811,-1.386998
752,2024-03-01,NVVE,-1.631785,-1.15905
7030,2024-03-01,VLCN,-1.701566,-2.612419
1363,2024-03-01,BCAN,-1.730342,-2.612419
688,2024-03-01,CYN,-1.752074,-2.010314
4260,2024-03-01,APVO,-1.763409,-2.612419


In [126]:
ranked_stocks.shape

(7200, 4)

In [61]:
%%sql
Select b.security_name, a.* from ranked_stocks a
join symbol b on a.act_symbol = b.act_symbol
order by expected_return_1m desc
--where a.act_symbol = 'AAPL' order by a.date desc limit 10

security_name,act_symbol,date,volume,expected_return_1m,return_1m,return_3m
Fisker Inc. Class A Common Stock,FSR,2024-03-18 00:00:00,291820461.0,0.1962310671806335,,
"JOANN, Inc. - common stock",JOAN,2024-03-18 00:00:00,11823665.0,0.1925007849931717,,
Golden Minerals Company Common Stock,AUMN,2024-03-18 00:00:00,252749.0,0.1687045991420745,1.2999999999999998,0.5000000000000001
"LivePerson, Inc. - Common Stock",LPSN,2024-03-18 00:00:00,2936407.0,0.1658003181219101,-0.43,-0.42
Ashford Inc. (Holding Company) Common Stock,AINC,2024-03-18 00:00:00,120290000.0,0.1653087139129638,1.5287958115183249,1.544502617801047
Sunnova Energy International Inc. Common Stock,NOVA,2024-03-18 00:00:00,6733783.0,0.1539326012134552,-0.1123853211009174,0.2545871559633026
"Amylyx Pharmaceuticals, Inc. - Common Stock",AMLX,2024-03-18 00:00:00,5940575.0,0.1447795182466507,-0.2972027972027972,-0.4125874125874126
Sequans Communications S.A. American Depositary Shares (each representing ten (10) Ordinary Shares),SQNS,2024-03-18 00:00:00,397011.2,0.1435878723859787,0.1836734693877549,0.2244897959183672
"Outset Medical, Inc. - Common Stock",OM,2024-03-18 00:00:00,1156939.0,0.1434604227542877,0.04,0.955
"RAPT Therapeutics, Inc. - Common Stock",RAPT,2024-03-18 00:00:00,796013.0,0.128068059682846,-0.0240096038415365,-0.5822328931572629


# Close Database Connection

In [25]:
con.close()