<a href="https://colab.research.google.com/github/jarl24-dev/stock-markets-analytics-zoomcamp/blob/main/03-modelling/Homework3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Preparation

In [48]:
!pip install yfinance



In [49]:
# read files shared via google-drive-link
# https://stackoverflow.com/questions/62759748/downloading-data-from-a-shared-google-drive-link-in-google-colab

!pip uninstall gdown -y && pip install gdown
!gdown -V

Found existing installation: gdown 5.2.0
Uninstalling gdown-5.2.0:
  Successfully uninstalled gdown-5.2.0
Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Using cached gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0
gdown 5.2.0 at /usr/local/lib/python3.11/dist-packages


In [50]:
# IMPORTS
import numpy as np
import pandas as pd

#Fin Data Sources
import yfinance as yf
import pandas_datareader as pdr

#Data viz
import plotly.graph_objs as go
import plotly.graph_objects as go
import plotly.express as px

import time
from datetime import date

# for graphs
import matplotlib.pyplot as plt

# 0) Dataset for Modeling: Final Preparations

## 0.1) Importing data from Drive & defining variable sets
* automated version need to have a daily updated file/database entries

In [51]:
# https://stackoverflow.com/questions/62759748/downloading-data-from-a-shared-google-drive-link-in-google-colab
# truncated data from Module 2: https://drive.google.com/file/d/1mb0ae2M5AouSDlqcUnIwaHq7avwGNrmB/view?usp=sharing
!gdown https://drive.google.com/file/d/1mb0ae2M5AouSDlqcUnIwaHq7avwGNrmB/view?usp=sharing --fuzzy -O /content/


Downloading...
From (original): https://drive.google.com/uc?id=1mb0ae2M5AouSDlqcUnIwaHq7avwGNrmB
From (redirected): https://drive.google.com/uc?id=1mb0ae2M5AouSDlqcUnIwaHq7avwGNrmB&confirm=t&uuid=d81b0e84-aea7-47ce-ab19-fbfeb470333c
To: /content/stocks_df_combined_2025_06_13.parquet.brotli
100% 130M/130M [00:01<00:00, 88.2MB/s]


In [52]:
# truncated
# df = pd.read_parquet("/content/stocks_df_combined_trunc_2014_2023.parquet.brotli", )

# full dataset for 33 stocks
df_full = pd.read_parquet("/content/stocks_df_combined_2025_06_13.parquet.brotli", )


In [53]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 230262 entries, 0 to 5700
Columns: 203 entries, Open to growth_btc_usd_365d
dtypes: datetime64[ns](3), float64(129), int32(64), int64(5), object(2)
memory usage: 302.2+ MB


In [54]:
df_full.keys()

Index(['Open', 'High', 'Low', 'Close_x', 'Volume', 'Dividends', 'Stock Splits',
       'Ticker', 'Year', 'Month',
       ...
       'growth_brent_oil_7d', 'growth_brent_oil_30d', 'growth_brent_oil_90d',
       'growth_brent_oil_365d', 'growth_btc_usd_1d', 'growth_btc_usd_3d',
       'growth_btc_usd_7d', 'growth_btc_usd_30d', 'growth_btc_usd_90d',
       'growth_btc_usd_365d'],
      dtype='object', length=203)

In [55]:
df_full[['Year','Month','Weekday']].dtypes

Unnamed: 0,0
Year,int32
Month,datetime64[ns]
Weekday,int32


In [56]:
for i in df_full.keys():
  print(i)

Open
High
Low
Close_x
Volume
Dividends
Stock Splits
Ticker
Year
Month
Weekday
Date
growth_1d
growth_3d
growth_7d
growth_30d
growth_90d
growth_365d
growth_future_30d
SMA10
SMA20
growing_moving_average
high_minus_low_relative
volatility
is_positive_growth_30d_future
ticker_type
index_x
adx
adxr
apo
aroon_1
aroon_2
aroonosc
bop
cci
cmo
dx
macd
macdsignal
macdhist
macd_ext
macdsignal_ext
macdhist_ext
macd_fix
macdsignal_fix
macdhist_fix
mfi
minus_di
mom
plus_di
dm
ppo
roc
rocp
rocr
rocr100
rsi
slowk
slowd
fastk
fastd
fastk_rsi
fastd_rsi
trix
ultosc
willr
index_y
ad
adosc
obv
atr
natr
ht_dcperiod
ht_dcphase
ht_phasor_inphase
ht_phasor_quadrature
ht_sine_sine
ht_sine_leadsine
ht_trendmod
avgprice
medprice
typprice
wclprice
index
cdl2crows
cdl3blackrows
cdl3inside
cdl3linestrike
cdl3outside
cdl3starsinsouth
cdl3whitesoldiers
cdlabandonedbaby
cdladvancedblock
cdlbelthold
cdlbreakaway
cdlclosingmarubozu
cdlconcealbabyswall
cdlcounterattack
cdldarkcloudcover
cdldoji
cdldojistar
cdldragonflydoji


In [57]:
# growth indicators (but not future growth)
GROWTH = [g for g in df_full.keys() if (g.find('growth_')==0)&(g.find('future')<0)]
GROWTH

['growth_1d',
 'growth_3d',
 'growth_7d',
 'growth_30d',
 'growth_90d',
 'growth_365d',
 'growth_dax_1d',
 'growth_dax_3d',
 'growth_dax_7d',
 'growth_dax_30d',
 'growth_dax_90d',
 'growth_dax_365d',
 'growth_snp500_1d',
 'growth_snp500_3d',
 'growth_snp500_7d',
 'growth_snp500_30d',
 'growth_snp500_90d',
 'growth_snp500_365d',
 'growth_dji_1d',
 'growth_dji_3d',
 'growth_dji_7d',
 'growth_dji_30d',
 'growth_dji_90d',
 'growth_dji_365d',
 'growth_epi_1d',
 'growth_epi_3d',
 'growth_epi_7d',
 'growth_epi_30d',
 'growth_epi_90d',
 'growth_epi_365d',
 'growth_gold_1d',
 'growth_gold_3d',
 'growth_gold_7d',
 'growth_gold_30d',
 'growth_gold_90d',
 'growth_gold_365d',
 'growth_wti_oil_1d',
 'growth_wti_oil_3d',
 'growth_wti_oil_7d',
 'growth_wti_oil_30d',
 'growth_wti_oil_90d',
 'growth_wti_oil_365d',
 'growth_brent_oil_1d',
 'growth_brent_oil_3d',
 'growth_brent_oil_7d',
 'growth_brent_oil_30d',
 'growth_brent_oil_90d',
 'growth_brent_oil_365d',
 'growth_btc_usd_1d',
 'growth_btc_usd_3d',


In [58]:
# leaving only Volume ==> generate ln(Volume)
OHLCV = ['Open','High','Low','Close','Adj Close_x','Volume']

In [59]:
CATEGORICAL = ['Month', 'Weekday', 'Ticker', 'ticker_type']

In [60]:
TO_PREDICT = [g for g in df_full.keys() if (g.find('future')>=0)]
TO_PREDICT

['growth_future_30d', 'is_positive_growth_30d_future']

In [61]:
TO_DROP = ['Year','Date','index_x', 'index_y', 'index', 'Quarter','Adj Close_y'] + CATEGORICAL + OHLCV
TO_DROP

['Year',
 'Date',
 'index_x',
 'index_y',
 'index',
 'Quarter',
 'Adj Close_y',
 'Month',
 'Weekday',
 'Ticker',
 'ticker_type',
 'Open',
 'High',
 'Low',
 'Close',
 'Adj Close_x',
 'Volume']

In [62]:
# let's define on more custom numerical features
df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))

  df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))


In [63]:
# manually defined features
CUSTOM_NUMERICAL = ['SMA10', 'SMA20', 'growing_moving_average', 'high_minus_low_relative','volatility', 'ln_volume']

In [64]:
# All Supported Ta-lib indicators: https://github.com/TA-Lib/ta-lib-python/blob/master/docs/funcs.md

TECHNICAL_INDICATORS = ['adx', 'adxr', 'apo', 'aroon_1','aroon_2', 'aroonosc',
 'bop', 'cci', 'cmo','dx', 'macd', 'macdsignal', 'macdhist', 'macd_ext',
 'macdsignal_ext', 'macdhist_ext', 'macd_fix', 'macdsignal_fix',
 'macdhist_fix', 'mfi', 'minus_di', 'mom', 'plus_di', 'dm', 'ppo',
 'roc', 'rocp', 'rocr', 'rocr100', 'rsi', 'slowk', 'slowd', 'fastk',
 'fastd', 'fastk_rsi', 'fastd_rsi', 'trix', 'ultosc', 'willr',
 'ad', 'adosc', 'obv', 'atr', 'natr', 'ht_dcperiod', 'ht_dcphase',
 'ht_phasor_inphase', 'ht_phasor_quadrature', 'ht_sine_sine', 'ht_sine_leadsine',
 'ht_trendmod', 'avgprice', 'medprice', 'typprice', 'wclprice']

In [65]:
TECHNICAL_PATTERNS = [g for g in df_full.keys() if g.find('cdl')>=0]
print(f'Technical patterns count = {len(TECHNICAL_PATTERNS)}, examples = {TECHNICAL_PATTERNS[0:5]}')


Technical patterns count = 61, examples = ['cdl2crows', 'cdl3blackrows', 'cdl3inside', 'cdl3linestrike', 'cdl3outside']


In [66]:
MACRO = ['gdppot_us_yoy', 'gdppot_us_qoq', 'cpi_core_yoy', 'cpi_core_mom', 'FEDFUNDS',
 'DGS1', 'DGS5', 'DGS10']

In [67]:
NUMERICAL = GROWTH + TECHNICAL_INDICATORS + TECHNICAL_PATTERNS + CUSTOM_NUMERICAL + MACRO

In [68]:
# CHECK: NO OTHER INDICATORS LEFT
OTHER = [k for k in df_full.keys() if k not in OHLCV + CATEGORICAL + NUMERICAL + TO_DROP]
OTHER

['Close_x',
 'Dividends',
 'Stock Splits',
 'growth_future_30d',
 'is_positive_growth_30d_future',
 'Close_y']

In [69]:
df_full.Ticker.nunique()

33

In [70]:
# tickers, min-max date, count of daily observations
df_full.groupby(['Ticker'])['Date'].agg(['min','max','count'])

Unnamed: 0_level_0,min,max,count
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,1980-12-12,2025-06-13,11217
ACN,2001-07-19,2025-06-13,6012
AMZN,1997-05-15,2025-06-13,7065
ASML,1995-03-15,2025-06-13,7614
AVGO,2009-08-06,2025-06-13,3989
BHARTIARTL.NS,2002-07-01,2025-06-13,5698
BRK-B,1996-05-09,2025-06-13,7322
CDI.PA,1992-01-27,2025-06-13,8610
GOOG,2004-08-19,2025-06-13,5239
HDB,2001-07-20,2025-06-13,6011


In [71]:
# truncated df_full with 25 years of data (and defined growth variables)
df = df_full[df_full.Date>='2000-01-01']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191795 entries, 3490 to 5700
Columns: 204 entries, Open to ln_volume
dtypes: datetime64[ns](3), float64(130), int32(64), int64(5), object(2)
memory usage: 253.1+ MB


In [72]:
# let look at the features count and df size:
df[NUMERICAL].info()

<class 'pandas.core.frame.DataFrame'>
Index: 191795 entries, 3490 to 5700
Columns: 184 entries, growth_1d to DGS10
dtypes: float64(121), int32(62), int64(1)
memory usage: 225.3 MB


## 0.2) [Code snippet 1] Generating dummies

In [73]:
# what are the categorical features?
CATEGORICAL

['Month', 'Weekday', 'Ticker', 'ticker_type']

In [74]:
# dummy variables are not generated from Date and numeric variables
df.loc[:,'Month'] = df.Month.dt.strftime('%B')
df.loc[:,'Weekday'] = df.Weekday.astype(str)

  df.loc[:,'Month'] = df.Month.dt.strftime('%B')
  df.loc[:,'Weekday'] = df.Weekday.astype(str)


In [75]:
# Generate dummy variables (no need for bool, let's have int32 instead)
dummy_variables = pd.get_dummies(df[CATEGORICAL], dtype='int32')

In [76]:
# TODO 1: define more categorical features, e.g. all combinations for <September+weekday>  (you'll see that September is actually an important dummy in one of the models)

In [77]:
dummy_variables.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191795 entries, 3490 to 5700
Data columns (total 55 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Month_April           191795 non-null  int32
 1   Month_August          191795 non-null  int32
 2   Month_December        191795 non-null  int32
 3   Month_February        191795 non-null  int32
 4   Month_January         191795 non-null  int32
 5   Month_July            191795 non-null  int32
 6   Month_June            191795 non-null  int32
 7   Month_March           191795 non-null  int32
 8   Month_May             191795 non-null  int32
 9   Month_November        191795 non-null  int32
 10  Month_October         191795 non-null  int32
 11  Month_September       191795 non-null  int32
 12  Weekday_0             191795 non-null  int32
 13  Weekday_1             191795 non-null  int32
 14  Weekday_2             191795 non-null  int32
 15  Weekday_3             191795 non-null 

In [78]:
# get dummies names in a list
DUMMIES = dummy_variables.keys().to_list()

In [79]:
DUMMIES

['Month_April',
 'Month_August',
 'Month_December',
 'Month_February',
 'Month_January',
 'Month_July',
 'Month_June',
 'Month_March',
 'Month_May',
 'Month_November',
 'Month_October',
 'Month_September',
 'Weekday_0',
 'Weekday_1',
 'Weekday_2',
 'Weekday_3',
 'Weekday_4',
 'Weekday_5',
 'Weekday_6',
 'Ticker_AAPL',
 'Ticker_ACN',
 'Ticker_AMZN',
 'Ticker_ASML',
 'Ticker_AVGO',
 'Ticker_BHARTIARTL.NS',
 'Ticker_BRK-B',
 'Ticker_CDI.PA',
 'Ticker_GOOG',
 'Ticker_HDB',
 'Ticker_HINDUNILVR.NS',
 'Ticker_IBN',
 'Ticker_IDEXY',
 'Ticker_INFY',
 'Ticker_ITC.NS',
 'Ticker_JPM',
 'Ticker_LICI.NS',
 'Ticker_LLY',
 'Ticker_LT.NS',
 'Ticker_MC.PA',
 'Ticker_META',
 'Ticker_MSFT',
 'Ticker_NVDA',
 'Ticker_NVO',
 'Ticker_OR.PA',
 'Ticker_RELIANCE.NS',
 'Ticker_RMS.PA',
 'Ticker_SAP',
 'Ticker_SBIN.NS',
 'Ticker_SIE.DE',
 'Ticker_TCS.NS',
 'Ticker_TTE',
 'Ticker_V',
 'ticker_type_EU',
 'ticker_type_INDIA',
 'ticker_type_US']

In [80]:
# Concatenate the dummy variables with the original DataFrame
df_with_dummies = pd.concat([df, dummy_variables], axis=1)

In [81]:
df_with_dummies[NUMERICAL+DUMMIES].info()

<class 'pandas.core.frame.DataFrame'>
Index: 191795 entries, 3490 to 5700
Columns: 239 entries, growth_1d to ticker_type_US
dtypes: float64(121), int32(117), int64(1)
memory usage: 265.6 MB


## 0.3) [Code Snippet 2] Correlation analysis
* first approximation of "important" variables correlated with all variables we want to predict (TO_PREDICT)

In [82]:
TO_PREDICT

['growth_future_30d', 'is_positive_growth_30d_future']

In [83]:
corr_is_positive_growth_30d_future = df_with_dummies[NUMERICAL+DUMMIES+TO_PREDICT].corr()['is_positive_growth_30d_future']

In [84]:
# create a dataframe for an easy way to sort
corr_is_positive_growth_30d_future_df = pd.DataFrame(corr_is_positive_growth_30d_future)

In [85]:
corr_is_positive_growth_30d_future_df.sort_values(by='is_positive_growth_30d_future').head(5)

Unnamed: 0,is_positive_growth_30d_future
DGS10,-0.067204
DGS5,-0.059812
gdppot_us_yoy,-0.058374
gdppot_us_qoq,-0.058125
growth_brent_oil_365d,-0.056158


In [86]:
corr_is_positive_growth_30d_future_df.sort_values(by='is_positive_growth_30d_future').tail(8)

Unnamed: 0,is_positive_growth_30d_future
growth_btc_usd_7d,0.028577
Month_November,0.033807
Month_October,0.03541
growth_future_30d,0.696468
is_positive_growth_30d_future,1.0
cdl3starsinsouth,
cdlconcealbabyswall,
cdlmathold,


In [87]:
corr_growth_future_30d = df_with_dummies[NUMERICAL+DUMMIES+TO_PREDICT].corr()['growth_future_30d']

In [88]:
corr_growth_future_30d_df = pd.DataFrame(corr_growth_future_30d)

In [89]:
corr_growth_future_30d_df.sort_values(by='growth_future_30d').head(5)

Unnamed: 0,growth_future_30d
growth_brent_oil_365d,-0.084665
growth_dji_365d,-0.07594
growth_dax_365d,-0.060016
growth_wti_oil_365d,-0.055917
growth_snp500_365d,-0.055443


In [90]:
corr_growth_future_30d_df.sort_values(by='growth_future_30d').tail(8)

Unnamed: 0,growth_future_30d
ln_volume,0.052015
Ticker_NVDA,0.052434
Month_October,0.054752
is_positive_growth_30d_future,0.696468
growth_future_30d,1.0
cdl3starsinsouth,
cdlconcealbabyswall,
cdlmathold,


## 0.4) [Code snippet 3] Temporal split of ~25 years of data (by date)

In [91]:
def temporal_split(df, min_date, max_date, train_prop=0.7, val_prop=0.15, test_prop=0.15):
    """
    Splits a DataFrame into three buckets based on the temporal order of the 'Date' column.

    Args:
        df (DataFrame): The DataFrame to split.
        min_date (str or Timestamp): Minimum date in the DataFrame.
        max_date (str or Timestamp): Maximum date in the DataFrame.
        train_prop (float): Proportion of data for training set (default: 0.6).
        val_prop (float): Proportion of data for validation set (default: 0.2).
        test_prop (float): Proportion of data for test set (default: 0.2).

    Returns:
        DataFrame: The input DataFrame with a new column 'split' indicating the split for each row.
    """
    # Define the date intervals
    train_end = min_date + pd.Timedelta(days=(max_date - min_date).days * train_prop)
    val_end = train_end + pd.Timedelta(days=(max_date - min_date).days * val_prop)

    # Assign split labels based on date ranges
    split_labels = []
    for date in df['Date']:
        if date <= train_end:
            split_labels.append('train')
        elif date <= val_end:
            split_labels.append('validation')
        else:
            split_labels.append('test')

    # Add 'split' column to the DataFrame
    df['split'] = split_labels

    return df

In [92]:
min_date_df = df_with_dummies.Date.min()
max_date_df = df_with_dummies.Date.max()

df_with_dummies = temporal_split(df_with_dummies,
                                 min_date = min_date_df,
                                 max_date = max_date_df)

In [93]:
df_with_dummies['split'].value_counts()/len(df_with_dummies)

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,0.676399
test,0.163758
validation,0.159843


In [94]:
# remove the "segmentation" problem (warning message on df performance after many joins and data transformations)
new_df = df_with_dummies.copy()

# Q1: Dummies for Month and Week-of-Month

In [95]:
new_df['Month_WoM']=new_df['Month']+'_w'+(new_df['Date'].apply(lambda d: (d.day - 1) // 7 + 1)).astype(str)

In [96]:
new_dummy_variables = pd.get_dummies(new_df['Month_WoM'], dtype='int32')

In [97]:
new_dummy_variables.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191795 entries, 3490 to 5700
Data columns (total 60 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   April_w1      191795 non-null  int32
 1   April_w2      191795 non-null  int32
 2   April_w3      191795 non-null  int32
 3   April_w4      191795 non-null  int32
 4   April_w5      191795 non-null  int32
 5   August_w1     191795 non-null  int32
 6   August_w2     191795 non-null  int32
 7   August_w3     191795 non-null  int32
 8   August_w4     191795 non-null  int32
 9   August_w5     191795 non-null  int32
 10  December_w1   191795 non-null  int32
 11  December_w2   191795 non-null  int32
 12  December_w3   191795 non-null  int32
 13  December_w4   191795 non-null  int32
 14  December_w5   191795 non-null  int32
 15  February_w1   191795 non-null  int32
 16  February_w2   191795 non-null  int32
 17  February_w3   191795 non-null  int32
 18  February_w4   191795 non-null  int32
 19  Februa

In [98]:
NEW_DUMMIES = new_dummy_variables.keys().to_list()

In [99]:
# Concatenate the dummy variables with the original DataFrame
new_df_with_dummies = pd.concat([new_df, new_dummy_variables], axis=1)

In [100]:
corr_is_positive_growth_30d_future = new_df_with_dummies[NEW_DUMMIES+TO_PREDICT].corr()['is_positive_growth_30d_future']

In [101]:
corr_is_positive_growth_30d_future_df = pd.DataFrame(corr_is_positive_growth_30d_future)

In [102]:
corr_is_positive_growth_30d_future_df['abs_corr'] = corr_is_positive_growth_30d_future_df['is_positive_growth_30d_future'].abs()

In [103]:
corr_is_positive_growth_30d_future_df.sort_values(by='abs_corr', ascending=False).head(5)

Unnamed: 0,is_positive_growth_30d_future,abs_corr
is_positive_growth_30d_future,1.0,1.0
growth_future_30d,0.696468,0.696468
October_w4,0.024968,0.024968
November_w3,0.022097,0.022097
November_w2,0.018822,0.018822


# Q2: Define New "Hand" Rules on Macro and Technical Indicator Variables

In [104]:
# remove the "segmentation" problem (warning message on df performance after many joins and data transformations)
new_df = new_df_with_dummies.copy()

In [105]:
for g in df_full.keys():
  if (g.find('DGS')==0):
    print(g)

DGS1
DGS5
DGS10


In [106]:
for g in df_full.keys():
  if (g.find('FEDFUNDS')>=0):
    print(g)

FEDFUNDS


In [107]:
# generate manual predictions
# Let's label all prediction features with prefix "pred"
new_df['pred0_manual_cci'] = (new_df.cci>200).astype(int)
new_df['pred1_manual_prev_g1'] = (new_df.growth_30d>1).astype(int)
new_df['pred2_manual_prev_g1_and_snp'] = ((new_df['growth_30d'] > 1) & (new_df['growth_snp500_30d'] > 1)).astype(int)

In [108]:
new_df['pred3_manual_dgs10_5'] = ((new_df.DGS10 <= 4) & (new_df.DGS5 <= 1)).astype(int)
new_df['pred4_manual_dgs10_fedfunds'] = ((new_df.DGS10 > 4) & (new_df.FEDFUNDS <= 4.795)).astype(int)

In [111]:
new_df['pred3_manual_dgs10_5'].value_counts()

Unnamed: 0_level_0,count
pred3_manual_dgs10_5,Unnamed: 1_level_1
0,165557
1,26238


In [112]:
new_df['pred4_manual_dgs10_fedfunds'].value_counts()

Unnamed: 0_level_0,count
pred4_manual_dgs10_fedfunds,Unnamed: 1_level_1
0,155524
1,36271


In [113]:
# check "Precision" : the percentage of "correct" predictions , WHEN we predict "1" (POSITIVE future growth)
new_df['is_correct_prediction'] = (new_df.pred4_manual_dgs10_fedfunds == new_df.is_positive_growth_30d_future)
filter = (new_df.split=='test') & (new_df.pred4_manual_dgs10_fedfunds==1)
new_df[filter].is_correct_prediction.value_counts()

Unnamed: 0_level_0,count
is_correct_prediction,Unnamed: 1_level_1
False,3020
True,2640


In [114]:
# %% of correct predictions : 46.6%
new_df[filter].is_correct_prediction.value_counts() / len(new_df[filter])

Unnamed: 0_level_0,count
is_correct_prediction,Unnamed: 1_level_1
False,0.533569
True,0.466431


In [115]:
# check "Precision" : the percentage of "correct" predictions , WHEN we predict "1" (POSITIVE future growth)
new_df['is_correct_prediction'] = (new_df.pred3_manual_dgs10_5 == new_df.is_positive_growth_30d_future)
filter = (new_df.split=='test') & (new_df.pred3_manual_dgs10_5==1)

# %% of correct predictions : 57.974%
new_df[filter].is_correct_prediction.value_counts() / len(new_df[filter])

Unnamed: 0_level_0,count
is_correct_prediction,Unnamed: 1_level_1
True,0.579739
False,0.420261


In [116]:
# delete this column
del new_df["is_correct_prediction"]

In [118]:
PREDICTIONS = [k for k in new_df.keys() if k.startswith('pred')]
PREDICTIONS

['pred0_manual_cci',
 'pred1_manual_prev_g1',
 'pred2_manual_prev_g1_and_snp',
 'pred3_manual_dgs10_5',
 'pred4_manual_dgs10_fedfunds']

In [119]:
# generate columns is_correct_
for pred in PREDICTIONS:
  part1 = pred.split('_')[0] # first prefix before '_'
  new_df[f'is_correct_{part1}'] =  (new_df[pred] == new_df.is_positive_growth_30d_future).astype(int)

In [120]:
# IS_CORRECT dataset
IS_CORRECT =  [k for k in new_df.keys() if k.startswith('is_correct_')]
IS_CORRECT

['is_correct_pred0',
 'is_correct_pred1',
 'is_correct_pred2',
 'is_correct_pred3',
 'is_correct_pred4']

In [121]:
len(new_df[new_df.split=='test'])

31408

In [122]:
# define "Precision" for ALL predictions on a Test dataset (~4 last years of trading)
for i,column in enumerate(IS_CORRECT):
  prediction_column = PREDICTIONS[i]
  is_correct_column = column
  filter = (new_df.split=='test') & (new_df[prediction_column]==1)
  print(f'Prediction column:{prediction_column} , is_correct_column: {is_correct_column}')
  print(new_df[filter][is_correct_column].value_counts())
  print(new_df[filter][is_correct_column].value_counts()/len(new_df[filter]))

  print('---------')

Prediction column:pred0_manual_cci , is_correct_column: is_correct_pred0
is_correct_pred0
1    443
0    351
Name: count, dtype: int64
is_correct_pred0
1    0.557935
0    0.442065
Name: count, dtype: float64
---------
Prediction column:pred1_manual_prev_g1 , is_correct_column: is_correct_pred1
is_correct_pred1
1    9748
0    8243
Name: count, dtype: int64
is_correct_pred1
1    0.541826
0    0.458174
Name: count, dtype: float64
---------
Prediction column:pred2_manual_prev_g1_and_snp , is_correct_column: is_correct_pred2
is_correct_pred2
1    6984
0    6383
Name: count, dtype: int64
is_correct_pred2
1    0.522481
0    0.477519
Name: count, dtype: float64
---------
Prediction column:pred3_manual_dgs10_5 , is_correct_column: is_correct_pred3
is_correct_pred3
1    578
0    419
Name: count, dtype: int64
is_correct_pred3
1    0.579739
0    0.420261
Name: count, dtype: float64
---------
Prediction column:pred4_manual_dgs10_fedfunds , is_correct_column: is_correct_pred4
is_correct_pred4
0    30

# Q3: Unique Correct Predictions from a 10-Level Decision Tree Classifier

In [123]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [124]:
# Decision Tree doesn't like too large and inf. values
import numpy as np

def remove_infinite_values(X):
    """
    Remove infinite values from the input array.

    Parameters:
    - X: Input array (NumPy array or array-like)

    Returns:
    - Array with infinite values removed
    """
    return X[np.isfinite(X).all(axis=1)]

# Example usage:
# Assuming X is your input data
# filtered_X = remove_infinite_values(X)

In [125]:
# look carefully for 'count' to be close to total values (or you need to replace NaNs/remove NaNs), and min/max doesn't equal to -+inf.
#  it will give you an idea to dig deeper into some features to understand the 'nature' of a problem
pd.set_option('display.max_rows', None)

new_df[NUMERICAL+DUMMIES+NEW_DUMMIES].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
growth_1d,191780.0,1.000859,0.02452123,0.230055,0.9906947,1.000476,1.010665,4.374775
growth_3d,191750.0,1.002521,0.03993495,0.242517,0.9842335,1.002102,1.02013,4.446675
growth_7d,191690.0,1.00578,0.0585267,0.2326554,0.9773162,1.005334,1.033346,4.599882
growth_30d,191345.0,1.024514,0.1175381,0.2461313,0.9617043,1.021615,1.083333,5.179689
growth_90d,190445.0,1.075941,0.2187319,0.1890118,0.9571314,1.060631,1.173104,5.97005
growth_365d,186035.0,1.357578,0.6215502,0.09783037,1.019063,1.244345,1.533772,9.819904
growth_dax_1d,188637.0,1.000281,0.01405046,0.8776139,0.9939059,1.000773,1.00713,1.11402
growth_dax_3d,188637.0,1.000911,0.02396851,0.8374862,0.9894033,1.002214,1.01391,1.144124
growth_dax_7d,188637.0,1.002258,0.03562933,0.7318924,0.9850482,1.004862,1.021715,1.23246
growth_dax_30d,188637.0,1.009702,0.07119843,0.6263172,0.9743297,1.015596,1.051303,1.288371


In [126]:
# Split the data into training and testing sets based on the split date
features_list = NUMERICAL+DUMMIES+NEW_DUMMIES
to_predict = 'is_positive_growth_30d_future'

train_df = new_df[new_df.split.isin(['train','validation'])].copy(deep=True)
test_df = new_df[new_df.split.isin(['test'])].copy(deep=True)

# ONLY numerical Separate features and target variable for training and testing sets
# need Date and Ticker later when merging predictions to the dataset
X_train = train_df[features_list+[to_predict,'Date','Ticker']]
X_test = test_df[features_list+[to_predict,'Date','Ticker']]

print(f'length: X_train {X_train.shape},  X_test {X_test.shape}')

length: X_train (160387, 302),  X_test (31408, 302)


In [127]:
# Can't have +-inf values . E.g. ln(volume)=-inf when volume==0 => substitute with 0

# Disable SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Need to fill NaNs somehow
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

print(f'length: X_train_imputed {X_train.shape},  X_test_imputed {X_test.shape}')

length: X_train_imputed (160387, 302),  X_test_imputed (31408, 302)


In [128]:
# you may want to remove 1-2% outliers based on percentile ==> not used here in Decision Trees
def remove_outliers_percentile(X, lower_percentile=1, upper_percentile=99):
    """
    Remove outliers from the input array based on percentiles.

    Parameters:
    - X: Input array (NumPy array or array-like)
    - lower_percentile: Lower percentile threshold (float, default=1)
    - upper_percentile: Upper percentile threshold (float, default=99)

    Returns:
    - Array with outliers removed
    """
    lower_bound = np.percentile(X, lower_percentile, axis=0)
    upper_bound = np.percentile(X, upper_percentile, axis=0)
    mask = np.logical_and(np.all(X >= lower_bound, axis=1), np.all(X <= upper_bound, axis=1))
    return X[mask]

# Example usage:
# Assuming X is your input data
# filtered_X = remove_outliers_percentile(X, lower_percentile=1, upper_percentile=99)

In [129]:
X_train_imputed = X_train # we won't use outliers removal to save more data to train: remove_outliers_percentile(X_train)
X_test_imputed = X_test # we won't use outliers removal to save more data to test: remove_outliers_percentile(X_test)

In [130]:
# same shape
print(f'length: X_train_imputed {X_train_imputed.shape},  X_test_imputed {X_test_imputed.shape}')

length: X_train_imputed (160387, 302),  X_test_imputed (31408, 302)


In [131]:
y_train = X_train_imputed[to_predict]
y_test = X_test_imputed[to_predict]

# remove y_train, y_test from X_ dataframes
del X_train_imputed[to_predict]
del X_test_imputed[to_predict]

In [133]:
# estimation/fit function (using dataframe of features X and what to predict y) --> optimising total accuracy
# max_depth is hyperParameter
def fit_decision_tree(X, y, max_depth=20):
# Initialize the Decision Tree Classifier
  clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)

  # Fit the classifier to the training data
  clf.fit(X, y)
  return clf, X.columns

In [134]:
%%time
clf_10, train_columns = fit_decision_tree(X=X_train_imputed.drop(['Date','Ticker'],axis=1),
                           y=y_train,
                           max_depth=10)

CPU times: user 29.1 s, sys: 249 ms, total: 29.4 s
Wall time: 32.3 s


In [136]:
def predict_decision_tree(clf:DecisionTreeClassifier, df_X:pd.DataFrame, y_true: pd.Series):
  # Predict the target variable on the test data
  y_pred = clf.predict(df_X)

  max_depth = clf.tree_.max_depth
  # Print the maximum depth
  print("Maximum depth of the decision tree:", max_depth)

  # Calculate the accuracy/precision of the model
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  print(f'Accuracy ={accuracy}, precision = {precision}')

  # resulting df
  result_df = pd.concat([df_X, y_true, pd.Series(y_pred, index=df_X.index, name='pred_')], axis=1)

  return result_df

In [137]:
# ONLY numerical Separate features and target variable for training and testing sets
# need Date and Ticker later when merging predictions to the dataset
X_all = new_df[features_list+[to_predict,'Date','Ticker']]

# Can't have +-inf values . E.g. ln(volume)=-inf when volume==0 => substitute with 0

# Disable SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

X_all.replace([np.inf, -np.inf], np.nan, inplace=True)

# Need to fill NaNs somehow
X_all.fillna(0, inplace=True)

print(f'length: X_train_imputed {X_all.shape}')

X_all_imputed = X_all # we won't use outliers removal to save more data to train: remove_outliers_percentile(X_train)

# same shape
print(f'length: X_train_imputed {X_all_imputed.shape}')

y_all = X_all_imputed[to_predict]

# remove y_train, y_test from X_ dataframes
del X_all_imputed[to_predict]

pred5_clf_10 = predict_decision_tree(clf_10, X_all_imputed.drop(['Date','Ticker'],axis=1), y_all)

length: X_train_imputed (191795, 302)
length: X_train_imputed (191795, 302)
Maximum depth of the decision tree: 10
Accuracy =0.6958314867436586, precision = 0.6950666328571831


In [138]:
new_df['pred5_clf_10'] = pred5_clf_10['pred_']

In [139]:
PREDICTIONS = [k for k in new_df.keys() if k.startswith('pred')]
PREDICTIONS

['pred0_manual_cci',
 'pred1_manual_prev_g1',
 'pred2_manual_prev_g1_and_snp',
 'pred3_manual_dgs10_5',
 'pred4_manual_dgs10_fedfunds',
 'pred5_clf_10']

In [140]:
# IS_CORRECT dataset
IS_CORRECT =  [k for k in new_df.keys() if k.startswith('is_correct_')]
IS_CORRECT

['is_correct_pred0',
 'is_correct_pred1',
 'is_correct_pred2',
 'is_correct_pred3',
 'is_correct_pred4']

In [141]:
new_df['is_correct_pred5'] = (new_df.pred5_clf_10 == new_df.is_positive_growth_30d_future).astype(int)

In [142]:
new_df['only_pred5_is_correct'] = (
    (new_df['is_correct_pred5'] == 1) &
    (new_df[IS_CORRECT].sum(axis=1) == 0)
).astype(int)


In [146]:
Positive_correct_pred = new_df[(new_df['split'] == 'test') & (new_df['only_pred5_is_correct'] == 1)].shape[0]

print(f'Unique correct predictions by pred5_clf_10 in test set:{Positive_correct_pred}')

Unique correct predictions by pred5_clf_10 in test set:3770


# Q4: Hyperparameter tuning for a Decision Tree

## Data Preparation

In [153]:
# Split the data into training and testing sets based on the split date
features_list = NUMERICAL+DUMMIES+NEW_DUMMIES
to_predict = 'is_positive_growth_30d_future'

train_df = new_df[new_df.split.isin(['train','validation'])].copy(deep=True)
test_df = new_df[new_df.split.isin(['test'])].copy(deep=True)

# ONLY numerical Separate features and target variable for training and testing sets
# need Date and Ticker later when merging predictions to the dataset
X_train = train_df[features_list+[to_predict,'Date','Ticker']]
X_test = test_df[features_list+[to_predict,'Date','Ticker']]

print(f'length: X_train {X_train.shape},  X_test {X_test.shape}')

# Can't have +-inf values . E.g. ln(volume)=-inf when volume==0 => substitute with 0

# Disable SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Need to fill NaNs somehow
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

print(f'length: X_train_imputed {X_train.shape},  X_test_imputed {X_test.shape}')

X_train_imputed = X_train # we won't use outliers removal to save more data to train: remove_outliers_percentile(X_train)
X_test_imputed = X_test # we won't use outliers removal to save more data to test: remove_outliers_percentile(X_test)

# same shape
print(f'length: X_train_imputed {X_train_imputed.shape},  X_test_imputed {X_test_imputed.shape}')

y_train = X_train_imputed[to_predict]
y_test = X_test_imputed[to_predict]

# remove y_train, y_test from X_ dataframes
del X_train_imputed[to_predict]
del X_test_imputed[to_predict]

length: X_train (160387, 302),  X_test (31408, 302)
length: X_train_imputed (160387, 302),  X_test_imputed (31408, 302)
length: X_train_imputed (160387, 302),  X_test_imputed (31408, 302)


## Best mode

In [152]:
for depth in range(1,21):
  clf,_ = fit_decision_tree(X=X_train_imputed.drop(['Date','Ticker'],axis=1),
                                            y=y_train,
                                            max_depth=depth)
  y_pred = clf.predict(X_test_imputed.drop(['Date','Ticker'],axis=1))
  # Calculate the accuracy/precision of the model
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  print(f'Max_depth: {depth} - Accuracy ={accuracy}, precision = {precision}')

Max_depth: 1 - Accuracy =0.5410404992358635, precision = 0.5465816507261778
Max_depth: 2 - Accuracy =0.5510697911360163, precision = 0.5510697911360163
Max_depth: 3 - Accuracy =0.5510697911360163, precision = 0.5510697911360163
Max_depth: 4 - Accuracy =0.5510697911360163, precision = 0.5510697911360163
Max_depth: 5 - Accuracy =0.5991467142129394, precision = 0.627845220030349
Max_depth: 6 - Accuracy =0.5498599083036169, precision = 0.5691415110800907
Max_depth: 7 - Accuracy =0.5701732042791645, precision = 0.5938485804416404
Max_depth: 8 - Accuracy =0.5762226184411615, precision = 0.5897136702270891
Max_depth: 9 - Accuracy =0.5642193071828834, precision = 0.5915457349446327
Max_depth: 10 - Accuracy =0.5571510443199185, precision = 0.5889278426037361
Max_depth: 11 - Accuracy =0.5625318390219053, precision = 0.5947726306842329
Max_depth: 12 - Accuracy =0.5486500254712176, precision = 0.5788757932910245
Max_depth: 13 - Accuracy =0.5547631176770249, precision = 0.5828431861230187
Max_depth