# Imports and Installs

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 0 One DataFrame with Dummies

### 0.1 Read and truncate data (last 25 years from 2000), define variable sets


In [2]:
# full dataset for 33 stocks
df_full = pd.read_parquet("stocks_df_combined_2024_05_07.parquet.brotli")

In [3]:
# growth indicators (but not future growth)
GROWTH = [g for g in df_full.keys() if (g.find('growth_')==0)&(g.find('future')<0)]
# leaving only Volume ==> generate ln(Volume)
OHLCV = ['Open','High','Low','Close','Adj Close_x','Volume']
CATEGORICAL = ['Month', 'Weekday', 'Ticker', 'ticker_type']

# TODO HA4 Q3: Define a new variable 'is_strong_positive_growth_5d_future' according to the task
#  it will be automatically added the the list TO_PREDICT
df_full["is_strong_positive_growth_5d_future"] = (df_full["growth_future_5d"] >= 1.02).astype(int)


TO_PREDICT = [g for g in df_full.keys() if (g.find('future')>=0)]
TO_DROP = ['Year','Date','index_x', 'index_y', 'index', 'Quarter','Adj Close_y'] + CATEGORICAL + OHLCV

# let's define on more custom numerical features
df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))

# manually defined features
CUSTOM_NUMERICAL = ['SMA10', 'SMA20', 'growing_moving_average', 'high_minus_low_relative','volatility', 'ln_volume']
# All Supported Ta-lib indicators: https://github.com/TA-Lib/ta-lib-python/blob/master/docs/funcs.md
TECHNICAL_INDICATORS = ['adx', 'adxr', 'apo', 'aroon_1','aroon_2', 'aroonosc',
                        'bop', 'cci', 'cmo','dx', 'macd', 'macdsignal', 'macdhist', 'macd_ext',
                        'macdsignal_ext', 'macdhist_ext', 'macd_fix', 'macdsignal_fix',
                        'macdhist_fix', 'mfi', 'minus_di', 'mom', 'plus_di', 'dm', 'ppo',
                        'roc', 'rocp', 'rocr', 'rocr100', 'rsi', 'slowk', 'slowd', 'fastk',
                        'fastd', 'fastk_rsi', 'fastd_rsi', 'trix', 'ultosc', 'willr',
                        'ad', 'adosc', 'obv', 'atr', 'natr', 'ht_dcperiod', 'ht_dcphase',
                        'ht_phasor_inphase', 'ht_phasor_quadrature', 'ht_sine_sine', 'ht_sine_leadsine',
                        'ht_trendmod', 'avgprice', 'medprice', 'typprice', 'wclprice']
TECHNICAL_PATTERNS = [g for g in df_full.keys() if g.find('cdl')>=0]
MACRO = ['gdppot_us_yoy', 'gdppot_us_qoq', 'cpi_core_yoy', 'cpi_core_mom', 'FEDFUNDS',
         'DGS1', 'DGS5', 'DGS10']

NUMERICAL = GROWTH + TECHNICAL_INDICATORS + TECHNICAL_PATTERNS + CUSTOM_NUMERICAL + MACRO
# CHECK: NO OTHER INDICATORS LEFT
OTHER = [k for k in df_full.keys() if k not in OHLCV + CATEGORICAL + NUMERICAL + TO_DROP + TO_PREDICT]

# truncated df_full with 25 years of data (and defined growth variables)
df = df_full[df_full.Date>='2000-01-01'].copy()

  df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))


### 0.2 Dummies

In [4]:
# dummy variables can't be generated from Date and numeric variables ==> convert to STRING (to define groups for Dummies)
df.loc[:,'Month'] = df.Month.dt.strftime('%B')
df.loc[:,'Weekday'] = df.Weekday.astype(str)
# define week of month
df.loc[:,'wom'] = df.Date.apply(lambda d: (d.day-1)//7 + 1)
# convert to string
df.loc[:,'wom'] = df.loc[:,'wom'].astype(str)
df.loc[:,'month_wom'] = df.Month + '_w' + df.wom
# del wom temp variable
del df['wom']

CATEGORICAL.append('month_wom')
# Generate dummy variables (no need for bool, let's have int32 instead)
dummy_variables = pd.get_dummies(df[CATEGORICAL], dtype='int32')
# get dummies names in a list
DUMMIES = dummy_variables.keys().to_list()
# Concatenate the dummy variables with the original DataFrame
df_with_dummies = pd.concat([df, dummy_variables], axis=1)

  df.loc[:,'Month'] = df.Month.dt.strftime('%B')
  df.loc[:,'Weekday'] = df.Weekday.astype(str)
  df.loc[:,'wom'] = df.loc[:,'wom'].astype(str)


### 0.3 Temporal split

In [5]:
def temporal_split(df, min_date, max_date, train_prop=0.7, val_prop=0.15, test_prop=0.15):
    """
    Splits a DataFrame into three buckets based on the temporal order of the 'Date' column.

    Args:
        df (DataFrame): The DataFrame to split.
        min_date (str or Timestamp): Minimum date in the DataFrame.
        max_date (str or Timestamp): Maximum date in the DataFrame.
        train_prop (float): Proportion of data for training set (default: 0.7).
        val_prop (float): Proportion of data for validation set (default: 0.15).
        test_prop (float): Proportion of data for test set (default: 0.15).

    Returns:
        DataFrame: The input DataFrame with a new column 'split' indicating the split for each row.
    """
    # Define the date intervals
    train_end = min_date + pd.Timedelta(days=(max_date - min_date).days * train_prop)
    val_end = train_end + pd.Timedelta(days=(max_date - min_date).days * val_prop)

    # Assign split labels based on date ranges
    split_labels = []
    for date in df['Date']:
        if date <= train_end:
            split_labels.append('train')
        elif date <= val_end:
            split_labels.append('validation')
        else:
            split_labels.append('test')

    # Add 'split' column to the DataFrame
    df['split'] = split_labels

    return df

In [6]:
min_date_df = df_with_dummies.Date.min()
max_date_df = df_with_dummies.Date.max()

df_with_dummies = temporal_split(df_with_dummies,
                                 min_date = min_date_df,
                                 max_date = max_date_df)
# remove the "segmentation" problem (warning message on df performance after many joins and data transformations)
new_df = df_with_dummies.copy()

In [7]:
# EXACT DATES for the split:
# time split on train/validation/test: FIXED dates of split, approx. 70%, 15%, 15% split
new_df.groupby(['split'])['Date'].agg({'min','max','count'})

Unnamed: 0_level_0,min,max,count
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,2020-09-14,2024-05-07,29829
train,2000-01-03,2017-01-16,123458
validation,2017-01-17,2020-09-11,29388


### 0.4 Define dataframes for Modeling (ML) and cleane them

In [8]:
# Prepare a dataframe for ML:
# +-inf to NaN, all NaNs to 0s
def clean_dataframe_from_inf_and_nan(df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    return df

In [9]:
# new_df = clean_dataframe_from_inf_and_nan(new_df)
# features_list = NUMERICAL + DUMMIES
# to_predict = 'is_positive_growth_5d_future'
# 
# train_df = new_df[new_df.split.isin(['train'])].copy()
# valid_df = new_df[new_df.split.isin(['validation'])].copy()
# train_valid_df = new_df[new_df.split.isin(['train','validation'])].copy()
# test_df =  new_df[new_df.split.isin(['test'])].copy()
# 
# # ONLY numerical Separate features and target variable for training and testing sets
# X_train = train_df[features_list]
# X_valid = valid_df[features_list]
# X_train_valid = train_valid_df[features_list]
# X_test = test_df[features_list]
# # this to be used for predictions and join to the original dataframe new_df
# X_all =  new_df[features_list].copy()
# 
# y_train = train_df[to_predict]
# y_valid = valid_df[to_predict]
# y_train_valid = train_valid_df[to_predict]
# y_test = test_df[to_predict]
# y_all =  new_df[to_predict]

In [10]:
# Features to be used in predictions (incl. new dummies)
features_list = NUMERICAL + DUMMIES

# What we're trying to predict?
to_predict = 'is_positive_growth_5d_future'

train_df = new_df[new_df.split.isin(['train'])].copy()
valid_df = new_df[new_df.split.isin(['validation'])].copy()
train_valid_df = new_df[new_df.split.isin(['train','validation'])].copy()
test_df =  new_df[new_df.split.isin(['test'])].copy()

X_train = train_df[features_list].copy()
X_valid = valid_df[features_list].copy()
X_train_valid = train_valid_df[features_list].copy()
X_test = test_df[features_list].copy()
# this to be used for predictions and join to the original dataframe new_df
X_all =  new_df[features_list].copy()

y_train = train_df[to_predict]
y_valid = valid_df[to_predict]
y_train_valid = train_valid_df[to_predict]
y_test = test_df[to_predict]
y_all =  new_df[to_predict]

print(f'length: X_train {X_train.shape},  X_validation {X_valid.shape}, X_test {X_test.shape}, X_train_valid = {X_train_valid.shape},  all combined: X_all {X_all.shape}')

length: X_train (123458, 299),  X_validation (29388, 299), X_test (29829, 299), X_train_valid = (152846, 299),  all combined: X_all (182675, 299)


In [11]:
# Clean from +-inf and NaNs:
X_train = clean_dataframe_from_inf_and_nan(X_train)
X_valid = clean_dataframe_from_inf_and_nan(X_valid)
X_train_valid = clean_dataframe_from_inf_and_nan(X_train_valid)
X_test = clean_dataframe_from_inf_and_nan(X_test)
X_all = clean_dataframe_from_inf_and_nan(X_all)

In [12]:
# generate manual predictions
# Let's label all prediction features with prefix "pred"
new_df['pred0_manual_cci'] = (new_df.cci>200).astype(int)
new_df['pred1_manual_prev_g1'] = (new_df.growth_1d>1).astype(int)
new_df['pred2_manual_prev_g1_and_snp'] = ((new_df['growth_1d'] > 1) & (new_df['growth_snp500_1d'] > 1)).astype(int)
new_df['pred3_manual_gdp_and_fastd'] = ((new_df['gdppot_us_yoy'] <= 0.027) & (new_df['fastd'] >=0.251)).astype(int)
new_df['pred4_manual_gdp_and_wti30d'] = ((new_df['gdppot_us_yoy'] >= 0.027) & (new_df['growth_wti_oil_30d'] <= 1.005)).astype(int)

In [13]:
clf_10 = DecisionTreeClassifier(max_depth=10,
                                random_state=42)
clf_10.fit(X_train_valid, y_train_valid)
# predict on a full dataset
y_pred_all = clf_10.predict(X_all)
# defining a new prediction vector is easy now, as the dimensions will match
new_df['pred5_clf_10'] = y_pred_all

In [14]:
best_depth = 15
clf_best = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
clf_best.fit(X_train_valid, y_train_valid)
# predict on a full dataset
y_pred_clf_best = clf_best.predict(X_all)
# defining a new prediction vector is easy now, as the dimensions will match
new_df['pred6_clf_best'] = y_pred_clf_best

In [15]:
# adding Decision Tree predictors (clf_best) to the dataset for 2 new rules: Threshold = 0.66 and 0.78
y_pred_all_prob = clf_best.predict_proba(X_all)[:,1]

# defining a new prediction vector is easy now, as the dimensions will match
# new_df['proba_pred7'] = y_pred_all_class1_array
new_df['pred7_clf_best_rule_66'] = (y_pred_all_prob >= 0.66).astype(int)

# new_df['proba_pred8'] = y_pred_all_class1_array
new_df['pred8_clf_best_rule_78'] = (y_pred_all_prob >= 0.78).astype(int)

## Question 1 (2 points): Find the new global best CAGR with Random Forest tuning

**The idea**: You may have noticed that the Random Forest predictions provide close to maximum CAGR results ("1.1291" for pred9_rf_best_rule_55, "1.0923" for pred10_rf_best_rule_60) with less effort (average trades per day).

In this task, you are asked to define new predictors with custom threshold rules from 0.51 to 0.54 and from 0.56 to 0.59 to cover the full interval from 0.51 to 0.60. You should be able to observe one peak of financial performance and understand why we don't need to extend the interval with more predictions on other thresholds.


1) **Before defining new predictors:** modify the code to generate the dataframe `df_scores` for Random Forest classifier `rf_best` and not a Decision Tree (`clf_best`). Look at for precision/recall rates for different threshold values. You'll see that the precision rate goes up with a higher threshold, but recall (and total number of deals) goes down. However, don't be misled by this, as the financial simulation may show a different view (even with higher precision and fewer trades, you may obtain worse results).

2) Define new Random Forest predictions (with names `predxx_rf_best_rule_yy`) based on the missing thresholds to cover the full interval 0.51..0.60

3) Review the last dataframe with the simulation results (`df_sim1_results`).
Find the best simulation results. Do you see that one of the new predictors could deliver the new best CAGR?
Write down the new best CAGR value as an answer.

In [16]:
# Suboptimal Predictor with less (than optimal) Estimators (200) and lower Max_Depth (17)
# several minutes to compute (6 min)

rf_best = RandomForestClassifier(n_estimators = 200,
                                 max_depth = 17,
                                 random_state = 42,
                                 n_jobs = -1)
rf_best = rf_best.fit(X_train_valid, y_train_valid)

In [17]:
y_pred_all_prob = rf_best.predict_proba(X_all)[:,1]

# TODO HA4 Q1: define new predictor for thresholds 0.51..0.54, 0.56..0.59
for i, threshold in enumerate(np.arange(0.51, 0.61, 0.01), start=9):
    new_df[f"pred{i}_rf_best_rule{int(100*threshold)}"] = (y_pred_all_prob >= threshold).astype(int)

In [18]:
def get_simulate_result(df):

    sim_results = []
    PREDICTIONS = [k for k in df.keys() if k.startswith('pred')]
    OTHERS = ["split", "growth_future_5d", "Date"]
    sim_df = df[PREDICTIONS + OTHERS].copy()
    # sim_df = df.copy()
    # PREDICTIONS = [k for k in sim_df.keys() if k.startswith('pred')]

    # Iterate over all predictions
    for pred in PREDICTIONS:
        print(f'Calculating sumulation for prediction {pred}:')
        print(f"    Count times of investment {len(sim_df[(sim_df.split=='test')&(sim_df[pred]==1)])} out of {len(sim_df[(sim_df.split=='test')])} TEST records")

        # Prefix: e.g. pred1 or pred10
        pred_prefix= pred.split('_')[0]

        # Fin. result columns: define new records for EACH positive prediction
        sim_df['sim_gross_rev_'+pred_prefix] = sim_df[pred] * 100 * (sim_df['growth_future_5d']-1)
        sim_df['sim_fees_'+pred_prefix] = -sim_df[pred] * 100 * 0.002
        sim_df['sim_net_rev_'+pred_prefix] = sim_df['sim_gross_rev_'+pred_prefix] + sim_df['sim_fees_'+pred_prefix]

        # calculate agg. results for each PREDICTION columns (pred) on TEST
        filter_test_and_positive_pred = (sim_df.split=='test')&(sim_df[pred]==1) # filter records on TEST set, when current prediction is 1 (we invest $100 for 1 week ahead - 5 periods)
        sim_count_investments = len(sim_df[filter_test_and_positive_pred])
        sim_gross_rev = sim_df[filter_test_and_positive_pred]['sim_gross_rev_'+pred_prefix].sum()
        sim_fees = sim_df[filter_test_and_positive_pred]['sim_fees_'+pred_prefix].sum()
        sim_net_rev = sim_df[filter_test_and_positive_pred]['sim_net_rev_'+pred_prefix].sum()

        if sim_gross_rev>0:
            sim_fees_percentage = -sim_fees/sim_gross_rev
        else:
            sim_fees_percentage = None

        if sim_count_investments>0:
            sim_average_net_revenue = sim_net_rev/sim_count_investments
        else:
            sim_average_net_revenue = None

        # APPROXIMATE CAPITAL REQUIRED and CAGR Calculation
        df_investments_count_daily = pd.DataFrame(sim_df[filter_test_and_positive_pred].groupby('Date')[pred].count())
        sim_avg_investments_per_day = df_investments_count_daily[pred].mean()
        sim_q75_investments_per_day = df_investments_count_daily[pred].quantile(0.75)  # 75% case - how many $100 investments per day do we have?
        # df_investments_count_daily[pred].mean()
        sim_capital = 100 * 5 * sim_q75_investments_per_day # 5 days in a row with positive predictions
        # CAGR: average growth per year. E.g. if you have 1.5 return (50% growth in 4 years) --> (1.5)**(1/4) = 1.106 or 10.6% average
        sim_CAGR = ((sim_capital+sim_net_rev)/sim_capital)**(1/4)

        # append to DF
        sim_results.append((pred,sim_count_investments,sim_gross_rev,sim_fees,sim_net_rev,sim_fees_percentage,sim_average_net_revenue,sim_avg_investments_per_day,sim_capital,sim_CAGR))


        # output for all predictions with some positive predictions
        if  sim_count_investments>1:
            print(f"    Financial Result: \n {sim_df[filter_test_and_positive_pred][['sim_gross_rev_'+pred_prefix,'sim_fees_'+pred_prefix,'sim_net_rev_'+pred_prefix]].sum()}")
            print(f"        Count Investments in 4 years (on TEST): {sim_count_investments}")
            print(f"        Gross Revenue: ${int(sim_gross_rev)}")
            print(f"        Fees (0.2% for buy+sell): ${int(-sim_fees)}")
            print(f"        Net Revenue: ${int(sim_net_rev)}")
            print(f"        Fees are {int(-100.0*sim_fees/sim_gross_rev)} % from Gross Revenue")
            print(f"        Capital Required : ${int(sim_capital)} (Vbegin)")
            print(f"        Final value (Vbegin + Net_revenue) : ${int(sim_capital + sim_net_rev)} (Vfinal)")

            print(f"        Average CAGR on TEST (4 years) : {np.round(sim_CAGR,3)}, or {np.round(100.0*(sim_CAGR-1),1)}% ")

            print(f"        Average daily stats: ")
            print(f"            Average net revenue per investment: ${np.round(sim_net_rev/sim_count_investments,2)} ")
            print(f"            Average investments per day: {int(np.round(sim_avg_investments_per_day))} ")
            print(f"            Q75 investments per day: {int(np.round(sim_q75_investments_per_day))} ")
            print('=============================================+')


    # results in a DataFrame from an Array
    columns_simulation = ['prediction', 'sim_count_investments', 'sim_gross_rev', 'sim_fees', 'sim_net_rev', 'sim_fees_percentage','sim_average_net_revenue','sim_avg_investments_per_day','sim_capital','sim_CAGR']

    df_sim_results = pd.DataFrame(sim_results,columns=columns_simulation)
    df_sim_results['sim_growth_capital_4y'] = (df_sim_results.sim_net_rev+df_sim_results.sim_capital) / df_sim_results.sim_capital
    return sim_df, df_sim_results

In [19]:
sim_df, sim_results = get_simulate_result(new_df)
sim_results

Calculating sumulation for prediction pred0_manual_cci:
    Count times of investment 799 out of 29829 TEST records
    Financial Result: 
 sim_gross_rev_pred0    309.04589
sim_fees_pred0        -159.80000
sim_net_rev_pred0      149.84589
dtype: float64
        Count Investments in 4 years (on TEST): 799
        Gross Revenue: $309
        Fees (0.2% for buy+sell): $159
        Net Revenue: $149
        Fees are 51 % from Gross Revenue
        Capital Required : $1000 (Vbegin)
        Final value (Vbegin + Net_revenue) : $1149 (Vfinal)
        Average CAGR on TEST (4 years) : 1.036, or 3.6% 
        Average daily stats: 
            Average net revenue per investment: $0.19 
            Average investments per day: 2 
            Q75 investments per day: 2 
Calculating sumulation for prediction pred1_manual_prev_g1:
    Count times of investment 15601 out of 29829 TEST records
    Financial Result: 
 sim_gross_rev_pred1    6913.217124
sim_fees_pred1        -3120.200000
sim_net_rev_pred

Unnamed: 0,prediction,sim_count_investments,sim_gross_rev,sim_fees,sim_net_rev,sim_fees_percentage,sim_average_net_revenue,sim_avg_investments_per_day,sim_capital,sim_CAGR,sim_growth_capital_4y
0,pred0_manual_cci,799,309.04589,-159.8,149.84589,0.517075,0.187542,1.866822,1000.0,1.035523,1.149846
1,pred1_manual_prev_g1,15601,6913.217124,-3120.2,3812.817124,0.451338,0.244396,16.526483,11000.0,1.077237,1.34662
2,pred2_manual_prev_g1_and_snp,10455,4213.424501,-2091.0,2139.624501,0.496271,0.204651,21.918239,13000.0,1.038826,1.164587
3,pred3_manual_gdp_and_fastd,29822,14893.285316,-5964.4,8961.885316,0.400476,0.300513,31.491024,16500.0,1.114555,1.543145
4,pred4_manual_gdp_and_wti30d,0,0.0,0.0,0.0,,,,,,
5,pred5_clf_10,25644,15578.953119,-5128.8,10483.153119,0.329213,0.408796,31.311355,16500.0,1.130843,1.635343
6,pred6_clf_best,19828,14477.431798,-3965.6,10511.831798,0.273916,0.530151,24.509271,16500.0,1.131143,1.637081
7,pred7_clf_best_rule_66,8504,6977.651882,-1700.8,5276.851882,0.24375,0.620514,18.608315,16000.0,1.073858,1.329803
8,pred8_clf_best_rule_78,5419,4659.937647,-1083.8,3576.137647,0.232578,0.659926,15.891496,14500.0,1.056658,1.24663
9,pred9_rf_best_rule51,14315,10037.583629,-2863.0,7189.783629,0.285228,0.502255,15.730769,12000.0,1.124533,1.599149


In [20]:
print(f"The new best CAGR is {round(sim_results['sim_CAGR'].max(), 4)}")

The new best CAGR is 1.1376


## Question 2 (2 points): Less Features is More

**The idea**: Reduce the features list to help models perform better. You may have noticed in the lecture that some of the models (Logistic Regression, Deep Neural Network) are not trained well, likely due to too many features in the dataset. We know there were too many dummies (including ones from the exercise and individual stock dummies). In this task, you are asked to remove most of the dummies from the feature set.

1) **Define "DUMMIES_SHORT" set.** This set should include all dummies from the extended set (DUMMIES) but exclude all dummies starting from 'month_' (month_week_of_month dummies) and from 'Ticker' (individual ticker dummies). Make sure you leave dummies starting from 'Month_' (capital 'M'). To check yourself: `df_with_dummies[NUMERICAL+DUMMIES].info()` should give 299 features, while the new one `df_with_dummies[NUMERICAL+DUMMIES_SHORT].info()` should give only 206 features (32% less!).

2) **Define the correct features_list.** Use DUMMIES_SHORT when you generate train, validation, and test dataframes and true values.

3) **Run the workbook till the end.** Check the CAGR for pred5_clf_10 and write it down as an answer. It should be slightly higher than the original workbook CAGR for pred5_clf_10 (1.1308).
 
(Advanced): You should see that simulations on many model-based predictions (names `.._rf_best_..`, `.._clf_best_..`) deliver worse results, likely because the feature set is very different now and you need to re-run the hyperparameters tuning again for Decision Tree and Random Forest classifiers. You may even see that Logistic Regression and Neural Network start to train. You can find the new best models rf_best and clf_best, and apply the decision rules strategies to improve the results even more.

In [21]:
PREDICTIONS = [k for k in new_df.keys() if k.startswith('pred')]
PREDICTIONS_ON_MODELS = [p for p in PREDICTIONS if int(p.split('_')[0].replace('pred', ''))>=5]
new_df.drop(columns=PREDICTIONS_ON_MODELS)

# TODO HA4 Q2: Define DUMMIES_SHORT from DUMMIES -- all elements: without month_* and Ticker_*
DUMMIES_SHORT = [col for col in DUMMIES if not (col.startswith('month_') or col.startswith('Ticker_'))]

# TODO HA4 Q2: use correct dummies here : DUMMIES_SHOR
features_list = NUMERICAL + DUMMIES_SHORT

to_predict = 'is_positive_growth_5d_future'

X_train = train_df[features_list].copy()
X_valid = valid_df[features_list].copy()
X_train_valid = train_valid_df[features_list].copy()
X_test = test_df[features_list].copy()
# this to be used for predictions and join to the original dataframe new_df
X_all =  new_df[features_list].copy()

y_train = train_df[to_predict]
y_valid = valid_df[to_predict]
y_train_valid = train_valid_df[to_predict]
y_test = test_df[to_predict]
y_all =  new_df[to_predict]

print(f'length: X_train {X_train.shape},  X_validation {X_valid.shape}, X_test {X_test.shape}, X_train_valid = {X_train_valid.shape},  all combined: X_all {X_all.shape}')

length: X_train (123458, 206),  X_validation (29388, 206), X_test (29829, 206), X_train_valid = (152846, 206),  all combined: X_all (182675, 206)


In [22]:
# Clean from +-inf and NaNs:
X_train = clean_dataframe_from_inf_and_nan(X_train)
X_valid = clean_dataframe_from_inf_and_nan(X_valid)
X_train_valid = clean_dataframe_from_inf_and_nan(X_train_valid)
X_test = clean_dataframe_from_inf_and_nan(X_test)
X_all = clean_dataframe_from_inf_and_nan(X_all)

In [23]:
clf_10 = DecisionTreeClassifier(max_depth=10,
                                random_state=42)
clf_10.fit(X_train_valid, y_train_valid)
# predict on a full dataset
y_pred_all = clf_10.predict(X_all)
# defining a new prediction vector is easy now, as the dimensions will match
new_df['pred5_clf_10'] = y_pred_all

In [24]:
sim_df, sim_results = get_simulate_result(new_df)
sim_results

Calculating sumulation for prediction pred0_manual_cci:
    Count times of investment 799 out of 29829 TEST records
    Financial Result: 
 sim_gross_rev_pred0    309.04589
sim_fees_pred0        -159.80000
sim_net_rev_pred0      149.84589
dtype: float64
        Count Investments in 4 years (on TEST): 799
        Gross Revenue: $309
        Fees (0.2% for buy+sell): $159
        Net Revenue: $149
        Fees are 51 % from Gross Revenue
        Capital Required : $1000 (Vbegin)
        Final value (Vbegin + Net_revenue) : $1149 (Vfinal)
        Average CAGR on TEST (4 years) : 1.036, or 3.6% 
        Average daily stats: 
            Average net revenue per investment: $0.19 
            Average investments per day: 2 
            Q75 investments per day: 2 
Calculating sumulation for prediction pred1_manual_prev_g1:
    Count times of investment 15601 out of 29829 TEST records
    Financial Result: 
 sim_gross_rev_pred1    6913.217124
sim_fees_pred1        -3120.200000
sim_net_rev_pred

Unnamed: 0,prediction,sim_count_investments,sim_gross_rev,sim_fees,sim_net_rev,sim_fees_percentage,sim_average_net_revenue,sim_avg_investments_per_day,sim_capital,sim_CAGR,sim_growth_capital_4y
0,pred0_manual_cci,799,309.04589,-159.8,149.84589,0.517075,0.187542,1.866822,1000.0,1.035523,1.149846
1,pred1_manual_prev_g1,15601,6913.217124,-3120.2,3812.817124,0.451338,0.244396,16.526483,11000.0,1.077237,1.34662
2,pred2_manual_prev_g1_and_snp,10455,4213.424501,-2091.0,2139.624501,0.496271,0.204651,21.918239,13000.0,1.038826,1.164587
3,pred3_manual_gdp_and_fastd,29822,14893.285316,-5964.4,8961.885316,0.400476,0.300513,31.491024,16500.0,1.114555,1.543145
4,pred4_manual_gdp_and_wti30d,0,0.0,0.0,0.0,,,,,,
5,pred5_clf_10,25631,15693.743691,-5126.2,10600.543691,0.32664,0.413583,31.295482,16500.0,1.132071,1.642457
6,pred6_clf_best,19828,14477.431798,-3965.6,10511.831798,0.273916,0.530151,24.509271,16500.0,1.131143,1.637081
7,pred7_clf_best_rule_66,8504,6977.651882,-1700.8,5276.851882,0.24375,0.620514,18.608315,16000.0,1.073858,1.329803
8,pred8_clf_best_rule_78,5419,4659.937647,-1083.8,3576.137647,0.232578,0.659926,15.891496,14500.0,1.056658,1.24663
9,pred9_rf_best_rule51,14315,10037.583629,-2863.0,7189.783629,0.285228,0.502255,15.730769,12000.0,1.124533,1.599149


In [25]:
print(f"The CAGR for pred5_clf_10 is {round(sim_results.loc[sim_results['prediction'] == 'pred5_clf_10', 'sim_CAGR'].values[0], 4)}")

The CAGR for pred5_clf_10 is 1.1321


## Question 3 (2 points): Predicting Strong Future Growth


**The idea**: There are many "neutral" growth days (with very small positive or negative growth), which may be extremely hard to predict. Let's aim to predict strong positive growth and potentially remove some headache for the models.

1) **Define a new column for the strong growth.** Define a new field `is_strong_positive_growth_5d_future` in the original dataset `df_full`. It should equal to 1 only if the future growth is higher than 2% (`growth_future_5d` is higher than 1.02)

2) **Generate correct true value datasets.** Change the value of a variable `to_predict` to the new feature  `is_strong_positive_growth_5d_future`. Ensure that all vectors with true values (y_train, y_valid, y_test, y_all, y_train_valid) are based on the new feature to predict. 

3) **Run the workbook till the end.** Find the best CAGR for the model-based predictions.
You should see that the best performing simulation (for model-based predictions) should be for the prediction `pred6_clf_best`, although it should be lower than the one in the Workbook (1.1311).

Write down the new CAGR as a result. 

(Advanced): You should see that the trick with predicting strong growth didn't work, as it didn't improve the CAGR. What are the potential reasons for this? Are we missing some steps?

In [26]:
features_list = NUMERICAL + DUMMIES
to_predict = 'is_strong_positive_growth_5d_future'

train_df = new_df[new_df.split.isin(['train'])].copy()
valid_df = new_df[new_df.split.isin(['validation'])].copy()
train_valid_df = new_df[new_df.split.isin(['train','validation'])].copy()
test_df =  new_df[new_df.split.isin(['test'])].copy()

X_train = train_df[features_list].copy()
X_valid = valid_df[features_list].copy()
X_train_valid = train_valid_df[features_list].copy()
X_test = test_df[features_list].copy()
# this to be used for predictions and join to the original dataframe new_df
X_all =  new_df[features_list].copy()

y_train = train_df[to_predict]
y_valid = valid_df[to_predict]
y_train_valid = train_valid_df[to_predict]
y_test = test_df[to_predict]
y_all =  new_df[to_predict]

In [27]:
# Clean from +-inf and NaNs:
X_train = clean_dataframe_from_inf_and_nan(X_train)
X_valid = clean_dataframe_from_inf_and_nan(X_valid)
X_train_valid = clean_dataframe_from_inf_and_nan(X_train_valid)
X_test = clean_dataframe_from_inf_and_nan(X_test)
X_all = clean_dataframe_from_inf_and_nan(X_all)

In [28]:
clf_10 = DecisionTreeClassifier(max_depth=10,
                                random_state=42)
clf_10.fit(X_train_valid, y_train_valid)
# predict on a full dataset
y_pred_all = clf_10.predict(X_all)
# defining a new prediction vector is easy now, as the dimensions will match
new_df['pred5_clf_10'] = y_pred_all

In [29]:
best_depth = 15
clf_best = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
clf_best.fit(X_train_valid, y_train_valid)
# predict on a full dataset
y_pred_clf_best = clf_best.predict(X_all)
# defining a new prediction vector is easy now, as the dimensions will match
new_df['pred6_clf_best'] = y_pred_clf_best

In [30]:
# adding Decision Tree predictors (clf_best) to the dataset for 2 new rules: Threshold = 0.66 and 0.78
y_pred_all_prob = clf_best.predict_proba(X_all)[:,1]

# defining a new prediction vector is easy now, as the dimensions will match
# new_df['proba_pred7'] = y_pred_all_class1_array
new_df['pred7_clf_best_rule_66'] = (y_pred_all_prob >= 0.66).astype(int)

# new_df['proba_pred8'] = y_pred_all_class1_array
new_df['pred8_clf_best_rule_78'] = (y_pred_all_prob >= 0.78).astype(int)

In [31]:
rf_best = RandomForestClassifier(n_estimators = 200,
                                 max_depth = 17,
                                 random_state = 42,
                                 n_jobs = -1)
rf_best = rf_best.fit(X_train_valid, y_train_valid)

y_pred_all_prob = rf_best.predict_proba(X_all)[:,1]
for i, threshold in enumerate(np.arange(0.51, 0.61, 0.01), start=9):
    new_df[f"pred{i}_rf_best_rule{int(100*threshold)}"] = (y_pred_all_prob >= threshold).astype(int)

In [32]:
sim_df, sim_results = get_simulate_result(new_df)
sim_results

Calculating sumulation for prediction pred0_manual_cci:
    Count times of investment 799 out of 29829 TEST records
    Financial Result: 
 sim_gross_rev_pred0    309.04589
sim_fees_pred0        -159.80000
sim_net_rev_pred0      149.84589
dtype: float64
        Count Investments in 4 years (on TEST): 799
        Gross Revenue: $309
        Fees (0.2% for buy+sell): $159
        Net Revenue: $149
        Fees are 51 % from Gross Revenue
        Capital Required : $1000 (Vbegin)
        Final value (Vbegin + Net_revenue) : $1149 (Vfinal)
        Average CAGR on TEST (4 years) : 1.036, or 3.6% 
        Average daily stats: 
            Average net revenue per investment: $0.19 
            Average investments per day: 2 
            Q75 investments per day: 2 
Calculating sumulation for prediction pred1_manual_prev_g1:
    Count times of investment 15601 out of 29829 TEST records
    Financial Result: 
 sim_gross_rev_pred1    6913.217124
sim_fees_pred1        -3120.200000
sim_net_rev_pred

Unnamed: 0,prediction,sim_count_investments,sim_gross_rev,sim_fees,sim_net_rev,sim_fees_percentage,sim_average_net_revenue,sim_avg_investments_per_day,sim_capital,sim_CAGR,sim_growth_capital_4y
0,pred0_manual_cci,799,309.04589,-159.8,149.84589,0.517075,0.187542,1.866822,1000.0,1.035523,1.149846
1,pred1_manual_prev_g1,15601,6913.217124,-3120.2,3812.817124,0.451338,0.244396,16.526483,11000.0,1.077237,1.34662
2,pred2_manual_prev_g1_and_snp,10455,4213.424501,-2091.0,2139.624501,0.496271,0.204651,21.918239,13000.0,1.038826,1.164587
3,pred3_manual_gdp_and_fastd,29822,14893.285316,-5964.4,8961.885316,0.400476,0.300513,31.491024,16500.0,1.114555,1.543145
4,pred4_manual_gdp_and_wti30d,0,0.0,0.0,0.0,,,,,,
5,pred5_clf_10,3226,1059.242032,-645.2,415.442032,0.609115,0.128779,5.332231,3000.0,1.032955,1.138481
6,pred6_clf_best,6229,3523.912904,-1245.8,2281.912904,0.353527,0.366337,7.078409,4625.0,1.10546,1.493387
7,pred7_clf_best_rule_66,5102,2605.649634,-1020.4,1589.049634,0.391611,0.311456,6.306551,4000.0,1.087225,1.397262
8,pred8_clf_best_rule_78,4254,1272.805444,-850.8,425.605444,0.668445,0.100048,5.439898,3500.0,1.029105,1.121602
9,pred9_rf_best_rule51,99,249.999091,-19.8,230.199091,0.0792,2.325243,1.546875,1000.0,1.053159,1.230199


In [33]:
print(f"The new best CAGR (for model-based predictions) is {round(sim_results.loc[6:, 'sim_CAGR'].max(), 4)}")

The new best CAGR (for model-based predictions) is 1.1055


## Question 4 (1 point): [EXPLORATORY] Describe Your Ideal Strategy

We've seen many strategies to produce predictions and have made significant improvements, but we've only explored one trading strategy: 'Invest $100 in each positive prediction'. Think about your best strategy for trading on these predictions and describe it.

Consider how you would combine the predictions, better utilize your capital, achieve efficiency (fewer trades), and possibly make other optimizations (e.g., stop loss, timing of entry/selling, portfolio optimization, short trades for market-neutral strategies). Please write down the specific details of what you would use and how you would implement your strategy.

## Submitting the solutions

Form for submitting: https://courses.datatalks.club/sma-zoomcamp-2024/homework/hw04