In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the daily price file; the first CSV column becomes the index.
# The monthly CSV (Portfolio_data/Market_portfolio_monthly.csv) is not loaded:
# monthly_df is derived from daily_df by resampling instead.
daily_df = pd.read_csv(
    "Portfolio_data/Market_portfolio_daily.csv",
    index_col=0,
)

In [3]:
# Parse the index labels into datetimes; the resample() calls further down
# need a datetime-like index to bucket rows by month.
daily_df.index = pd.to_datetime(daily_df.index)

In [4]:
# Month-end snapshot: keep the last available daily row of every calendar month.
# 'ME' is the month-end alias from pandas 2.2 on (where 'M' is deprecated);
# older releases reject 'ME' with a ValueError, so fall back to 'M' there.
try:
    monthly_df = daily_df.resample('ME').last()
except ValueError:
    monthly_df = daily_df.resample('M').last()

# Pipeline - Daily Data (To generate features for monthly dataset)

In [5]:
def hist_vol(X):
    """Build month-level volatility features from daily closing prices.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``'Close'`` column and a datetime-like index
        (required by ``resample``).

    Returns
    -------
    pandas.DataFrame
        Indexed by month end, with three columns:

        * ``prev_month_volatility`` - standard deviation of ``Close`` within
          the labelled month (computed on price levels, not on returns).
        * ``prev2_month_volatilty`` - the same value for the preceding month
          (NaN for the first month).
        * ``vol_ratio`` - the first column divided by the second.
    """
    # 'ME' is the month-end alias from pandas 2.2 on (where 'M' is deprecated);
    # older releases reject 'ME' with a ValueError, so fall back to 'M' there.
    try:
        monthly_close = X['Close'].resample('ME')
    except ValueError:
        monthly_close = X['Close'].resample('M')

    # Compute the monthly std once and reuse it for the lagged column.
    monthly_std = monthly_close.std()
    lagged_std = monthly_std.shift(1)

    # NOTE: the 'volatilty' spelling is kept on purpose - the column name ends
    # up in the exported CSV, so renaming it would change the output schema.
    return pd.DataFrame({
        'prev_month_volatility': monthly_std,
        'prev2_month_volatilty': lagged_std,
        'vol_ratio': monthly_std / lagged_std,
    })

def log_range(X):
    """Compute the monthly log range of the daily closing price.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``'Close'`` column and a datetime-like index
        (required by ``resample``).

    Returns
    -------
    pandas.DataFrame
        Indexed by month end, with a single column ``log_high_low`` equal to
        ``log(max Close in month) - log(min Close in month)``.
    """
    # 'ME' is the month-end alias from pandas 2.2 on (where 'M' is deprecated);
    # older releases reject 'ME' with a ValueError, so fall back to 'M' there.
    try:
        monthly_close = X['Close'].resample('ME')
    except ValueError:
        monthly_close = X['Close'].resample('M')

    log_high_low = np.log(monthly_close.max()) - np.log(monthly_close.min())
    return pd.DataFrame({'log_high_low': log_high_low})

In [6]:
# Run each daily-to-monthly feature builder on the daily frame and place the
# resulting month-indexed frames side by side (log range first, then volatility).
daily_features = pd.concat(
    [build_features(daily_df) for build_features in (log_range, hist_vol)],
    axis=1,
)

In [7]:
# Display the month-indexed feature table built from the daily data.
daily_features

Unnamed: 0_level_0,log_high_low,prev_month_volatility,prev2_month_volatilty,vol_ratio
Date-Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999-11-30,0.094195,0.791459,,
1999-12-31,0.051067,0.301951,0.791459,0.381511
2000-01-31,0.072246,0.524966,0.301951,1.738581
2000-02-29,0.084725,0.619499,0.524966,1.180074
2000-03-31,0.092894,0.597744,0.619499,0.964884
...,...,...,...,...
2023-10-31,0.039563,2.200746,4.317303,0.509750
2023-11-30,0.083692,5.457335,2.200746,2.479766
2023-12-31,0.069859,5.807592,5.457335,1.064181
2024-01-31,0.042038,2.597588,5.807592,0.447275


# Pipeline - Monthly Data (dataframe for prediction)

In [8]:
class generate_class(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds the binary ``Signal`` target column.

    ``Signal`` is 1 when the next row's ``Close`` is strictly greater than the
    current row's ``Close``, and 0 otherwise.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``Signal`` column added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``'Close'`` column, ordered in time.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` plus an integer ``Signal`` column.
        """
        # Work on a copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        next_close = X['Close'].shift(-1)
        # The last row has no successor: comparing with NaN is False, so its
        # Signal is 0 even though the true outcome is unknown.
        X['Signal'] = (X['Close'] < next_close).astype(int)
        return X

class log_change(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds the row-over-row log change of ``Close``."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``log_close_diff`` column added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``'Close'`` column, ordered in time.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` plus ``log_close_diff`` =
            ``log(Close) - log(previous Close)`` (NaN on the first row).
        """
        # Work on a copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        previous_close = X['Close'].shift(1)
        X['log_close_diff'] = np.log(X['Close']) - np.log(previous_close)
        return X

class pct_change(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds the simple row-over-row return of ``Close``."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``1month_return`` column added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``'Close'`` column, ordered in time.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` plus ``1month_return`` =
            ``Close / previous Close - 1`` (NaN on the first row).
        """
        # Work on a copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        previous_close = X['Close'].shift(1)
        X['1month_return'] = X['Close'] / previous_close - 1
        return X

In [9]:
# Display the month-end price table that the monthly pipeline takes as input.
monthly_df

Unnamed: 0_level_0,Close
Date-Time,Unnamed: 1_level_1
1999-11-30,22.558148
1999-12-31,22.355926
2000-01-31,22.072963
2000-02-29,20.177778
2000-03-31,21.628519
...,...
2023-10-31,209.060741
2023-11-30,230.086296
2023-12-31,247.812963
2024-01-31,250.343704


In [None]:
# Remember the raw input columns so that only the engineered columns are kept below.
monthly_cols = monthly_df.columns

baseline_pipeline = Pipeline([
    ('generate_class', generate_class()),
    ('log_change', log_change()),
    ('pct_change', pct_change()),
])

# Feed the pipeline a copy: a step that adds columns to the frame it receives
# would otherwise grow monthly_df itself, and a re-run of this cell would then
# capture the engineered columns in monthly_cols and drop every column.
baseline_df = baseline_pipeline.fit_transform(monthly_df.copy())
baseline_df = baseline_df.drop(columns=monthly_cols)



In [11]:
# Join the label/return columns with the daily-derived features on their shared
# month-end index, then keep the 2010-2022 window (label slicing includes both ends).
return_df = (
    pd.concat([baseline_df, daily_features], axis=1)
    .loc['2010-01-01':'2022-12-31', :]
)

In [13]:
# Persist the assembled baseline dataset (index included) for the modelling step.
return_df.to_csv(path_or_buf='Intermediate_data/Baseline_jadon_v0.csv')