In [None]:
# !pip install "python-dotenv>=0.5.1"
# !pip install scikit-learn==1.2.2
# !pip install matplotlib==3.7.3
# !pip install wordcloud==1.9.2
# !pip install tensorflow==2.10.1
# !pip install opt_einsum==3.3.0
# !pip install gast==0.5.4
# !pip install astunparse==1.6.3
# !pip install h5py==3.9.0
# !pip install future==0.18.3
# !pip install openpyxl==3.1.2
# !pip install torch==2.0.1
# !pip install torchvision==0.15.2
# !pip install torchaudio==2.0.2
# !pip install pandas==1.1.0
# !pip install sage-importance
# !pip install shap==0.42.1
# !pip install shap==0.42.1
# !pip install Imbalanced-learn==0.11.0
# !pip install xgboost==2.0.0
# !pip install numpy==1.23.5
# !pip install torchmetrics==1.2.0
# !pip install lifelines==0.27.8
# !pip install joblib==1.3.2
# !pip install boruta

In [1]:
import os

# Ask where the project lives and make it the working directory. The package
# import and the relative "./feature_selection_timeseries/..." paths used later
# presumably rely on this directory containing the feature_selection_timeseries
# folder -- confirm against the repository layout.
directory_path = input("Enter your file directory: ").strip()
# Drop quotes that were pasted together with the path and expand a leading "~"
# so home-relative paths are accepted.
directory_path = os.path.expanduser(directory_path.strip("\"'"))
if directory_path:
    os.chdir(directory_path)
# An empty answer keeps the current working directory instead of failing
# inside os.chdir("").

from feature_selection_timeseries.src.models.pipeline import run
from feature_selection_timeseries.src.models.utils import create_time_feature, tune_cv_split, convert_to_sample
from datetime import datetime
import numpy as np
import pandas as pd
import warnings
import csv

# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

### Create Sample Data (Filter for data with the date specified)

##### The data files for the 11 stocks can be downloaded from the link below (you must request access first):

##### https://drive.google.com/drive/folders/1yN-JTu9pvL8Tm2xTSiK5L8F_L1tIWxoQ?usp=sharing

##### After the downloads complete, add the files to the directory: `./feature_selection_timeseries/data/raw/sp500_subset/`

In [2]:
# Sampling configuration: the start year and the raw-data sub-folder to use.
year = "2006"
date_threshold = f"{year}-01-01"  # keep data with date >= this value
sub_folder = "sp500_subset"

# Feature (x) file: folders to read from / write to, then the two file names.
x_in_path = "./feature_selection_timeseries/data/raw/" + sub_folder + "/"
x_out_path = "./feature_selection_timeseries/data/raw/" + sub_folder + "/"
x_filename_in = "stock_x.csv"
x_filename_out = "stock_x_sample_regression_" + year + "_filtered_11_stocks.csv"

# Label (y) file: shares the feature file's folders, with its own file names.
y_in_path, y_out_path = x_in_path, x_out_path
y_filename_in = "stock_y_ret84.csv"
y_filename_out = "stock_y_ret84_sample_regression_" + year + "_filtered_11_stocks.csv"

### Filter Data and Generate New Files

#### Run the code below if you want to filter the original dataset for a selected subset of stocks

In [3]:
# stocks=[
#     "AAPL", "MSFT", "GOOGL", "AMZN", "META", "NVDA", "TSLA"
# ]

# convert_to_sample(
#     path_in=x_in_path,
#     path_out=x_out_path,
#     filename_in = x_filename_in,
#     filename_out = x_filename_out,
#     date_threshold=date_threshold,
#     filter_type="combined stocks",
#     stocks=stocks
# )

# convert_to_sample(
#     path_in=y_in_path,
#     path_out=y_out_path,
#     filename_in = y_filename_in,
#     filename_out = y_filename_out,
#     date_threshold=date_threshold,
#     filter_type="combined stocks",
#     stocks=stocks
# )

### Import Sample Data from the Files with the Generated Subset of Features

In [4]:
# Specify the path to your CSV files. The generated sample files are written by
# convert_to_sample to the *_out_path folders, so read them back from there
# rather than from the raw input folders.
x_path = f'{x_out_path}{x_filename_out}'
y_path = f'{y_out_path}{y_filename_out}'
# Import data
sp500_df = pd.read_csv(x_path)
y = pd.read_csv(y_path)
# Adjust labels: expose the 'ret_fwd_84' column under the name 'target'
y['target'] = y['ret_fwd_84']
# Create additional time features
sp500_df = create_time_feature(sp500_df)
# Combine features and target: drop the first column of the feature frame and
# append the target column. pd.concat(axis=1) aligns rows on index labels.
# NOTE(review): assumes both frames have the same rows in the same order -- confirm.
sp500_df = pd.concat([sp500_df.iloc[:, 1:], y['target']], axis=1)

In [5]:
# sp500_df = sp500_df[sp500_df['ticker'] == 'ADBE']
# y = y[y['ticker'] == 'ADBE']

In [6]:
# sp500_df = sp500_df.iloc[:1000, -100:]
# y = y.iloc[:1000, -100:]

In [None]:
# Preview the first rows of the label frame, then report its shape.
display(y.head())
print(f"Dataset Shape: {y.shape}")

In [None]:
# Preview the first rows of the combined feature/target frame, then report its shape.
display(sp500_df.head())
print(f"Dataset Shape: {sp500_df.shape}")

### Initialize Pipeline

In [None]:
# Possible train validation splits: hand every row of sp500_df to tune_cv_split
# and keep only the last candidate it returns.
all_rows = sp500_df.iloc[-len(sp500_df):, :]
candidate_splits = tune_cv_split(
    all_rows,
    val_test_prop_constraint=0.2,  # Size of validation set relative to the train set
    num_split_constraint=5,  # Number of splits
)
train_test_list = [candidate_splits[-1]]

# Row budget for the run: first entry times third entry, plus twice the second
# entry of the chosen split.
# NOTE(review): presumably the entries are (train size, validation size,
# number of splits) -- confirm against tune_cv_split.
chosen_split = train_test_list[0]
keep_data_index = chosen_split[0] * chosen_split[2] + 2 * chosen_split[1]
print(f"\nUsing Split: {train_test_list}")

In [10]:
# Keep only the last `keep_data_index` rows of both frames, i.e. discard extra
# data instances from the beginning of the time series rather than the end.
features_tail = sp500_df.iloc[-keep_data_index:, :].reset_index(drop=True)
labels_tail = y.iloc[-keep_data_index:, :].reset_index(drop=True)

# Shared path prefixes for the scaler/encoder files and the test output files.
processed_dir = "./feature_selection_timeseries/data/processed/"
experiment_prefix = "./feature_selection_timeseries/data/experiment/Consolidated_Stocks_FS_timeseries_sp500_outputs_"

# Available methods: ["xgboost", "cae", "permutation", "shap", "boruta", "sage", "lasso", "cart", "svm", "rf", "stg", "dynamic"]
selected_methods = ["xgboost", "permutation", "shap", "lasso", "cart"]
# These fields are not transformed
untouched_cols = ["dayofmonth", "dayofweek", "quarter", "month", "year", "dayofyear", "weekofyear"]

r1 = run(
    cross_validation_type="moving window",  # or "expanding window"
    save_output_file=True,  # Whether to save test outputs
    raw_df=features_tail,
    y=labels_tail,
    train_test_list=train_test_list,  # A list of possible list of train and validation size, and number of splits
    methods=selected_methods,
    rebalance_type=["None"],  # ["borderlinesmote", "smoten", "random_over_sampler", "smote", "None"]
    label_cols=[],  # Columns to label encode
    do_not_encode_cols=untouched_cols,
    seed=42,
    target_colname="target",  # The name of the field that holds the true values
    dataset_name="sp500",
    pred_type="regression",
    append_to_full_df=False,
    n_features=50,  # The number of top features to filter for
    feature_direction="top",  # Feature order based on their scores in descending order
    train_outputs_file_name=None,
    current_date=datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
    scaler_filename=processed_dir + "scaler_saved.save",
    encoder_filename=processed_dir + "encoder_saved.save",
    # NOTE(review): "lalbel" looks like a misspelling of "label"; the file name
    # is kept as-is so existing saved files still match.
    label_encoder_filename=processed_dir + "lalbel_encoder_saved.save",
    test_output_file_name=experiment_prefix + "test_results_",
    test_pred_file_name=experiment_prefix + "test_preds_",
    print_outputs_train=False,
    print_outputs_test=True,
)

#### Train model

In [None]:
# Call the train step on the object returned by run(...).
r1.train()

#### Test on holdout test data and generate testing outputs

In [None]:
# Call the test step on the same object returned by run(...).
r1.test()