In [152]:
# Final Revision of Capstone Modeling Process

In [153]:
# Modeling Template File Notes:
# File Sequence #7

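# the purpose of this notebook is to fit the selected model on a fold's
# labeled seen data and predict the chosen label for each day of its unseen data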

# input:
# 1. For a given Fold:
## A. Labeled Seen Dataset
## B. Labeled Unseen Dataset
# 2. User Model Selection
# 3. User Feature Set Selection (if Fold 1)
# 4. Desired predicted field (y-val)

# output:
# 1. Testing Dataset with Predicted Labels and Actual Labels
# (Per ticker, Per day)

In [154]:
# mount google drive
from google.colab import drive
# unmount first (per the original note, this resets all imported csvs)
drive.flush_and_unmount()
# mount/remount at the path the rest of the notebook reads from
drive.mount('/content/drive')

Mounted at /content/drive


In [155]:
# library imports

# data handling and calcs
import pandas as pd
# calculations and numbers
import numpy as np
# file handling:
import os
# hyperparameter search for validation fold
# (created a custom search grid function instead)
#from sklearn.model_selection import GridSearchCV
# creation of combination dict for validation fold
from itertools import product
# scoring validation fold:
# (validation fold benchmarked by overall accuracy)
from sklearn.metrics import accuracy_score
# clone is necessitated by the validation loop:
# (need to reset model back to base after fitting with each param combo)
from sklearn.base import clone
# using csv to extract headers only from csv file
import csv
# for file name incrementing
from datetime import datetime
# suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [156]:
# model selection library imports

# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
# MLP Classifier
from sklearn.neural_network import MLPClassifier



In [157]:
# fold selection cell
# (all user-tunable settings for a notebook run are set here)

# define shared drive location
shared_drive = r"/content/drive/MyDrive/Capstone_Docs_Shared"

# active fold: leave exactly one of the three options uncommented
active_fold = "1-Train"
#active_fold = "2-Validate"
#active_fold = "3-Test"

# use engineered features for train fold?
# (only read by the "1-Train" branches of load_data / assemble_output_path)
engineered_features = True
#engineered_features = False

# (Optional: ticker list subset)
# (running them all takes a long time...
# ...if you know which ones you would like to filter...
# ...you can choose those now)
#selected_ticker_list = []

# define desired model for this NB run:
# (the value must be a key of model_config_dict)
# NOTE(review): "DecisionTree" has no entry in model_config_dict in this
# notebook, so selecting it would fail on lookup -- add a config first.
#selected_model = "RandomForestClass"
selected_model = "KNN"
#selected_model = "NaiveBayesGauss"
#selected_model = "DecisionTree"
#selected_model = "MLP"

# define desired prediction value (name of the label column used as y):
predicted_feature = 'D_5_Label'
# add more here if desired



In [158]:
# store model configurations for grid search in validation fold
# (if no selections, use defaults...)
# (note: this is a subselection of parameter options chosen after...
# ... some experimentation to make the project demo NB run faster...
# ... older, wider, parameter selections have been discarded)
#
# Layout: {user selection name: {"model": unfitted estimator,
#                                "params": {parameter name: [candidate values]}}}
# The "params" grids are expanded into every combination by custom_param_search.

model_config_dict = {
    # user selection definition
    "RandomForestClass" : {
        # actual model from map
        "model" : RandomForestClassifier(random_state=42),
        # add parameters to tune
        "params" : {
            "n_estimators" : [5, 50,],
            "max_depth" : [10, 15,],
            "class_weight" : ["balanced", "balanced_subsample"],
            #"max_features" : ["sqrt", "log2", None],
            #"min_samples_split" : [2, 5, 10],
            #"min_samples_leaf" : [1, 2, 4],
        }
    },
    # next user selection definition
    "KNN" : {
        "model" : KNeighborsClassifier(),
        "params" : {
            "n_neighbors" : [18, 21, 24],
            "weights" : ["uniform", "distance"],
            #"weights" : ["uniform"],
            #"algorithm" : ["auto", "ball_tree"],
            "leaf_size" : [10, 30],
            #"p" : [1, 2],
        }
    },
    # third user selection definition
    "NaiveBayesGauss" : {
        "model" : GaussianNB(),
        "params" : {
            "var_smoothing" : [1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
        }
    },
    # fourth user selection definition
    "MLP" : {
        "model" : MLPClassifier(random_state=42),
        "params" : {
            "hidden_layer_sizes" : [(50, 50), (100, 100)],
            "activation" : ["relu"],
            "solver" : ["adam"],
            "alpha" : [0.0001, 0.001, 0.01],
            # NOTE(review): the scikit-learn docs describe learning_rate as only
            # used with solver='sgd'; with "adam" these two options presumably
            # score identically and just double the grid -- confirm and trim.
            "learning_rate" : ["constant", "adaptive"],
            "max_iter" : [100, 200],
            "batch_size" : [32, 16, 8],
            "early_stopping" : [True],
        }
    }
}




In [159]:
# Load data using fold selection

def load_data(fold_selection, use_engineered_features, shared_loc):
  """Load the seen (train) and unseen (test) CSV splits for one fold.

  Args:
    fold_selection: "1-Train", "2-Validate" or "3-Test".
    use_engineered_features: only consulted for "1-Train"; when truthy the
      "_fe" (engineered features) files are loaded instead of the full ones.
    shared_loc: directory that holds the fold CSV files.

  Returns:
    (train_df, test_df) as read by pandas.read_csv.

  Raises:
    ValueError: if fold_selection is not one of the three known folds.
      (Previously this case printed a message and then failed with an
      UnboundLocalError on the undefined filename.)
  """
  # ID filenames based on fold selection and flags
  if fold_selection == "1-Train":
    if use_engineered_features:
      train_data_name = "train_fold_1_fe.csv"
      test_data_name = "test_fold_1_fe.csv"
      print("Selected: Fold 1, Engineered Features")
    else:
      train_data_name = "train_fold_1.csv"
      test_data_name = "test_fold_1.csv"
      print("Selected: Fold 1, All Features")
  elif fold_selection == "2-Validate":
    train_data_name = "train_fold_2.csv"
    test_data_name = "test_fold_2.csv"
    print("Selected: Fold 2")
  elif fold_selection == "3-Test":
    train_data_name = "train_fold_3.csv"
    test_data_name = "test_fold_3.csv"
    print("Selected: Fold 3")
  else:
    # fail fast with a readable message instead of a NameError further down
    raise ValueError(
        f"Invalid fold selection: {fold_selection!r} "
        "(expected '1-Train', '2-Validate' or '3-Test')"
    )

  # Based on selected filenames, assemble train/test split filepaths
  train_data_path = os.path.join(shared_loc, train_data_name)
  test_data_path = os.path.join(shared_loc, test_data_name)
  print("Data paths assembled")
  print(train_data_path)
  print(test_data_path)

  # make actual data load based on assembled paths
  train_df = pd.read_csv(train_data_path)
  test_df = pd.read_csv(test_data_path)
  print("Data load complete")
  print("")
  print('FOLD DATA STATS:')
  # total count of missing cells across the whole frame
  print("Training Split Null Values:")
  print(train_df.isna().sum().sum())
  print("Testing Split Null Values:")
  print(test_df.isna().sum().sum())
  print("")
  print("Column Check:")
  print(train_df.columns)
  print(test_df.columns)

  return train_df, test_df


In [160]:
from re import sub
# Function to check for duplicate days

def check_duplicate_days(input_df):
  """Return a copy of input_df with one row per (Date, Ticker) pair.

  When a pair occurs more than once, only its first occurrence is kept.
  The caller's frame is never modified.
  """
  key_cols = ['Date', 'Ticker']
  deduped = input_df.copy()
  # only rebuild the frame when there is actually something to drop
  if deduped.duplicated(subset=key_cols, keep='first').any():
    deduped = deduped.drop_duplicates(subset=key_cols, keep='first')
  return deduped




In [161]:
# Function to check for Unnamed Col
# (This sometimes happens as dfs are saved to csv)
# (Including this in case instructional team wants to mess around with inputs)

def check_for_unnamed_cols(input_df):
  """Return a copy of input_df without any column whose name contains 'Unnamed'.

  Column labels are compared through str(), so non-string labels (e.g.
  integer column names) are kept instead of raising a TypeError on the
  substring test.
  """
  # take a copy
  df = input_df.copy()
  # ID unnamed cols (str() makes the check safe for non-string labels)
  unnamed_cols = [col for col in df.columns if 'Unnamed' in str(col)]
  # remove unnamed cols (a no-op when the list is empty)
  clean_df = df.drop(columns=unnamed_cols)
  print("Unnamed cols removed")
  return clean_df


In [162]:
# function to assemble and track a filtered list of tickers while modeling

def assemble_ticker_list(input_df, num_gens):
  """Split the unique tickers of input_df into num_gens groups.

  Returns the list of arrays produced by np.array_split, with tickers in
  order of first appearance in the 'Ticker' column.
  """
  unique_tickers = input_df['Ticker'].unique()
  return np.array_split(unique_tickers, num_gens)

In [163]:
# function to filter input df by ticker

def filter_by_ticker(input_df, ticker_lst):
  """Return the rows of input_df whose 'Ticker' is in ticker_lst, reindexed from 0.

  Also prints the number of distinct tickers kept and the average number of
  rows per ticker (integer division). When nothing matches, 0 is printed for
  both instead of raising ZeroDivisionError.
  """
  # boolean indexing returns a new frame, so the caller's frame is untouched
  df = input_df[input_df['Ticker'].isin(ticker_lst)]
  # remember to reindex:
  df = df.reset_index(drop=True)
  n_tickers = df['Ticker'].nunique()
  print("Number of Tickers In Selected Data Split:")
  print(n_tickers)
  print("Number of Days In Input Df:")
  # guard the division: an empty selection has no tickers
  print(len(df) // n_tickers if n_tickers else 0)
  return df



In [164]:
# function to remove unused prediction columns from dfs
def remove_unused_labels(input_df, pred_col):
  """Return a copy of input_df without the target-like columns that are not pred_col.

  A column is dropped when its name contains 'Label' or 'Close' and it is
  not pred_col itself; every other column is kept in its original order.
  """
  def _is_unused_target(col):
    # both the label columns and the close-price columns carry target information
    return ('Label' in col or 'Close' in col) and col != pred_col

  unused_targets = [col for col in input_df.columns if _is_unused_target(col)]
  return input_df.copy().drop(columns=unused_targets)

In [165]:
# function to ID 'Offset' Columns and remove their non-offset counterparts...
# (necessary to prevent data leakage)

def remove_leakage_vars(input_df):
  """Return a copy of input_df without the non-offset twin of every '_Offset' column.

  For each column whose name contains '_Offset', the column with that
  substring removed (e.g. 'Open' for 'Open_Offset') is dropped when it
  exists. Offset columns without such a twin are tolerated; previously
  they made DataFrame.drop raise a KeyError.

  Prints the remaining columns and returns the filtered copy.
  """
  df = input_df.copy()
  # find Offset Columns
  offset_cols = [col for col in df.columns if '_Offset' in col]
  # names their non-offset counterparts would have
  counterpart_names = {col.replace('_Offset', '') for col in offset_cols}
  # only drop the counterparts that are actually present
  non_offset_cols = [col for col in df.columns if col in counterpart_names]
  df = df.drop(columns=non_offset_cols)
  print(df.columns)
  return df

In [166]:
# function to filter val dataset and test dataset...
# ... based on columns present in F.E. Train dataset
def filter_fe_cols(input_df, fold_select, shared_loc=None):
  """Restrict a fold-2/fold-3 frame to the columns of the engineered fold-1 train file.

  Args:
    input_df: frame to filter (never modified; a copy is returned).
    fold_select: "1-Train" returns the copy unchanged; "2-Validate" and
      "3-Test" keep only the columns named in the header row of
      "train_fold_1_fe.csv".
    shared_loc: directory holding "train_fold_1_fe.csv". Defaults to the
      notebook-level `shared_drive` setting when omitted.

  Returns:
    The filtered copy, with columns in header order.

  Raises:
    ValueError: for an unknown fold name. The previous condition
      (`fold_select == "2-Validate" or "3_Train"`) was always truthy, so
      any string was silently treated as a validation/test fold.
    KeyError: (from pandas) if a header column is missing from input_df.
  """
  df = input_df.copy()

  if fold_select == "1-Train":
    # fold 1 defines the feature set, nothing to filter
    return df

  if fold_select not in ("2-Validate", "3-Test"):
    raise ValueError(
        f"Invalid fold selection: {fold_select!r} "
        "(expected '1-Train', '2-Validate' or '3-Test')"
    )

  if shared_loc is None:
    # fall back to the notebook-level configuration
    shared_loc = shared_drive

  # determine fe file name (train selection, feature extracted)
  filtered_col_filename = "train_fold_1_fe.csv"
  # make filepath based on name
  filtered_col_filepath = os.path.join(shared_loc, filtered_col_filename)
  # read only the first row (header) of columns source file
  with open(filtered_col_filepath, newline='') as csvfile:
    first_row = next(csv.reader(csvfile))
  # a blank header cell is what an exported, unnamed index column looks like
  # to csv.reader; it is not a real feature, so skip it
  keep_cols = [col for col in first_row if col != ""]
  # apply filter to df:
  return df[keep_cols]

In [167]:
# function for validation stage custom parameter search:
def custom_param_search(base_model, params_options, train_set, test_set, pred_feat):
  """Grid-search hyperparameters with a walk-forward (expanding window) evaluation.

  Only the first ticker found in train_set is used. For every combination of
  the values in params_options, a fresh clone of base_model is refit before
  each test day on the training rows plus all earlier test days, predicts
  that single day, and the combination is scored by accuracy over all days.

  Args:
    base_model: estimator to clone for every combination (never fit itself).
    params_options: dict mapping parameter name -> list of candidate values.
    train_set: seen data; needs 'Date', 'Ticker', pred_feat and feature columns.
    test_set: unseen data with the same columns.
    pred_feat: name of the label column to predict.

  Returns:
    (best_params, param_scores): the highest-scoring combination (the first
    one wins ties, since only a strictly better score replaces it) and the
    list of (params, score) tuples for every combination tried.
  """

  # filter down to even less tickers for sake of time:
  # ##### Do that here #####
  # (even 1 ticker technically provides like 200 tests per run)
  u_tickers = train_set['Ticker'].unique()

  # take first ticker from list:
  # (double brackets keep it an array so it can be passed to isin below;
  #  this indexing raises if train_set contains no tickers)
  one_ticker = u_tickers[[0]]
  # filter train and test set by selected ticker
  train_set = train_set[train_set['Ticker'].isin(one_ticker)]
  test_set = test_set[test_set['Ticker'].isin(one_ticker)]

  # log best score (starts below any possible accuracy)
  best_score = -1
  # log  best params
  best_params = None
  # log results of each param combo:
  param_scores = []

  # A. assemble param combos based on input grid:
  # get key , value pairings first:
  keys, vals = zip(*params_options.items())
  # assemble one dict per combination from the k/v pairs
  param_combos = [dict(zip(keys, v)) for v in product(*vals)]

  print(f"Number of Parameter Combinations for {base_model}: ", len(param_combos))

  # Replicate signal prediction function here, except with each param combo:

  # initialize list to store predicted signals for each ticker
  # NOTE(review): not used anywhere below in this function
  ticker_predicted_signals = []
  # initialize list to store predicted prices for all tickers
  # NOTE(review): not used anywhere below in this function
  all_predicted_signals = []
  # take a copy of the input dfs
  train_data = train_set.copy()
  test_data = test_set.copy()

  # define X, y, and tracking features

  # tracking features (exclude from model, re-merge later)
  analysis_feats = ['Date', 'Ticker']
  # remove analysis features...
  filtered_features = [x for x in train_data.columns if x not in analysis_feats]
  # remove predicted features...
  # ... and you are left with X features
  model_X_feats = [x for x in filtered_features if x != pred_feat]

  # B. initialize loop to iterate over tickers
  # (train_data was reduced to a single ticker above, so this runs once)
  for ticker_symbol in train_data['Ticker'].unique():
    print("Ticker Selected for Validation: ", ticker_symbol)
    # Filter input dfs to isolate ticker[i] time series
    ticker_train_df = train_data[train_data['Ticker'] == ticker_symbol]
    ticker_test_df = test_data[test_data['Ticker'] == ticker_symbol]
    # remember to reindex
    ticker_train_df = ticker_train_df.reset_index(drop=True)
    ticker_test_df = ticker_test_df.reset_index(drop=True)
    # filter X by selected features
    train_data_X = ticker_train_df[model_X_feats]
    test_data_X = ticker_test_df[model_X_feats]
    # filter y by predicted feature
    train_data_y = ticker_train_df[pred_feat]
    test_data_y = ticker_test_df[pred_feat]

    # C. loop over parameter combinations:
    for params in param_combos:
      print("--------------- New Parameter Selection Initiated ---------------")
      print("Parameter Combination Selected: ")
      print(params)
      # track pred vs actual at NB lvl
      # unfortunately need to do evaluation in the modeling notebook for validation stage to save time, memory, files, etc.
      preds = []
      acts = []

      # need to use clone here...
      # Resets model back to clean slate after previous param combo load
      model = clone(base_model).set_params(**params)

      # replicate rolling training set
      # (restart from the original training rows for every combination)
      extended_train_data_X = train_data_X.copy()
      extended_train_data_y = train_data_y.copy()

      # replicate daily looping
      for eval_day in range(len(test_data_X)):
        # replicate x and y slicing
        # (double brackets keep X as a one-row DataFrame)
        curr_test_X = test_data_X.iloc[[eval_day]]
        curr_test_y = test_data_y.iloc[eval_day]
        # fit on current training slice:
        model.fit(extended_train_data_X, extended_train_data_y)
        # predict on current test slice:
        y_pred = model.predict(curr_test_X)[0]
        # (try removing from list here^^^)
        # append day's prediction to list
        preds.append(y_pred)
        # append day's actual to list
        acts.append(curr_test_y)
        # replicate extending training set to overlap new day
        # (done after predicting, so the day's label is only seen by later fits)
        extended_train_data_X = pd.concat([extended_train_data_X, curr_test_X], ignore_index=True)
        extended_train_data_y = pd.concat([extended_train_data_y, pd.Series([curr_test_y])], ignore_index=True)

      # Scoring -- use as basis of results tracking
      score = accuracy_score(acts, preds)
      param_scores.append((params, score))
      print("Score Metric for Selected Combo:", score)

      # update best scores and parameters, given logged score
      # (strict '>' means the earliest combination wins a tie)
      if score > best_score:
        best_score = score
        best_params = params
        print("New Best Score: ", best_score)
        print("New Best Parameters: ", best_params)

  print("Model HP Tuning Complete! YeeHaw")
  return best_params, param_scores


In [168]:
# function to determine model parameter selection:
def parameter_selection(model_name, model_config_dict, fold_selection, train_set, test_set, pred_feat, shared_loc=None):
  """Decide which hyperparameters the modeling run should use for the active fold.

  Args:
    model_name: key into model_config_dict (also part of the params filename).
    model_config_dict: {name: {"model": estimator, "params": grid}}.
    fold_selection: "1-Train", "2-Validate" or "3-Test".
    train_set, test_set, pred_feat: forwarded to custom_param_search in the
      validation fold; unused otherwise.
    shared_loc: directory holding "best_params_<model_name>_val.csv" for the
      test fold. Defaults to the notebook-level `shared_drive` setting.

  Returns:
    None for "1-Train" (model defaults), the best combination found by
    custom_param_search for "2-Validate", or the dict read back from the
    saved validation CSV for "3-Test".

  Raises:
    ValueError: for an unknown fold name (previously this printed a message
      and then failed with an UnboundLocalError).
  """
  # local import: only this function needs it
  import ast

  def change_type(val):
    # The CSV column mixes types, so pandas hands everything back as str.
    # literal_eval restores ints/floats/bools/tuples but, unlike eval(),
    # can never execute code or resolve a notebook variable by accident.
    if not isinstance(val, str):
      return val
    try:
      return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
      # plain words such as "uniform" or "relu" are real string values
      return val

  # determine what parameters to select based on active fold
  if fold_selection == "1-Train":
    print("Selected: Fold 1")
    return None

  if fold_selection == "2-Validate":
    print("Selected: Fold 2")
    # determine base model type
    val_model = model_config_dict[model_name]['model']
    # determine parameter grid for given model type
    params_options = model_config_dict[model_name]['params']
    # (grid search doesn't really work with looping modeling/expanding seen
    #  data, so a custom search is used instead)
    best_params, _param_scores = custom_param_search(val_model, params_options, train_set, test_set, pred_feat)
    return best_params

  if fold_selection == "3-Test":
    # Load In --- best parameters as determined by Val fold
    print("Selected: Fold 3")
    if shared_loc is None:
      # fall back to the notebook-level configuration
      shared_loc = shared_drive
    file_name = f'best_params_{model_name}_val.csv'
    file_path = os.path.join(shared_loc, file_name)
    param_df = pd.read_csv(file_path)
    param_dict = {
        row['params']: change_type(row['selection_values']) for _, row in param_df.iterrows()
    }
    print(param_dict)
    return param_dict

  raise ValueError(
      f"Invalid fold selection: {fold_selection!r} "
      "(expected '1-Train', '2-Validate' or '3-Test')"
  )


In [169]:
# function to run selected modeling scenario:
# (this model is focused on direct signal prediction, ...
# rather than point price prediction and translation to signals)

def daily_signal_prediction(model_name, model_detail_dict, parameters, train_split, test_split, predict_feat, fold_selection, use_engineered_features):
  """Predict predict_feat for every test day of every ticker with an expanding training set.

  For each ticker in train_split the model is refit before every test day
  on the ticker's training rows plus all of its earlier test days, then
  predicts that single day.

  Args:
    model_name: key into model_detail_dict.
    model_detail_dict: {name: {"model": estimator, ...}}.
    parameters: dict of estimator parameters; applied with set_params for
      "2-Validate" and "3-Test", ignored for "1-Train".
    train_split, test_split: frames with 'Date', 'Ticker', predict_feat and
      the feature columns (copied, never modified).
    predict_feat: name of the label column.
    fold_selection: "1-Train", "2-Validate" or "3-Test".
    use_engineered_features: accepted for interface compatibility; both
      settings are modeled identically here (the feature set is decided by
      the data that was loaded).

  Returns:
    DataFrame with columns "Date", "Ticker", f"Predicted_Signal_{predict_feat}"
    and f"Actual_Signal{predict_feat}" (the missing underscore in the last
    name is kept on purpose so existing consumers of the export keep working).

  Raises:
    ValueError: for an unknown fold, or when a validation/test run is
      given no parameters.
  """
  # validate once up front instead of printing a message for every day
  if fold_selection not in ("1-Train", "2-Validate", "3-Test"):
    raise ValueError(
        f"Invalid fold selection: {fold_selection!r} "
        "(expected '1-Train', '2-Validate' or '3-Test')"
    )

  # resolve the estimator once; the three folds differ only in its parameters
  model = model_detail_dict[model_name]['model']
  if fold_selection != "1-Train":
    if parameters is None:
      raise ValueError(f"Fold {fold_selection!r} requires a parameter dict, got None")
    # NOTE: set_params changes the estimator stored in model_detail_dict
    model = model.set_params(**parameters)

  # take a copy of the input dfs
  train_data = train_split.copy()
  test_data = test_split.copy()

  # tracking features are excluded from the model inputs, as is the target
  analysis_feats = ['Date', 'Ticker']
  model_X_feats = [
      x for x in train_data.columns
      if x not in analysis_feats and x != predict_feat
  ]

  # predictions for all tickers
  all_predicted_signals = []

  for ticker_symbol in train_data['Ticker'].unique():
    # isolate this ticker's time series (reindexed so iloc == day number)
    ticker_train_df = train_data[train_data['Ticker'] == ticker_symbol].reset_index(drop=True)
    ticker_test_df = test_data[test_data['Ticker'] == ticker_symbol].reset_index(drop=True)

    # growing training set, forked from the input training set
    extended_train_data_X = ticker_train_df[model_X_feats].copy()
    extended_train_data_y = ticker_train_df[predict_feat].copy()
    test_data_X = ticker_test_df[model_X_feats].copy()
    test_data_y = ticker_test_df[predict_feat]
    # confirm all column names are string formatted (done once; concat keeps them)
    extended_train_data_X.columns = extended_train_data_X.columns.astype(str)
    test_data_X.columns = test_data_X.columns.astype(str)

    # reset per ticker: previously this list lived outside the loop, so each
    # ticker re-appended every earlier ticker's rows to the output
    ticker_predicted_signals = []

    for eval_day in range(len(ticker_test_df)):
      # one-row frame (double brackets) makes extending the train set easy
      curr_test_X = test_data_X.iloc[[eval_day]]
      curr_test_y = test_data_y.iloc[eval_day]
      curr_test_date = ticker_test_df['Date'].iloc[eval_day]

      # refit on everything seen so far, then predict the current day
      model.fit(extended_train_data_X, extended_train_data_y)
      y_pred = model.predict(curr_test_X)
      ticker_predicted_signals.append((curr_test_date, ticker_symbol, y_pred[0], curr_test_y))

      # only now reveal the day's features and actual label to later fits
      extended_train_data_X = pd.concat([extended_train_data_X, curr_test_X], ignore_index=True)
      extended_train_data_y = pd.concat([extended_train_data_y, pd.Series([curr_test_y])], ignore_index=True)

    # extend results to output
    all_predicted_signals.extend(ticker_predicted_signals)
    print("TICKER COMPLETED: ", ticker_symbol)

  # convert predictions to df
  pred_df = pd.DataFrame(all_predicted_signals, columns=["Date","Ticker",f"Predicted_Signal_{predict_feat}",f"Actual_Signal{predict_feat}"])

  return pred_df



In [170]:
# execute Train/Test Split based on Fold selection:
# AKA: load Seen/Unseen Split based on Fold selection:
fold_train_df, fold_test_df = load_data(active_fold, engineered_features, shared_drive)
# (should have no null values at this point)
# (column check allows user to view imported data columns)

Selected: Fold 1, Engineered Features
Data paths assembled
/content/drive/MyDrive/Capstone_Docs_Shared/train_fold_1_fe.csv
/content/drive/MyDrive/Capstone_Docs_Shared/test_fold_1_fe.csv
Data load complete

FOLD DATA STATS:
Training Split Null Values:
0
Testing Split Null Values:
0

Column Check:
Index(['Date', 'Ticker', 'Close', 'D_Return', 'W_Return', 'M_Return',
       'D_Return_Offset', 'W_Return_Offset', 'M_Return_Offset', 'D_2_Label',
       'W_2_Label', 'M_2_Label', 'D_3_Label', 'W_3_Label', 'M_3_Label',
       'D_5_Label', 'W_5_Label', 'M_5_Label', 'Open', 'High', 'Low', 'Volume',
       'UMCSENT', 'HOUST', 'RSXFS', 'ICSA', 'T10Y2Y', 'PPIACO', 'ADXDNO',
       'Year', 'RSI_5', 'RSI_7', 'RSI_14', 'RSI_21', 'RSI_30', 'RSI_50',
       'Bollinger_Lower_10', 'Bollinger_Upper_100', 'Bollinger_Lower_100',
       'Stoch_%K_5', 'Stoch_%K_7', 'Stoch_%K_14', 'Stoch_%K_30', 'ADX_7',
       'ADX_14', 'ADX_20', 'ADX_30', 'ADX_50', 'ROC_7', 'ROC_14', 'ROC_30',
       'Williams_R_[14]', 'Ichimo

In [171]:
# make copies of imported splits
# (keeps fold_train_df / fold_test_df untouched so this cell can be re-run)
train_df = fold_train_df.copy()
test_df = fold_test_df.copy()

# check for duplicate days:
# (keeps the first row per Date/Ticker pair)
train_df = check_duplicate_days(train_df)
test_df = check_duplicate_days(test_df)

# filter by feature extracted features:
train_df = filter_fe_cols(train_df, active_fold)
test_df = filter_fe_cols(test_df, active_fold)

# check for unnamed columns
train_df = check_for_unnamed_cols(train_df)
test_df = check_for_unnamed_cols(test_df)

# print unique tickers from train df
print("Number of Tickers In Seen Data:")
print(train_df['Ticker'].nunique())

# assemble lists to split tickers into more manageable sections:
# (10 = number of ticker groups)
ticker_lists = assemble_ticker_list(train_df, 10)
# (subselection carries through to later docs)
# Take a smaller subsample (the first group)
selected_ticker_list = ticker_lists[0]
#print(ticker_list)
# In the interest of time, take an even smaller subsample
# (first 5 tickers of that group)
subselected_ticker_list = selected_ticker_list[:5]
print("Tickers Selected For Modeling: ")
print(subselected_ticker_list)

# filter input by selected tickers:
# (the tickers are chosen from train_df and the same list is applied to test_df)
#train_df = filter_by_ticker(train_df, selected_ticker_list)
train_df = filter_by_ticker(train_df, subselected_ticker_list)
#test_df = filter_by_ticker(test_df, selected_ticker_list)
test_df = filter_by_ticker(test_df, subselected_ticker_list)

# need to remove non-utilized target columns from df input:
train_df = remove_unused_labels(train_df, predicted_feature)
test_df = remove_unused_labels(test_df, predicted_feature)

# ID offset columns, remove their non-offset counterparts before predictions:
train_df = remove_leakage_vars(train_df)
test_df = remove_leakage_vars(test_df)

# pre-modeling column check:
print(train_df.columns)



Unnamed cols removed
Unnamed cols removed
Number of Tickers In Seen Data:
676
Tickers Selected For Modeling: 
['AA' 'RF' 'BRO' 'HOG' 'SCI']
Number of Tickers In Selected Data Split:
5
Number of Days In Input Df:
411
Number of Tickers In Selected Data Split:
5
Number of Days In Input Df:
411
Index(['Date', 'Ticker', 'D_Return_Offset', 'W_Return_Offset',
       'M_Return_Offset', 'D_5_Label', 'UMCSENT', 'HOUST', 'RSXFS', 'ICSA',
       'T10Y2Y', 'PPIACO', 'ADXDNO', 'Year', 'RSI_5', 'RSI_7', 'RSI_14',
       'RSI_21', 'RSI_30', 'RSI_50', 'Bollinger_Lower_10',
       'Bollinger_Upper_100', 'Bollinger_Lower_100', 'Stoch_%K_5',
       'Stoch_%K_7', 'Stoch_%K_14', 'Stoch_%K_30', 'ADX_7', 'ADX_14', 'ADX_20',
       'ADX_30', 'ADX_50', 'ROC_7', 'ROC_14', 'ROC_30', 'Williams_R_[14]',
       'Ichimoku_B', 'Open_Offset', 'Low_Offset', 'High_Offset',
       'Volume_Offset'],
      dtype='object')
Index(['Date', 'Ticker', 'D_Return_Offset', 'W_Return_Offset',
       'M_Return_Offset', 'D_5_Label', '

In [172]:
# determine parameters to use based on fold selection
# (None for fold 1, searched for fold 2, loaded from csv for fold 3)
parameters = parameter_selection(selected_model, model_config_dict, active_fold, train_df, test_df, predicted_feature)

# if validation operations in fold 2...
# ...convert best params to df for export
# (one row per parameter: name in 'params', chosen value in 'selection_values';
#  despite its name, param_scores_df holds the best parameters, not scores)
if active_fold == "2-Validate":
  param_scores_df = pd.DataFrame([
      {'params':str(p), 'selection_values':s} for p, s in parameters.items()
  ])
  print(param_scores_df)
  # assemble filename for best params export
  # (same name that parameter_selection reads back in the "3-Test" fold)
  best_params_filename = f'best_params_{selected_model}_val.csv'
  # assemble filepath for best params export
  best_params_filepath = os.path.join(shared_drive, best_params_filename)
  # save best params to csv (overwrites any earlier export for this model)
  param_scores_df.to_csv(best_params_filepath, index=False)
  print("Validation Best Parameters File Generated")


Selected: Fold 1


In [173]:
# function to make an output path

def assemble_output_path(fold_selection, use_engineered_features, shared_drive, sel_parameters, model_str, pred_feat_str):
  """Build the CSV path that the predictions of this run are written to.

  Args:
    fold_selection: "1-Train", "2-Validate" or "3-Test".
    use_engineered_features: only consulted for "1-Train" ("_fe" vs "_all").
    shared_drive: output directory.
    sel_parameters: accepted for interface compatibility; not used in the name.
    model_str: model name included in the filename.
    pred_feat_str: predicted-feature abbreviation included in the filename.

  Returns:
    os.path.join(shared_drive, "predictions_<feat>_<model>_fold_<n>[...].csv").

  Raises:
    ValueError: for an unknown fold name (previously this printed a message
      and then failed with an UnboundLocalError).
  """
  # part of the filename shared by every fold
  name_prefix = f"predictions_{pred_feat_str}_{model_str}"

  # ID filename suffix based on fold selection and flags
  if fold_selection == "1-Train":
    if use_engineered_features:
      output_filename = f"{name_prefix}_fold_1_fe.csv"
      print("Output File Generating: Fold 1, Engineered Features")
    else:
      output_filename = f"{name_prefix}_fold_1_all.csv"
      print("Output File Generating: Fold 1, All Features")
  elif fold_selection == "2-Validate":
    output_filename = f"{name_prefix}_fold_2.csv"
    print("Output File Generating: Fold 2")
  elif fold_selection == "3-Test":
    # requires only one output, so easy to name based on model and fold
    output_filename = f"{name_prefix}_fold_3.csv"
    print("Output File Generating: Fold 3")
  else:
    raise ValueError(
        f"Invalid fold selection: {fold_selection!r} "
        "(expected '1-Train', '2-Validate' or '3-Test')"
    )

  # assemble output filepath:
  output_filepath = os.path.join(shared_drive, output_filename)
  print("Output filepath assembled")
  print(output_filepath)

  return output_filepath


In [174]:
# function to run modeling one ticker at a time, push updates to csv as they finish:
def iterate_and_export_per_ticker(out_filepath, model_select, model_config_dict_name, parameters, train_df, test_df, pred_feat, fold_selection, use_engineered_features):
  """Model one ticker at a time and append each ticker's predictions to a CSV.

  Args:
    out_filepath: target CSV; if it already exists, a "_YYYYmmdd_HHMM"
      timestamp is inserted before the extension so it is not overwritten.
    model_select: key into model_config_dict_name.
    model_config_dict_name: {name: {"model": estimator, ...}}.
    parameters: estimator parameters forwarded to daily_signal_prediction.
    train_df, test_df: preprocessed seen / unseen frames with a 'Ticker' column.
    pred_feat: name of the label column.
    fold_selection, use_engineered_features: forwarded to
      daily_signal_prediction. (Previously the notebook globals
      `active_fold` / `engineered_features` were used instead, so these
      two arguments were silently ignored.)

  Returns:
    None; the results are written to disk as each ticker completes, with
    the header row only on the first ticker.
  """
  from collections import Counter

  # lists of tickers available for export
  # (after all previous filtering)
  final_ticker_lst = train_df['Ticker'].unique()
  print(final_ticker_lst)

  # sanity output: rows per ticker and number of repeated ticker entries
  print(train_df['Ticker'].value_counts())
  print(train_df['Ticker'].duplicated().sum())

  print("Modeling Initiated for Selected Tickers, Please Wait...")

  # check if file already exists:
  if os.path.exists(out_filepath):
    print("Output File Already Exists, Incrementing")
    # add a timestamp before the extension only; str.replace(".csv", ...)
    # would also rewrite any ".csv" appearing earlier in the path
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    path_root, path_ext = os.path.splitext(out_filepath)
    out_filepath = f"{path_root}_{timestamp}{path_ext}"

    print("New Output Filepath:")
    print(out_filepath)

  print(Counter(final_ticker_lst))

  # flag for first export iteration:
  first_iteration = True
  # initialize loop to cycle prediction dfs per ticker:
  for ticker in final_ticker_lst:
    print(f"Ticker Modeling Initialized: {ticker}")
    print("Please wait...")
    # filter input dfs to isolate ticker[i] time series
    ticker_train_df = train_df[train_df['Ticker'] == ticker]
    ticker_test_df = test_df[test_df['Ticker'] == ticker]
    # make predictions using the fold / feature flags this function was given
    pred_df = daily_signal_prediction(model_select, model_config_dict_name, parameters, ticker_train_df, ticker_test_df, pred_feat, fold_selection, use_engineered_features)

    # append to csv -- only headers on 1st batch
    pred_df.to_csv(out_filepath, mode='a', header=first_iteration, index=False)
    print(f"Ticker Export Complete: {ticker}")
    # reset flag
    first_iteration = False

  return None


In [175]:
# strip underscores from the predicted feature for file naming conventions
# (str.replace is a no-op when there is no underscore, so no guard is needed;
#  the previous guard left feat_abb as None for such names, which produced
#  "predictions_None_..." filenames)
feat_abb = predicted_feature.replace("_", "")
# determine where to store prediction results
output_location = assemble_output_path(active_fold, engineered_features, shared_drive, parameters, selected_model, feat_abb)

Output File Generating: Fold 1, Engineered Features
Output filepath assembled
/content/drive/MyDrive/Capstone_Docs_Shared/predictions_D5Label_KNN_fold_1_fe.csv


In [176]:
# run the per-ticker modeling loop; predictions are appended to the csv at
# output_location (or a timestamped variant if that file already exists)
iterate_and_export_per_ticker(output_location, selected_model, model_config_dict, parameters, train_df, test_df, predicted_feature, active_fold, engineered_features)

# these lines are only reached if the call above returned without raising
print(f'End of {selected_model} {active_fold} NoteBook Process!')
print("Predictions and Export Successful")

['AA' 'RF' 'BRO' 'HOG' 'SCI']
Ticker
AA     411
RF     411
BRO    411
HOG    411
SCI    411
Name: count, dtype: int64
2050
Modeling Initiated for Selected Tickers, Please Wait...
Output File Already Exists, Incrementing
New Output Filepath:
/content/drive/MyDrive/Capstone_Docs_Shared/predictions_D5Label_KNN_fold_1_fe_20250421_1749.csv
Counter({'AA': 1, 'RF': 1, 'BRO': 1, 'HOG': 1, 'SCI': 1})
Ticker Modeling Initialized: AA
Please wait...
TICKER COMPLETED:  AA
Ticker Export Complete: AA
Ticker Modeling Initialized: RF
Please wait...
TICKER COMPLETED:  RF
Ticker Export Complete: RF
Ticker Modeling Initialized: BRO
Please wait...
TICKER COMPLETED:  BRO
Ticker Export Complete: BRO
Ticker Modeling Initialized: HOG
Please wait...
TICKER COMPLETED:  HOG
Ticker Export Complete: HOG
Ticker Modeling Initialized: SCI
Please wait...
TICKER COMPLETED:  SCI
Ticker Export Complete: SCI
End of KNN 1-Train NoteBook Process!
Predictions and Export Successful
