In [1]:
import pandas as pd
import numpy as np
import datetime 
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, RepeatedKFold
from sklearn.linear_model import Lasso, LogisticRegression

from sklearn.decomposition import PCA
from sklearn import ensemble, tree, metrics


## Define helper functions
We need two helpers: one that splits the data into train, test, and holdout sets, and one that runs the grid search.

In [2]:
def train_test_split(data, label, train_size=.8, holdout_size=.2, random_state=None):
    """ 
    Split processed data into normalized train, test and holdout sets and extract their labels.

    A holdout set is sampled from `data` first; the remaining rows are then divided into
    train and test sets. Features are scaled with a StandardScaler that is fit on the
    training features only, so no test/holdout statistics leak into the scaling.

    Not to be confused with sklearn.model_selection.train_test_split, which has a
    different signature and return value.

    data: pandas DataFrame of model data, inclusive of the `label` column
    label: name of the column that holds the classification labels
    train_size: fraction (0 < train_size < 1) of the NON-holdout rows used for training;
        the remaining non-holdout rows form the test set
    holdout_size: fraction (0 < holdout_size < 1) of all rows reserved as holdout
    random_state: optional seed/RNG forwarded to DataFrame.sample for a reproducible split;
        None keeps the original unseeded behaviour

    returns: (normed_train_data, normed_test_data, normed_holdout_data,
              train_labels, test_labels, holdout_labels).
        train_labels and test_labels have their index reset; holdout_labels keeps the
        index of `data`.
    raises: ValueError if train_size or holdout_size is not strictly between 0 and 1
    """
    if not 0 < train_size < 1:
        raise ValueError(f"train_size must be strictly between 0 and 1, got {train_size!r}")
    if not 0 < holdout_size < 1:
        raise ValueError(f"holdout_size must be strictly between 0 and 1, got {holdout_size!r}")

    # Carve out holdout data (rows are removed by index label, so the index should be unique)
    holdout = data.sample(frac=holdout_size, random_state=random_state)
    non_holdout_data = data.drop(holdout.index, axis=0)
    holdout_labels = holdout[label]
    holdout_data = holdout.drop([label], axis=1)

    # split out train and test from non-holdout data
    labeled_train_data = non_holdout_data.sample(frac=train_size, random_state=random_state)
    labeled_test_data = non_holdout_data.drop(labeled_train_data.index)

    train_data = labeled_train_data.drop([label], axis=1).reset_index(drop=True)
    train_labels = labeled_train_data[label].reset_index(drop=True)

    test_data = labeled_test_data.drop([label], axis=1).reset_index(drop=True)
    test_labels = labeled_test_data[label].reset_index(drop=True)

    # normalize data: fit on train only, then apply the same transform to test and holdout
    scaler = StandardScaler()
    normed_train_data = scaler.fit_transform(train_data)
    normed_test_data = scaler.transform(test_data)
    normed_holdout_data = scaler.transform(holdout_data)

    return normed_train_data, normed_test_data, normed_holdout_data, train_labels, test_labels, holdout_labels

        
def run_grid(clf, grid, train_data, test_data, train_labels, test_labels, n_jobs=-1, score="accuracy"):
    """ 
    Runs SKLearn implementation of grid search and reports accuracy on the test set.

    clf: un-fit SKLearn classifier
    grid: params upon which grid search will run
    train_data: features the grid search is fit on
    test_data: features used to evaluate the selected model
    train_labels: labels matching train_data
    test_labels: labels matching test_data
    n_jobs: forwarded to GridSearchCV as n_jobs
    score: forwarded to GridSearchCV as scoring; this is the metric used to pick the model

    returns: the search's best_estimator_. Note that the printed figure is always
        accuracy on the test set, whatever `score` was used for model selection.
    """
    grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=n_jobs, scoring=score)
    best_clf = grid_search.fit(train_data, train_labels).best_estimator_

    # get model predictions and accuracy
    preds = best_clf.predict(test_data)
    # mean predicted label: a quick look at how the predictions are distributed
    print(f"PREDS: {preds.mean():.2f}")
    acc = metrics.accuracy_score(test_labels, preds)
    print(f"Model Accuracy: {acc*100:.1f}%")

    return best_clf

Split the data and define the parameter lists that the grid search will iterate over.

For this exercise, we will use a random forest classifier.

In [7]:
# NOTE(review): absolute, machine-specific paths; point these at a configurable
# data directory before sharing the notebook.
MODEL_DATA_CSV = "C:/Users/lbianculli/equity_analysis/model_data_final.csv"
SPLIT_PICKLE = "C:/Users/lbianculli/dev/us_equities/models/train_data.p"

# Seed NumPy's global RNG so that re-running this cell reproduces the same split
# (DataFrame.sample draws from it when no random_state is given).
SEED = 42
np.random.seed(SEED)

# "Unnamed: 0" is dropped before modelling (presumably an index column written by
# an earlier to_csv call; verify against the file).
model_data = pd.read_csv(MODEL_DATA_CSV).drop("Unnamed: 0", axis=1)
train_data, test_data, holdout_data, train_labels, test_labels, holdout_labels = train_test_split(model_data, "label")

# save data and labels so we are always using/not using the same data!
with open(SPLIT_PICKLE, "wb") as f:
    pickle.dump([train_data, test_data, holdout_data, train_labels, test_labels, holdout_labels], f)


# define params to iterate over for search
print(f"Number of features: {train_data.shape[1]}")

# set up grid search for Random Forest Classifier
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for RandomForestClassifier and has been removed
# from recent scikit-learn releases, so listing both fit every combination twice (or
# raised an error). Add e.g. None or a float here to widen the search.
max_features = ['sqrt']

# Maximum number of levels in tree
max_depth = [None, 25, 50]

# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5]

# Class weighting: None treats all classes equally, "balanced" re-weights them
# inversely to their frequency in the training labels
class_weight = [None, "balanced"]

rf_grid = { 
    'max_features': max_features, 
    'max_depth': max_depth, 
    'min_samples_split': min_samples_split, 
    "class_weight": class_weight}



Number of features: 14


In [6]:
# Base estimator for the search: a 500-tree random forest using all available cores
rf_clf = ensemble.RandomForestClassifier(n_estimators=500, n_jobs=-1)

# Grid-search over rf_grid and keep the best estimator; run_grid also prints
# the selected model's accuracy on the test split
best_rf = run_grid(
    rf_clf,
    rf_grid,
    train_data=train_data,
    test_data=test_data,
    train_labels=train_labels,
    test_labels=test_labels,
    n_jobs=6,
    score="accuracy",
)

# Persist the selected model so the evaluation section can load it
with open("C:/Users/lbianculli/dev/us_equities/models/sk_random_forest_final.f", "wb") as model_file:
    pickle.dump(best_rf, model_file)

PREDS: -0.70
Model Accuracy: 86.1%


## Now that we have our classifier, we move on to model evaluation