In [300]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import datetime as dt
from sklearn.model_selection import GridSearchCV

In [301]:
#reading in data
#dropping extra index
df = pd.read_csv('Test Data/quarterly_data.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,year,month,decision,currentRatio,quickRatio,cashRatio,daysOfSalesOutstanding,daysOfInventoryOutstanding,operatingCycle,daysOfPayablesOutstanding,...,priceToSalesRatio,priceEarningsRatio,priceToFreeCashFlowsRatio,priceToOperatingCashFlowsRatio,priceCashFlowRatio,priceEarningsToGrowthRatio,priceSalesRatio,dividendYield,enterpriseValueMultiple,priceFairValue
0,2000,6,2.0,2.753337,2.558996,0.717565,193.4,0.351014,48.038685,73.01092,...,9.328204,85.119863,76.340684,72.75202,72.75202,137.685255,9.328204,0.0,111.74806,4.076622
1,2000,9,2.0,2.807553,2.576306,0.616141,186.013369,2.116892,47.983203,74.21953,...,4.475814,49.233954,65.903718,52.973242,52.973242,79.638163,4.475814,0.0,63.671433,2.037928
2,2000,12,2.0,3.009163,2.752596,1.061087,159.846077,1.838521,41.252623,55.768482,...,4.98054,-25.720019,-143.29725,-385.800289,-385.800289,-43.155875,4.98054,0.0,-11.078407,1.351132
3,2001,3,2.0,2.977716,2.66351,1.191086,162.477289,0.860421,40.923314,82.686424,...,5.337514,177.627508,-125.212833,-173.590519,-173.590519,305.918987,5.337514,0.0,869.220315,2.043334
4,2001,6,2.0,3.251549,2.983891,1.314126,147.979661,1.642651,38.130787,73.227666,...,5.497488,132.931059,-128.711026,-245.721049,-245.721049,230.71672,5.497488,0.0,79.262213,2.101813


In [302]:
#splitting data
X = df.drop(['decision'], 1)
y = df.decision

#train test split
X_train, X_test, y_train, y_test, = train_test_split(X, y)
df

Unnamed: 0,year,month,decision,currentRatio,quickRatio,cashRatio,daysOfSalesOutstanding,daysOfInventoryOutstanding,operatingCycle,daysOfPayablesOutstanding,...,priceToSalesRatio,priceEarningsRatio,priceToFreeCashFlowsRatio,priceToOperatingCashFlowsRatio,priceCashFlowRatio,priceEarningsToGrowthRatio,priceSalesRatio,dividendYield,enterpriseValueMultiple,priceFairValue
0,2000,6,2.0,2.753337,2.558996,0.717565,193.400000,0.351014,48.038685,73.010920,...,9.328204,85.119863,76.340684,72.752020,72.752020,137.685255,9.328204,0.000000,111.748060,4.076622
1,2000,9,2.0,2.807553,2.576306,0.616141,186.013369,2.116892,47.983203,74.219530,...,4.475814,49.233954,65.903718,52.973242,52.973242,79.638163,4.475814,0.000000,63.671433,2.037928
2,2000,12,2.0,3.009163,2.752596,1.061087,159.846077,1.838521,41.252623,55.768482,...,4.980540,-25.720019,-143.297250,-385.800289,-385.800289,-43.155875,4.980540,0.000000,-11.078407,1.351132
3,2001,3,2.0,2.977716,2.663510,1.191086,162.477289,0.860421,40.923314,82.686424,...,5.337514,177.627508,-125.212833,-173.590519,-173.590519,305.918987,5.337514,0.000000,869.220315,2.043334
4,2001,6,2.0,3.251549,2.983891,1.314126,147.979661,1.642651,38.130787,73.227666,...,5.497488,132.931059,-128.711026,-245.721049,-245.721049,230.716720,5.497488,0.000000,79.262213,2.101813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,2019,9,1.0,1.540126,1.384447,0.462022,261.062773,9.301986,73.673629,104.745891,...,15.344777,71.801806,57.355952,49.356078,49.356078,114.617760,15.344777,0.003540,51.283681,10.859777
78,2019,12,1.0,1.597782,1.439962,0.389297,158.793823,6.514434,45.669075,71.728737,...,13.934791,57.540859,45.037789,41.928121,41.928121,90.303099,13.934791,0.002766,45.340375,14.290900
79,2020,3,1.0,1.495962,1.297979,0.418070,192.017303,8.348218,55.694950,81.181037,...,18.523681,96.023774,94.272250,81.148782,81.148782,148.821951,18.523681,0.003125,68.338017,13.773305
80,2020,6,1.0,1.469450,1.312449,0.350228,196.152718,9.674909,58.041333,85.914066,...,25.551733,135.524323,103.702924,93.728425,93.728425,207.752079,25.551733,0.002397,96.317147,21.098686


In [303]:
#algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [304]:
#creating pipelines 
#Random Forest
pipeline_rf = Pipeline([('scale', StandardScaler()), ('clf', RandomForestClassifier())])

#DecisionTree
pipeline_dt = Pipeline([('scale', StandardScaler()), ('clf', DecisionTreeClassifier())])

#K Nearest Neighbors
pipeline_knn = Pipeline([('scale', StandardScaler()), ('clf', KNeighborsClassifier())])

In [305]:
#creating dictionary for models
pipelines = [pipeline_rf, pipeline_dt, pipeline_knn]

models = ['RandomForest', 'DecisionTree', 'KNN']
model_pipelines = dict(zip(models, pipelines))

In [306]:
# Looping through each Pipeline to fit and train each model
for name, pipe in model_pipelines.items():
    print(pipe)
    pipe.fit(X_train, y_train)

Pipeline(steps=[('scale', StandardScaler()), ('clf', RandomForestClassifier())])
Pipeline(steps=[('scale', StandardScaler()), ('clf', DecisionTreeClassifier())])
Pipeline(steps=[('scale', StandardScaler()), ('clf', KNeighborsClassifier())])


In [307]:
# Dictionary containing the model names and their scores
models_f1 = {}

# Looping through each model's predictions and getting their classification reports
for name, pipe in model_pipelines.items():
    print('\n'+ name + ' (Macro Avg - F1 Score):')
    
    # Classification Report - NEED to add Sell (wont work bc AAPL doesnt have a period to Sell)
    report = classification_report(y_test, pipe.predict(X_test), target_names=['Buy', 'Hold'], output_dict=True)
    f1 = report['macro avg']['f1-score']
    
    # Assigning to the Dictionary
    models_f1[name] = f1

    print(f1)


RandomForest (Macro Avg - F1 Score):
0.81524926686217

DecisionTree (Macro Avg - F1 Score):
0.81524926686217

KNN (Macro Avg - F1 Score):
0.8328912466843501


In [308]:
# Creating parameters to iterate through for each classifier
rf_params = {'clf__n_estimators': [10,50,100,200],
             'clf__criterion': ['gini', 'entropy'],
             'clf__max_depth': [None, 2, 5],
             'clf__min_samples_split': [2,4,8],
             'clf__min_samples_leaf': [1, 2, 5]}

dt_params = {'clf__criterion': ['gini', 'entropy'],
             'clf__splitter': ['best', 'random'],
             'clf__max_depth': [None, 2, 5],
             'clf__min_samples_split': [2, 4, 8],
             'clf__min_samples_leaf': [1,3,5]}

knn_params = {"clf__n_neighbors": [5, 25, 50, 100, 200],
              "clf__weights": ['uniform', 'distance'],
              "clf__p": [1,2]}

In [309]:
# Dictionary for top 3 classifiers with their params
classifiers = {'RandomForest': [pipeline_rf, rf_params], 'DecisionTree': [pipeline_dt, dt_params],'KNN': [pipeline_knn, knn_params]}

In [310]:
# Dictionary storing the best estimators for each classifier
best_est = {}

# Grid Searching each classifier
for name, vals in classifiers.items():
    print(name + ' -'*30)
    
    # Grid Searching 
    gridsearch = GridSearchCV(vals[0], vals[1] , cv=3, return_train_score=True, verbose=2, scoring='f1_macro')
    gridsearch.fit(X_train, y_train)
    
    # Printing out the best parameters for the selected classifier
    print(name+' Best Parameters: ', gridsearch.best_params_)
    
    # Saving the best estimators/tuned models
    best_est[name+'_clf'] = gridsearch.best_estimator_

RandomForest - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] clf__criterion=gini, clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=10 
[CV]  clf__criterion=gini, clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=10, total=   0.0s
[CV] clf__criterion=gini, clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=10 
[CV]  clf__criterion=gini, clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=10, total=   0.1s
[CV] clf__criterion=gini, clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=10 
[CV]  clf__criterion=gini, clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=10, total=   0.0s
[CV] clf__criterion=gini, clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, c

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 40, n_neighbors = 50

In [313]:
# Looping through each tuned model's predictions and getting their classification reports
for name, pipe in best_est.items():
    print('\nTuned '+ name + ' (Macro Avg - F1 Score):')
    
    report = classification_report(y_test, pipe.predict(X_test), target_names=['Buy', 'Hold'], output_dict=True)
    print(report['macro avg']['f1-score'])


Tuned RandomForest_clf (Macro Avg - F1 Score):
0.81524926686217

Tuned DecisionTree_clf (Macro Avg - F1 Score):
0.81524926686217
