# Model Training

## Import packages and load the DataFrame

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import nbimporter
from helper import *
%run helper.ipynb

ModuleNotFoundError: No module named 'helper'

In [2]:
# Load the fundamentals dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='final_fundamental_data.csv')

# Sanity check: print (rows, columns), then preview the first ten rows.
print(df.shape)
df.head(n=10)

(1057, 17)


Unnamed: 0,ticker,company_name,price,52_week_delta,dividend_payout_ratio,foward_eps,g_revenue,gross_margin_pct,ebitda_margin_pct,price_to_foward_earnings,price_to_book,debt_to_equity,return_on_assets,return_on_equity,largest_institutional_owner,log_mktcap,analyst_rating
0,FLWS,"1-800 FLOWERS.COM, Inc.",12.84,0.03451,0.0,0.68,0.105,0.42145,0.0615,18.882,2.516167,52.765,0.04989,0.11756,Dimensional Fund Advisors LP,19.952916,1
1,TWOU,"2U, Inc.",22.99,-0.597681,0.0,-1.09,0.438,0.73874,-0.1169,-21.092,1.988066,42.584,-0.07259,-0.26075,"Vanguard Group, Inc. (The)",21.101246,0
2,DDD,3D Systems Corporation,8.55,-0.267826,0.0,0.09,-0.056,0.44765,-0.00901,95.0,1.950274,19.777,-0.04547,-0.12539,Blackrock Inc.,20.735688,0
3,MMM,3M Company,171.47,-0.171188,0.6738,9.69,-0.02,0.48029,0.24773,17.696,9.213368,189.511,0.10052,0.47079,"Vanguard Group, Inc. (The)",25.314378,0
4,AOS,A.O. Smith Corporation,46.79,0.091459,0.3667,2.53,-0.034,0.40032,0.19242,18.494,4.608036,22.401,0.10481,0.23737,"Vanguard Group, Inc. (The)",22.581592,1
5,AIR,AAR Corp.,44.96,0.092778,0.1266,2.96,0.161,0.1598,0.07353,15.189,1.702385,31.776,0.04381,0.08952,Blackrock Inc.,21.175783,1
6,AAN,"Aaron's, Inc.",59.05,0.325263,0.0481,4.49,0.011,0.47809,0.16109,13.151,2.123108,38.634,0.11479,0.11031,Blackrock Inc.,22.100851,1
7,ABT,Abbott Laboratories,85.48,0.201918,0.6739,3.6,0.055,0.58817,0.2471,23.744,4.751265,62.466,0.04215,0.10468,"Vanguard Group, Inc. (The)",25.741657,1
8,ABMD,"ABIOMED, Inc.",189.22,-0.415838,0.0,4.95,0.128,0.82834,0.32333,38.226,8.633875,1.281,0.15436,0.2475,"Vanguard Group, Inc. (The)",22.868524,1
9,ABM,ABM Industries Incorporated,38.14,0.365978,0.5417,2.1,0.015,0.1107,0.04759,18.162,1.684108,62.297,0.03365,0.05966,Blackrock Inc.,21.651832,1


## Drop unneeded columns and create dummies

In [3]:
# Remove the ticker, company name and price columns from the frame in place.
df.drop(columns=['ticker', 'company_name', 'price'], inplace=True)

In [4]:
# `pop` removes the categorical owner column from `df` in place and hands it
# to get_dummies; drop_first=True omits the first category's indicator column.
inst_dummies = pd.get_dummies(df.pop('largest_institutional_owner'), drop_first=True)

# Append the indicator columns to the right of the remaining columns.
df = pd.concat([df, inst_dummies], axis=1)

df.head()

## Separate the features (X) from the target (y)

In [6]:
# Target: the binary analyst rating column.
y = df['analyst_rating']
# Features: every remaining column.
X = df.drop(columns='analyst_rating')
# Keep the feature names around for later reference.
feature_cols = X.columns

## Create a baseline model

In [80]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [81]:
# Fit a default logistic regression as the baseline and predict on the hold-out set.
logreg = LogisticRegression().fit(X_train, y_train)
y_log_pred = logreg.predict(X_test)

# Map each feature name to its fitted coefficient (coef_ has one row for a binary target).
dictionary = {
    feature: weight
    for feature, weight in zip(X_train.columns, logreg.coef_[0])
}
dictionary

{'52_week_delta': 0.8130334450647939,
 'dividend_payout_ratio': -1.3714470720944554,
 'foward_eps': 0.002143401219271869,
 'g_revenue': 1.4674940595745034,
 'gross_margin_pct': 0.21003430972247164,
 'ebitda_margin_pct': 0.5820988565852852,
 'price_to_foward_earnings': -0.0017501896010852392,
 'price_to_book': 0.02536050910660828,
 'debt_to_equity': 8.439284837984526e-05,
 'return_on_assets': -1.0278788473885485,
 'return_on_equity': -0.7429566240695976,
 'log_mktcap': 0.08889168754603873,
 'Dimensional Fund Advisors LP': 0.424995362681745,
 'FMR, LLC': 0.3540224374972843,
 'Price (T.Rowe) Associates Inc': 0.30599635146541476,
 'Vanguard Group, Inc. (The)': 0.07674303607930966}

In [82]:
# Score the baseline predictions with the four metrics used throughout the notebook.
base_accuracy, base_precision, base_recall, base_f1 = (
    scorer(y_test, y_log_pred)
    for scorer in (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
)

# Emit the report in a single call; sep='\n' puts each item on its own line.
print(
    '--'*16,
    'Baseline Model Performance:',
    f' Accuracy: {base_accuracy : 0.2%}',
    f' Precision: {base_precision : 0.2%}',
    f' Recall: {base_recall : 0.2%}',
    f' F1 Score: {base_f1 : 0.2%}',
    '--'*16,
    sep='\n',
)

--------------------------------
Baseline Model Performance:
 Accuracy:  71.23%
 Precision:  72.77%
 Recall:  93.92%
 F1 Score:  82.01%
--------------------------------


### Examine the Confusion Matrix

In [86]:
# Pin the axis order explicitly: row/column 0 is label 0, row/column 1 is label 1.
base_cm = confusion_matrix(y_test, y_log_pred, labels=[0, 1])
# Display names must follow the same order as the matrix axes (label 0 first).
# Assumes analyst_rating == 1 encodes "Buy" and 0 encodes "Not Buy" -- TODO confirm
# against the notebook that built final_fundamental_data.csv.
base_classes = ['Not Buy', 'Buy']

## K Nearest Neighbors

In [83]:
# Same arguments and seed as the baseline split, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [84]:
# Odd K values from 1 through 13.
neighbors = [1, 3, 5, 7, 9, 11, 13]

# NOTE(review): the features are passed to KNN without any scaling step, and K is
# compared on the hold-out set rather than via cross-validation -- worth revisiting.
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor).fit(X_train, y_train)
    y_knn_pred = knn.predict(X_test)
    # One print call per K; sep='\n' keeps each metric on its own line.
    print(
        '--'*16,
        f'K = {neighbor}',
        f' Accuracy: {metrics.accuracy_score(y_test, y_knn_pred) : 0.2%}',
        f' Precision: {metrics.precision_score(y_test, y_knn_pred) : 0.2%}',
        f' Recall: {metrics.recall_score(y_test, y_knn_pred) : 0.2%}',
        f' F1 Score: {metrics.f1_score(y_test, y_knn_pred) : 0.2%}',
        sep='\n',
    )

# Closing rule after the last K.
print('--'*16)

--------------------------------
K = 1
 Accuracy:  56.60%
 Precision:  69.44%
 Recall:  67.57%
 F1 Score:  68.49%
--------------------------------
K = 3
 Accuracy:  54.72%
 Precision:  68.31%
 Recall:  65.54%
 F1 Score:  66.90%
--------------------------------
K = 5
 Accuracy:  56.60%
 Precision:  68.18%
 Recall:  70.95%
 F1 Score:  69.54%
--------------------------------
K = 7
 Accuracy:  61.32%
 Precision:  70.37%
 Recall:  77.03%
 F1 Score:  73.55%
--------------------------------
K = 9
 Accuracy:  64.15%
 Precision:  70.00%
 Recall:  85.14%
 F1 Score:  76.83%
--------------------------------
K = 11
 Accuracy:  66.51%
 Precision:  71.04%
 Recall:  87.84%
 F1 Score:  78.55%
--------------------------------
K = 13
 Accuracy:  67.92%
 Precision:  71.28%
 Recall:  90.54%
 F1 Score:  79.76%
--------------------------------


### Examine the Confusion Matrix

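Even at K=13, the best-performing value we tried, KNN did not beat our baseline logistic regression model (67.92% vs. 71.23% accuracy, 79.76% vs. 82.01% F1).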

## Decision Tree

In [73]:
# Same arguments and seed as the earlier splits, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [74]:
# Seed the tree so repeated runs give the same score: scikit-learn permutes the
# candidate features at each split, so an unseeded tree can differ between runs.
# 42 matches the seed already used for train_test_split in this notebook.
dt_clf = DecisionTreeClassifier(random_state=42)

# 3-fold cross-validation on the training portion only; the hold-out set is untouched.
dt_cv_score = cross_val_score(dt_clf, X_train, y_train, cv=3)
mean_dt_cv_score = np.mean(dt_cv_score)

print(f"Mean Cross Validation Score: {mean_dt_cv_score :.2%}")

Mean Cross Validation Score: 62.13%


In [None]:




# The owner column is already removed from `df` (and therefore absent from `X`)
# by the dummy-encoding cell near the top of the notebook, so indexing or dropping
# it unconditionally raises KeyError on a fresh Restart & Run All. Only encode it
# when it is still present; otherwise leave `X` as it is.
if 'largest_institutional_owner' in X.columns:
    inst_dummies = pd.get_dummies(X['largest_institutional_owner'], drop_first=True)
    X = pd.concat([X.drop(columns='largest_institutional_owner'), inst_dummies], axis=1)

print(X.shape)
X.head(10)

y




## Decision Tree Model


# Hyperparameter values to try for the decision tree.
dt_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 2, 3, 4, 5, 6],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 3, 4, 5, 6],
)

# Exhaustive 3-fold search; training scores are kept for later inspection.
dt_grid_search = GridSearchCV(
    estimator=dt_clf,
    param_grid=dt_param_grid,
    cv=3,
    return_train_score=True,
)

# Run the search on the training portion.
dt_grid_search.fit(X_train, y_train)

# Rebuild X and y from `df`. Some of these columns were already dropped in place
# earlier in the notebook, and 'shares_outstanding' is not among the columns shown
# when the CSV was loaded, so errors='ignore' skips whichever are absent instead
# of raising KeyError.
X = df.drop(
    columns=['ticker', 'company_name', 'price', 'shares_outstanding', 'analyst_rating'],
    errors='ignore',
)
y = df['analyst_rating']
feature_cols = X.columns

# One-hot encode the owner column only if it has not been encoded already.
if 'largest_institutional_owner' in X.columns:
    inst_dummies = pd.get_dummies(X['largest_institutional_owner'], drop_first=True)
    X = pd.concat([X.drop(columns='largest_institutional_owner'), inst_dummies], axis=1)

print(X.shape)
X.head(10)

y

## Create a baseline model

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Default logistic regression fitted on the training portion.
logreg = LogisticRegression().fit(X_train, y_train)

# Feature name -> fitted coefficient.
dictionary = {
    feature: weight
    for feature, weight in zip(X_train.columns, logreg.coef_[0])
}
dictionary

y_pred = logreg.predict(X_test)

# Unformatted hold-out metrics, one per line.
print(
    f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}',
    f'Precision: {metrics.precision_score(y_test, y_pred)}',
    f'Recall: {metrics.recall_score(y_test, y_pred)}',
    f'F1 Score: {metrics.f1_score(y_test, y_pred)}',
    sep='\n',
)

## KNN Model

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# KNN with the library's default settings, fitted on the training portion.
knn = KNeighborsClassifier().fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Unformatted hold-out metrics, one per line.
print(
    f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}',
    f'Precision: {metrics.precision_score(y_test, y_pred)}',
    f'Recall: {metrics.recall_score(y_test, y_pred)}',
    f'F1 Score: {metrics.f1_score(y_test, y_pred)}',
    sep='\n',
)

## Decision Tree Model

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree as well: scikit-learn permutes candidate features at each split,
# so an unseeded tree can produce different scores from run to run.
dt_clf = DecisionTreeClassifier(random_state=42)

# 3-fold cross-validation on the training portion only.
dt_cv_score = cross_val_score(dt_clf, X_train, y_train, cv=3)
mean_dt_cv_score = np.mean(dt_cv_score)

print(f"Mean Cross Validation Score: {mean_dt_cv_score :.2%}")

# Hyperparameter values to try for the decision tree.
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6]
}

# Instantiate GridSearchCV (3-fold; training scores retained for later inspection).
# The search is built and fitted once only -- a second identical
# instantiate-and-fit merely repeats the whole search.
dt_grid_search = GridSearchCV(dt_clf, dt_param_grid, cv=3, return_train_score=True)

# Fit to the data
dt_grid_search.fit(X_train, y_train)

# Mean training score (across CV folds) of the *selected* parameter combination.
# Averaging 'mean_train_score' over every candidate in the grid would mix in
# combinations that were not chosen and is not comparable with the test score below.
# Relies on the search having been built with return_train_score=True.
dt_gs_training_score = dt_grid_search.cv_results_['mean_train_score'][dt_grid_search.best_index_]

# Score of the refit best estimator on the hold-out set (a single score, not a mean).
dt_gs_testing_score = dt_grid_search.score(X_test, y_test)

print(f"Mean Training Score: {dt_gs_training_score :.2%}")
print(f"Mean Test Score: {dt_gs_testing_score :.2%}")
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search.best_params_

## Random Forest Model

# Seed the forest so bootstrap sampling and feature sub-sampling are repeatable;
# 42 matches the seed already used for train_test_split in this notebook.
rf_clf = RandomForestClassifier(random_state=42)

# 3-fold cross-validation on the training portion only.
mean_rf_cv_score = np.mean(cross_val_score(rf_clf, X_train, y_train, cv=3))

print(f"Mean Cross Validation Score for Random Forest Classifier: {mean_rf_cv_score :.2%}")

# Hyperparameter values to try for the random forest.
rf_param_grid = {
    'n_estimators': [10, 30, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 6, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [3, 6]
}

# Exhaustive 3-fold search over the grid, fitted on the training portion.
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid, cv=3)
rf_grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best parameter combination
# (computed on the validation folds), not accuracy on the training data, so the
# label says so.
print(f"Mean Cross-Validated Accuracy: {rf_grid_search.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

# Score both tuned searches on the same hold-out set, in a fixed order.
dt_score, rf_score = (
    search.score(X_test, y_test)
    for search in (dt_grid_search, rf_grid_search)
)

# Side-by-side hold-out scores.
print('Decision tree grid search: ', dt_score)
print('Random forest grid search: ', rf_score)