In [1]:
# libraries!
import numpy as np      # numpy is Python's "array" library
import pandas as pd     # Pandas is Python's "data" library ("dataframe" == spreadsheet)

In [2]:
# let's read in our stock data...
# 

filename = 'stocks_cleaned.csv'
df_tidy = pd.read_csv(filename)      # encoding = "utf-8", "latin1"
print(f"{filename} : file read into a pandas dataframe.")

stocks_cleaned.csv : file read into a pandas dataframe.


In [3]:
df_tidy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2324923 entries, 0 to 2332530
Data columns (total 11 columns):
Open            float64
High            float64
Low             float64
Close           float64
Volume          int64
Target          float64
Year            int64
Month           int64
Day             int64
twoclass        int64
multiclasses    int64
dtypes: float64(5), int64(6)
memory usage: 212.9 MB


In [4]:
df_tidy.head()

Unnamed: 0,Open,High,Low,Close,Volume,Target,Year,Month,Day,twoclass,multiclasses
0,2734.0,2755.0,2730.0,2742.0,31400,0.00073,2017,1,4,1,5
1,568.0,576.0,563.0,571.0,2798500,0.012324,2017,1,4,1,7
2,3150.0,3210.0,3140.0,3210.0,270800,0.006154,2017,1,4,1,6
3,1510.0,1550.0,1510.0,1550.0,11300,0.011053,2017,1,4,1,7
4,3270.0,3350.0,3270.0,3330.0,150800,0.003026,2017,1,4,1,5


### 2 Classes Experiment

In [5]:
#
# All of the columns need to be numeric, we'll drop Target and multiclasses
ROW = 0
COLUMN = 1
df_model1 = df_tidy.drop(['Target', 'multiclasses'], axis=COLUMN )
df_model1.head()

Unnamed: 0,Open,High,Low,Close,Volume,Year,Month,Day,twoclass
0,2734.0,2755.0,2730.0,2742.0,31400,2017,1,4,1
1,568.0,576.0,563.0,571.0,2798500,2017,1,4,1
2,3150.0,3210.0,3140.0,3210.0,270800,2017,1,4,1
3,1510.0,1550.0,1510.0,1550.0,11300,2017,1,4,1
4,3270.0,3350.0,3270.0,3330.0,150800,2017,1,4,1


In [63]:
# let's create a dictionary to look up any column index by name
COLUMNS = df_model1.columns
COL_INDEX = {}
for i, name in enumerate(COLUMNS):
    COL_INDEX[name] = i  # using the name (as key), look up the value (i)
print(f"COL_INDEX is {COL_INDEX}\n\n")


#
# and our "class" names
#


SPECIES = ['negative', 'positive']   # int to str
SPECIES_INDEX = {'negative':0,'positive':1}  # str to int

# Let's try it out...
for name in SPECIES:
    print(f"{name} maps to {SPECIES_INDEX[name]}")

COL_INDEX is {'Open': 0, 'High': 1, 'Low': 2, 'Close': 3, 'Volume': 4, 'Year': 5, 'Month': 6, 'Day': 7, 'multiclasses': 8}


negative maps to 0
positive maps to 1


In [7]:
#
# let's convert our dataframe to a numpy array, named A
#
A = df_model1.to_numpy()   
print(A)

[[2.734e+03 2.755e+03 2.730e+03 ... 1.000e+00 4.000e+00 1.000e+00]
 [5.680e+02 5.760e+02 5.630e+02 ... 1.000e+00 4.000e+00 1.000e+00]
 [3.150e+03 3.210e+03 3.140e+03 ... 1.000e+00 4.000e+00 1.000e+00]
 ...
 [1.690e+03 1.690e+03 1.645e+03 ... 1.200e+01 3.000e+00 0.000e+00]
 [2.388e+03 2.396e+03 2.380e+03 ... 1.200e+01 3.000e+00 1.000e+00]
 [6.900e+02 7.110e+02 6.860e+02 ... 1.200e+01 3.000e+00 1.000e+00]]


In [8]:
#
# let's make sure it's all floating-point (here, it already is, but in other datasets it might not be)
#
A = A.astype('float64')  
print(A)

[[2.734e+03 2.755e+03 2.730e+03 ... 1.000e+00 4.000e+00 1.000e+00]
 [5.680e+02 5.760e+02 5.630e+02 ... 1.000e+00 4.000e+00 1.000e+00]
 [3.150e+03 3.210e+03 3.140e+03 ... 1.000e+00 4.000e+00 1.000e+00]
 ...
 [1.690e+03 1.690e+03 1.645e+03 ... 1.200e+01 3.000e+00 0.000e+00]
 [2.388e+03 2.396e+03 2.380e+03 ... 1.200e+01 3.000e+00 1.000e+00]
 [6.900e+02 7.110e+02 6.860e+02 ... 1.200e+01 3.000e+00 1.000e+00]]


In [9]:
#
# nice to have NUM_ROWS and NUM_COLS around
#
NUM_ROWS, NUM_COLS = A.shape
print(f"\nThe dataset has {NUM_ROWS} rows and {NUM_COLS} cols")


The dataset has 2324923 rows and 9 cols


In [11]:
# let's use all of our variables, to reinforce that we have
# (1) names...
# (2) access and control...

# choose a row index, n:
n = 42
print(f"flower #{n} is {A[n]}")

for i in range(len(COLUMNS)):
    colname = COLUMNS[i]
    value = A[n][i]
    print(f"  Its {colname} is {value}")

species_index = COL_INDEX['twoclass']
species_num = int(round(A[n][species_index]))
species = SPECIES[species_num]
print(f"  Its class is {species} (i.e., {species_num})")

flower #42 is [3.430e+02 3.490e+02 3.430e+02 3.480e+02 7.410e+05 2.017e+03 1.000e+00
 4.000e+00 0.000e+00]
  Its Open is 343.0
  Its High is 349.0
  Its Low is 343.0
  Its Close is 348.0
  Its Volume is 741000.0
  Its Year is 2017.0
  Its Month is 1.0
  Its Day is 4.0
  Its twoclass is 0.0
  Its class is negative (i.e., 0)


In [12]:
print("+++ Start of data definitions +++\n")

#
# we could do this at the data-frame level, too!
#

X_all = A[:,0:8]  # X (features) ... is all rows except column 8
y_all = A[:,8]    # y (labels) ... is all rows, column 8 only

print(f"y_all (just the labels/species)   are \n {y_all}")
print(f"X_all (just the features, first few rows) are \n {X_all[0:5]}")

+++ Start of data definitions +++

y_all (just the labels/species)   are 
 [1. 1. 1. ... 0. 1. 1.]
X_all (just the features, first few rows) are 
 [[2.7340e+03 2.7550e+03 2.7300e+03 2.7420e+03 3.1400e+04 2.0170e+03
  1.0000e+00 4.0000e+00]
 [5.6800e+02 5.7600e+02 5.6300e+02 5.7100e+02 2.7985e+06 2.0170e+03
  1.0000e+00 4.0000e+00]
 [3.1500e+03 3.2100e+03 3.1400e+03 3.2100e+03 2.7080e+05 2.0170e+03
  1.0000e+00 4.0000e+00]
 [1.5100e+03 1.5500e+03 1.5100e+03 1.5500e+03 1.1300e+04 2.0170e+03
  1.0000e+00 4.0000e+00]
 [3.2700e+03 3.3500e+03 3.2700e+03 3.3300e+03 1.5080e+05 2.0170e+03
  1.0000e+00 4.0000e+00]]


In [13]:
#
# We next separate into test data and training data ... 
#    + We will train on the training data...
#    + We will _not_ look at the testing data to build the model
#
# Then, afterward, we will test on the testing data -- and see how well we do!
#

#
# a common convention:  train on 80%, test on 20%    Let's define the TEST_PERCENT
#

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows\n" )

print(f"Held-out data... (testing data: {len(y_test)})")
print(f"y_test: {y_test}\n")
print(f"X_test (few rows): {X_test[0:5,:]}")  # 5 rows
print()
print(f"Data used for modeling... (training data: {len(y_train)})")
print(f"y_train: {y_train}\n")
print(f"X_train (few rows): {X_train[0:5,:]}")  # 5 rows

training with 1859938 rows;  testing with 464985 rows

Held-out data... (testing data: 464985)
y_test: [0. 0. 1. ... 1. 1. 1.]

X_test (few rows): [[1.980e+03 1.980e+03 1.951e+03 1.956e+03 7.100e+04 2.019e+03 6.000e+00
  2.400e+01]
 [1.341e+03 1.378e+03 1.331e+03 1.363e+03 5.425e+05 2.018e+03 8.000e+00
  8.000e+00]
 [2.180e+03 2.195e+03 2.148e+03 2.184e+03 4.860e+04 2.021e+03 8.000e+00
  2.000e+01]
 [3.425e+03 3.445e+03 3.275e+03 3.295e+03 4.690e+05 2.021e+03 5.000e+00
  1.200e+01]
 [2.337e+03 2.438e+03 2.301e+03 2.438e+03 2.803e+05 2.020e+03 3.000e+00
  3.000e+01]]

Data used for modeling... (training data: 1859938)
y_train: [1. 1. 1. ... 0. 0. 1.]

X_train (few rows): [[3.6900e+02 3.8500e+02 3.6900e+02 3.8100e+02 8.9090e+05 2.0200e+03
  9.0000e+00 8.0000e+00]
 [2.0500e+03 2.0520e+03 1.9780e+03 1.9810e+03 1.2150e+05 2.0180e+03
  1.0000e+00 3.0000e+01]
 [8.5700e+02 8.5700e+02 8.4600e+02 8.5000e+02 1.4980e+05 2.0170e+03
  8.0000e+00 3.0000e+00]
 [4.6100e+02 4.6700e+02 4.6000e+02 4.6000e

In [21]:
# Also read the extra testing data

filename_test = 'stocks_cleaned_test.csv'
df_tidy_test = pd.read_csv(filename_test)      # encoding = "utf-8", "latin1"
print(f"{filename_test} : file read into a pandas dataframe.")

#
# All of the columns need to be numeric, we'll drop irisname
ROW = 0
COLUMN = 1
df_model1_test = df_tidy_test.drop(['Target', 'multiclasses'], axis=COLUMN )
df_model1_test.head()

#
# let's convert our dataframe to a numpy array, named B
#
B = df_model1_test.to_numpy()   
print(B)

#
# let's make sure it's all floating-point (here, it already is, but in other datasets it might not be)
#
B = B.astype('float64')  
print(B)

#
# nice to have NUM_ROWS and NUM_COLS around
#
NUM_ROWS, NUM_COLS = B.shape
print(f"\nThe dataset has {NUM_ROWS} rows and {NUM_COLS} cols")

print("+++ Start of data definitions +++\n")

#
# we could do this at the data-frame level, too!
#

X_all_test = B[:,0:8]  # X (features) ... is all rows except column 8
y_all_test = B[:,8]    # y (labels) ... is all rows, column 8 only

print(f"y_all (just the labels/species)   are \n {y_all_test}")
print(f"X_all (just the features, first few rows) are \n {X_all_test[0:5]}")



stocks_cleaned_test.csv : file read into a pandas dataframe.
[[2.982e+03 2.982e+03 2.965e+03 ... 1.200e+01 6.000e+00 0.000e+00]
 [5.920e+02 5.990e+02 5.880e+02 ... 1.200e+01 6.000e+00 0.000e+00]
 [2.368e+03 2.388e+03 2.360e+03 ... 1.200e+01 6.000e+00 0.000e+00]
 ...
 [1.600e+03 1.622e+03 1.600e+03 ... 2.000e+00 2.800e+01 1.000e+00]
 [2.568e+03 2.568e+03 2.540e+03 ... 2.000e+00 2.800e+01 0.000e+00]
 [7.310e+02 7.370e+02 7.260e+02 ... 2.000e+00 2.800e+01 0.000e+00]]
[[2.982e+03 2.982e+03 2.965e+03 ... 1.200e+01 6.000e+00 0.000e+00]
 [5.920e+02 5.990e+02 5.880e+02 ... 1.200e+01 6.000e+00 0.000e+00]
 [2.368e+03 2.388e+03 2.360e+03 ... 1.200e+01 6.000e+00 0.000e+00]
 ...
 [1.600e+03 1.622e+03 1.600e+03 ... 2.000e+00 2.800e+01 1.000e+00]
 [2.568e+03 2.568e+03 2.540e+03 ... 2.000e+00 2.800e+01 0.000e+00]
 [7.310e+02 7.370e+02 7.260e+02 ... 2.000e+00 2.800e+01 0.000e+00]]

The dataset has 111716 rows and 9 cols
+++ Start of data definitions +++

y_all (just the labels/species)   are 
 [0. 0. 0

### KNN

In [14]:
#
# we also use "cross validation"
#

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
#
# cross-validation splits the training set into two pieces:
#   + model-building and model-validation. We'll use "build" and "validate"
#
best_k = 84  # Not correct!
best_accuracy = 0.0  # also not correct...

# Note that we are cross-validating using only our TEST data!
for k in range(1,20):
    knn_cv_model = KNeighborsClassifier(n_neighbors=k)   # build knn_model for every k!
    cv_scores = cross_val_score( knn_cv_model, X_train, y_train, cv=5 )  # cv=5 means 80/20
    print(cv_scores)  # just to see the five scores... 
    average_cv_accuracy = cv_scores.mean()  # mean() is numpy's built-in average function 
    print(f"k: {k:2d}  cv accuracy: {average_cv_accuracy:7.4f}")

    
# assign best value of k to best_k
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_k = k      # at the moment this is incorrect   
# you'll need to use the loop above to find and remember the real best_k

print(f"best_k = {best_k}   yields the highest average cv accuracy.")  # print the best one

[0.50189791 0.50092476 0.50309419 0.50298532 0.50078632]
k:  1  cv accuracy:  0.5019
[0.51137402 0.50967773 0.51144929 0.51014955 0.51076516]
k:  2  cv accuracy:  0.5107
[0.50272589 0.50192748 0.50260223 0.50263853 0.50243154]
k:  3  cv accuracy:  0.5025
[0.50983096 0.50946267 0.50875566 0.50831077 0.5091764 ]
k:  4  cv accuracy:  0.5091
[0.50326086 0.50271783 0.50246782 0.50181324 0.50236971]
k:  5  cv accuracy:  0.5025
[0.50868039 0.50789542 0.50780133 0.50812528 0.5081683 ]
k:  6  cv accuracy:  0.5081
[0.50297859 0.50342484 0.50276891 0.50198797 0.50277026]
k:  7  cv accuracy:  0.5028
[0.5076696  0.50710507 0.50841694 0.50660641 0.50838336]
k:  8  cv accuracy:  0.5076
[0.50262912 0.5032286  0.50365603 0.50212239 0.50360362]
k:  9  cv accuracy:  0.5030
[0.50685237 0.50779595 0.5076696  0.50598274 0.50777312]
k: 10  cv accuracy:  0.5072
[0.50216942 0.50373937 0.50415605 0.50251218 0.50317081]
k: 11  cv accuracy:  0.5031


KeyboardInterrupt: 

In [15]:
#
# With the best k, we build and train a new model:
#
# Now, we use best_k instead of the original, randomly-guessed value    
#
best_k = 2
from sklearn.neighbors import KNeighborsClassifier
knn_model_tuned = KNeighborsClassifier(n_neighbors=best_k)   # here, we use the best_k!

# we train the model (one line!)
knn_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print(f"Created + trained a knn classifier, now tuned with a (best) k of {best_k}")  

# How does it do?!  The next cell will show...

Created + trained a knn classifier, now tuned with a (best) k of 2


In [17]:
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = knn_model_tuned.predict(X_test)
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.\n\n")

Predicted labels: [1. 0. 0. ... 1. 0. 0.]
Actual labels: [0. 0. 1. ... 1. 1. 1.]

Results on test set:  237946 correct out of 464985 total.




In [18]:
#
# Ok!  We have tuned knn to use the "best" value of k...
#
# And, we should now use ALL available data to train our final predictive model:
#

knn_model_final = KNeighborsClassifier(n_neighbors=best_k)   # here, we use the best_k
knn_model_final.fit(X_all, y_all)                              # here we use ALL the data!
print(f"Created + trained a 'final' knn classifier, with a (best) k of {best_k}") 

Created + trained a 'final' knn classifier, with a (best) k of 2


In [62]:
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = knn_model_final.predict(X_all_test)
actual_labels = y_all_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total, which is {num_correct/total:7.4f}.\n\n")

Predicted labels: [1. 0. 0. ... 0. 0. 0.]
Actual labels: [0. 0. 0. ... 1. 0. 0.]

Results on test set:  57950 correct out of 111716 total, which is  0.5187.




In [64]:
# Save model

import pickle
# save the classifier
with open('knn_model_final.pkl', 'wb') as fid:
    pickle.dump(knn_model_final, fid) 

In [66]:
# load it again

with open('knn_model_final.pkl', 'rb') as fid:
    testmodel = pickle.load(fid)

### Decision Tree

In [26]:
#
# To compare different tree-depths, we use cross validation
#
from sklearn import tree      # for decision trees

best_d = 1
best_accuracy = 0.0

for d in range(1,20):
    cv_model = tree.DecisionTreeClassifier(max_depth=d)   # for each depth, d
    cv_scores = cross_val_score( cv_model, X_train, y_train, cv=5 ) # 5 means 80/20 split
    # print(cv_scores)  # we usually don't want to see the five individual scores 
    average_cv_accuracy = cv_scores.mean()  # more likely, only their average
    print(f"depth: {d:2d}  cv accuracy: {average_cv_accuracy:7.4f}")
    
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_d = d

    
    
# assign best value of d to best_depth
best_depth = best_d   # may have to hand-tune this, depending on what happens...
print()
print(f"best_depth = {best_depth} is our choice for an underfitting/overfitting balance.")  

depth:  1  cv accuracy:  0.5189
depth:  2  cv accuracy:  0.5221
depth:  3  cv accuracy:  0.5247
depth:  4  cv accuracy:  0.5314
depth:  5  cv accuracy:  0.5389
depth:  6  cv accuracy:  0.5469
depth:  7  cv accuracy:  0.5569
depth:  8  cv accuracy:  0.5698
depth:  9  cv accuracy:  0.5878
depth: 10  cv accuracy:  0.6072
depth: 11  cv accuracy:  0.6231
depth: 12  cv accuracy:  0.6393
depth: 13  cv accuracy:  0.6512
depth: 14  cv accuracy:  0.6601
depth: 15  cv accuracy:  0.6663
depth: 16  cv accuracy:  0.6693
depth: 17  cv accuracy:  0.6696
depth: 18  cv accuracy:  0.6689
depth: 19  cv accuracy:  0.6669

best_depth = 17 is our choice for an underfitting/overfitting balance.


In [27]:
#
# Now, we re-create and re-run the  "Model-building and -training Cell"
#
# this time, with the best depth, best_d, found by cross-validation model tuning:
#

# we should have best_depth from our cv exploration
dtree_model_tuned = tree.DecisionTreeClassifier(max_depth=best_depth)

# we train the model (it's one line!)
dtree_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print("Created and trained a DT classifier with max depth =", best_depth) 

Created and trained a DT classifier with max depth = 17


In [28]:
#
# +++ This cell will "Model-testing Cell"
#
# Now, let's see how well our model does on our "held-out data" (the testing data)
#

# We run our test set:

# the function knn_model.predict is the instantiation of our model
# it's what runs the k-nearest-neighbors algorithm:
predicted_labels = dtree_model_tuned.predict(X_test)   
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual  labels  :", actual_labels)

# And, some overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total, which is {num_correct/total:7.4f}.")

Predicted labels: [0. 0. 1. ... 0. 0. 0.]
Actual  labels  : [0. 0. 1. ... 1. 1. 1.]

Results on test set:  312814 correct out of 464985 total, which is  0.6727.


In [38]:
#
# Ok!  We have tuned our DT to use the "best" depth...
#
# Now, we use ALL available data to train our final predictive model:
#


# we should have best_depth from our cv exploration
dtree_model_final = tree.DecisionTreeClassifier(max_depth=best_depth)

# we train the model (it's one line!)
dtree_model_final.fit(X_all, y_all)                              # yay!  trained!
print("Created and trained a 'final' DT classifier with max depth =", best_depth) 

Created and trained a 'final' DT classifier with max depth = 17


In [39]:
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = dtree_model_final.predict(X_all_test)
actual_labels = y_all_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total, which is {num_correct/total:7.4f}.\n\n")

Predicted labels: [1. 1. 1. ... 0. 0. 1.]
Actual labels: [0. 0. 0. ... 1. 0. 0.]

Results on test set:  52676 correct out of 111716 total, which is  0.4715.




In [40]:
print(dtree_model_final.feature_importances_)
print()

# let's see them with each feature name:
IMPs = dtree_model_final.feature_importances_

# enumerate is great when you want indices _and_ elements!
for i, importance in enumerate(IMPs):
    perc = importance*100
    print(f"Feature {COLUMNS[i]:>12s} has {perc:>7.2f}% of the decision-making importance.")

[0.03773508 0.03849803 0.03710759 0.03749145 0.11714993 0.20678224
 0.24853093 0.27670476]

Feature         Open has    3.77% of the decision-making importance.
Feature         High has    3.85% of the decision-making importance.
Feature          Low has    3.71% of the decision-making importance.
Feature        Close has    3.75% of the decision-making importance.
Feature       Volume has   11.71% of the decision-making importance.
Feature         Year has   20.68% of the decision-making importance.
Feature        Month has   24.85% of the decision-making importance.
Feature          Day has   27.67% of the decision-making importance.


In [67]:
# save the classifier

with open('dtree_model_final.pkl', 'wb') as fid:
    pickle.dump(dtree_model_final, fid) 

### Random Forest

In [37]:
#
# So, to compare different parameters, let's use cv
#
from sklearn import ensemble  # for random forests, an ensemble classifier

best_d = 1
best_ntrees = 10   
best_accuracy = 0

for d in range(1,10):
    for ntrees in range(10,60,20):
        rforest_model = ensemble.RandomForestClassifier(max_depth=d, 
                                                        n_estimators=ntrees,
                                                        verbose=1)
        cv_scores = cross_val_score( rforest_model, X_train, y_train, cv=5 ) # 5 means 80/20 split
        average_cv_accuracy = cv_scores.mean()  # more likely, only their average
        print(f"depth: {d:2d} ntrees: {ntrees:3d} cv accuracy: {average_cv_accuracy:7.4f}")

        if average_cv_accuracy > best_accuracy:
            best_d = d
            best_ntrees = ntrees
            best_accuracy = average_cv_accuracy


best_depth = best_d   
best_num_trees = best_ntrees


print()
print(f"best_depth: {best_depth} and best_num_trees: {best_num_trees} are our choices.")  


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


depth:  1 ntrees:  10 cv accuracy:  0.5189


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   21.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   22.3s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   22.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   20.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   29.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished


depth:  1 ntrees:  30 cv accuracy:  0.5189


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   36.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   36.2s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   34.4s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   36.4s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   36.7s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished


depth:  1 ntrees:  50 cv accuracy:  0.5189


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   13.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   13.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   13.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   13.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished


depth:  2 ntrees:  10 cv accuracy:  0.5189


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   39.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   40.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   39.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   43.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   41.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished


depth:  2 ntrees:  30 cv accuracy:  0.5189


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished


depth:  2 ntrees:  50 cv accuracy:  0.5189


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   19.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   19.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   20.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   20.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished


depth:  3 ntrees:  10 cv accuracy:  0.5189


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   55.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   58.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   55.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.6s finished


depth:  3 ntrees:  30 cv accuracy:  0.5189


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.0s finished


depth:  3 ntrees:  50 cv accuracy:  0.5190


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   30.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   26.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   24.6s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   26.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   25.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


depth:  4 ntrees:  10 cv accuracy:  0.5226


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.7s finished


depth:  4 ntrees:  30 cv accuracy:  0.5213


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished


depth:  4 ntrees:  50 cv accuracy:  0.5214


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   32.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   33.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   34.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   37.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   37.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished


depth:  5 ntrees:  10 cv accuracy:  0.5289


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.9min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.7min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.9min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.7s finished


depth:  5 ntrees:  30 cv accuracy:  0.5257


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.0min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.2min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.2s finished


depth:  5 ntrees:  50 cv accuracy:  0.5250


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   40.6s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   42.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   39.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   40.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   40.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


depth:  6 ntrees:  10 cv accuracy:  0.5346


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished


depth:  6 ntrees:  30 cv accuracy:  0.5334


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.7min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.0min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.6min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished


KeyboardInterrupt: 

In [42]:
#
# Now, we re-create and re-run the  "Model-building and -training Cell"
#

best_depth = 6
best_num_trees = 30
# we should have best_depth and best_num_trees
rforest_model_tuned = ensemble.RandomForestClassifier(max_depth=best_depth, 
                                                      n_estimators=best_num_trees)

# we train the model (it's one line!)
rforest_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print(f"Built an RF classifier with depth={best_depth} and ntrees={best_num_trees}") 

Built an RF classifier with depth=6 and ntrees=30


In [43]:
#
# +++ This is our "Model-testing Cell"
#
# Now, let's see how well we did on our "held-out data" (the testing data)
#

# We run our test set!
predicted_labels = rforest_model_tuned.predict(X_test)
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual  labels  :", actual_labels)

# And, some overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.")

Predicted labels: [0. 0. 0. ... 0. 0. 0.]
Actual  labels  : [0. 0. 1. ... 1. 1. 1.]

Results on test set:  248301 correct out of 464985 total.


In [44]:
#
# Ok!  We have tuned our RF to use the "best" parameters
#
# Now, we use ALL available data to train our final predictive model:
#

# we should have best_depth and best_num_trees
rforest_model_final = ensemble.RandomForestClassifier(max_depth=best_depth, 
                                                      n_estimators=best_num_trees)

# we train the model (it's one line!)
rforest_model_final.fit(X_all, y_all)              # yay!  trained!
print(f"Built an RF classifier with depth={best_depth} and ntrees={best_num_trees}") 

Built an RF classifier with depth=6 and ntrees=30


In [45]:
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = dtree_model_final.predict(X_all_test)
actual_labels = y_all_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total, which is {num_correct/total:7.4f}.\n\n")

Predicted labels: [1. 1. 1. ... 0. 0. 1.]
Actual labels: [0. 0. 0. ... 1. 0. 0.]

Results on test set:  52676 correct out of 111716 total, which is  0.4715.




In [46]:
#
# feature importances are often even more "important" than predictions...
#
#    Random forests can provide a much "smoother" measure of feature importance, since
#                   they integrate over so many individual models (each tree)
#
#    That is, it's much less likely that a feature will have 0% importance, 
#             unless it never varies
#

print(rforest_model_final.feature_importances_)
print()

# let's see them with each feature name:
IMPs = rforest_model_final.feature_importances_

# enumerate is great when you want indices _and_ elements!
for i, importance in enumerate(IMPs):
    perc = importance*100
    print(f"Feature {COLUMNS[i]:>12s} has {perc:>7.2f}% of the decision-making importance.")

[0.03880944 0.0315299  0.05595399 0.05863804 0.0522319  0.16033798
 0.234926   0.36757276]

Feature         Open has    3.88% of the decision-making importance.
Feature         High has    3.15% of the decision-making importance.
Feature          Low has    5.60% of the decision-making importance.
Feature        Close has    5.86% of the decision-making importance.
Feature       Volume has    5.22% of the decision-making importance.
Feature         Year has   16.03% of the decision-making importance.
Feature        Month has   23.49% of the decision-making importance.
Feature          Day has   36.76% of the decision-making importance.


In [68]:
# save the classifier

with open('rforest_model_final.pkl', 'wb') as fid:
    pickle.dump(rforest_model_final, fid) 

### Neural Network

In [47]:
#
# for NNets, it's important to keep the feature values near 0, say -1. to 1. or so
#    This is done through the "StandardScaler" in scikit-learn
# 
USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# we "train the scaler"  (computes the mean and standard deviation)
if USE_SCALER == True:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)  # Scale with the training data! ave becomes 0; stdev becomes 1
else:
    # this one does no scaling!  We still create it to be consistent:
    scaler = StandardScaler(copy=True, with_mean=False, with_std=False) # no scaling
    scaler.fit(X_train)  # still need to fit, though it does not change...

scaler   # is now defined and ready to use...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Here are our scaled training and testing sets:

X_train_scaled = scaler.transform(X_train) # scale!
X_test_scaled = scaler.transform(X_test) # scale!

y_train_scaled = y_train  # the predicted/desired labels are not scaled
y_test_scaled = y_test  # not using the scaler

def ascii_table(X,y):
    """ print a table of binary inputs and outputs """
    print(f"{'input ':>58s} -> {'pred':<5s} {'des.':<5s}") 
    for i in range(len(y)):
        print(f"{X[i,:]!s:>58s} -> {'?':<5s} {y[i]:<5.0f}")   # !s is str ...
    
ascii_table(X_train_scaled[0:5,:],y_train_scaled[0:5])

                                                    input  -> pred  des. 
[-0.62228159 -0.61949894 -0.62053342 -0.6189016   0.05002699  0.71498637
  0.74171276 -0.902306  ] -> ?     1    
[-0.15200495 -0.15853714 -0.16484816 -0.17120035 -0.14541927 -0.70714375
 -1.6213479   1.61890759] -> ?     1    
[-0.48575869 -0.48898066 -0.48544214 -0.48766917 -0.13823038 -1.41820882
  0.44633018 -1.47530909] -> ?     1    
[-0.59654367 -0.59682416 -0.59476129 -0.59679635  2.02765657 -1.41820882
  0.74171276 -0.2147023 ] -> ?     1    
[ 0.02983909  0.02064898  0.03113331  0.02438913 -0.1761054  -0.70714375
  1.03709534 -0.67310477] -> ?     0    


In [54]:
from sklearn.neural_network import MLPClassifier

#
# Here's where you can change the number of hidden layers
# and number of neurons!
#
nn_classifier = MLPClassifier(hidden_layer_sizes=(16,8,4),  # 8 input ->  -> 1 output
                    max_iter=100,      # how many times to train
                    activation="tanh", # the "activation function" input -> output
                    solver='sgd',      # the algorithm for optimizing weights
                    verbose=True,      # False to "mute" the training
                    shuffle=True,      # reshuffle the training epochs?
                    random_state=None, # set for reproduceability
                    learning_rate_init=.1,       # learning rate: % of error to backprop
                    learning_rate = 'adaptive')  # soften feedback as it converges

# documentation:
# scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html 
#     Try verbose / activation "relu" / other network sizes ...

print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_classifier.fit(X_train_scaled, y_train_scaled)
print("\n++++++++++  TRAINING:   end  +++++++++++++++")
print(f"The analog prediction error (the loss) is {nn_classifier.loss_}")



++++++++++  TRAINING:  begin  +++++++++++++++


Iteration 1, loss = 0.69047908
Iteration 2, loss = 0.68529552
Iteration 3, loss = 0.68079946
Iteration 4, loss = 0.67815136
Iteration 5, loss = 0.67574658
Iteration 6, loss = 0.67421764
Iteration 7, loss = 0.67273740
Iteration 8, loss = 0.67173017
Iteration 9, loss = 0.67071202
Iteration 10, loss = 0.67018057
Iteration 11, loss = 0.66997090
Iteration 12, loss = 0.66966905
Iteration 13, loss = 0.66993277
Iteration 14, loss = 0.66934411
Iteration 15, loss = 0.66948981
Iteration 16, loss = 0.66925607
Iteration 17, loss = 0.66916500
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.020000
Iteration 18, loss = 0.66171907
Iteration 19, loss = 0.66094575
Iteration 20, loss = 0.66099653
Iteration 21, loss = 0.66070045
Iteration 22, loss = 0.66077645
Iteration 23, loss = 0.66069533
Iteration 24, loss = 0.66051355
Iteration 25, loss = 0.66079183
Iteration 26, loss = 0.66036189
Iteration 27

In [70]:
#
# how did it do on the testing data?
#

#
# which one do we want: classifier or regressor?
#

def ascii_table_for_classifier(Xsc,y,nn,scaler):
    """ a table including predictions using nn.predict """
    predictions = nn.predict(Xsc)            # all predictions
    prediction_probs = nn.predict_proba(Xsc) # all prediction probabilities
    Xpr = scaler.inverse_transform(Xsc)      # Xpr is the "X to print": unscaled data!
    # count correct
    num_correct = 0
    # printing
    # print(f"{'input ':>28s} -> {'pred':^6s} {'des.':^6s}") 
    for i in range(len(y)):
        pred = predictions[i]
        pred_probs = prediction_probs[i,:]
        desired = y[i]
        if pred != desired: result = "  incorrect: " + str(pred_probs)
        else: result = "  correct"; num_correct += 1
        # Xpr = Xsc  # if you want to see the scaled versions
        #print(f"{Xpr[i,:]!s:>28s} -> {pred:^6.0f} {desired:^6.0f} {result:^10s}") 
    print(f"\ncorrect predictions: {num_correct} out of {len(y)}, which is {num_correct/len(y):7.4f}.")
    


#
# let's see how it did on the test data (also the training data!)
#
ascii_table_for_classifier(X_train_scaled,
                           y_train_scaled,
                           nn_classifier,
                           scaler)   
#
# other things...
#
if False:  # do we want to see all of the parameters?
    nn = nn_classifier  # less to type?
    print("\n\n+++++ parameters, weights, etc. +++++\n")
    print(f"\nweights/coefficients:\n")
    for wts in nn.coefs_:
        print(wts)
    print(f"\nintercepts: {nn.intercepts_}")
    print(f"\nall parameters: {nn.get_params()}")


correct predictions: 1087090 out of 1859938, which is  0.5845.


In [71]:
#
# let's see how it did on the training data 
#
ascii_table_for_classifier(X_test_scaled,
                           y_test_scaled,
                           nn_classifier,
                           scaler)   
#
# other things...
#
if False:  # do we want to see all of the parameters?
    nn = nn_classifier  # less to type?
    print("\n\n+++++ parameters, weights, etc. +++++\n")
    print(f"\nweights/coefficients:\n")
    for wts in nn.coefs_:
        print(wts)
    print(f"\nintercepts: {nn.intercepts_}")
    print(f"\nall parameters: {nn.get_params()}")


correct predictions: 272105 out of 464985, which is  0.5852.


In [56]:
#
# for NNets, it's important to keep the feature values near 0, say -1. to 1. or so
#    This is done through the "StandardScaler" in scikit-learn
# 
USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# we "train the scaler"  (computes the mean and standard deviation)
if USE_SCALER == True:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)  # Scale with the training data! ave becomes 0; stdev becomes 1
else:
    # this one does no scaling!  We still create it to be consistent:
    scaler = StandardScaler(copy=True, with_mean=False, with_std=False) # no scaling
    scaler.fit(X_train)  # still need to fit, though it does not change...

scaler   # is now defined and ready to use...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Here are our scaled training and testing sets:

X_all_test_scaled = scaler.transform(X_all_test) # scale!

y_all_test_scaled = y_all_test  # the predicted/desired labels are not scaled


def ascii_table(X,y):
    """ print a table of binary inputs and outputs """
    print(f"{'input ':>58s} -> {'pred':<5s} {'des.':<5s}") 
    for i in range(len(y)):
        print(f"{X[i,:]!s:>58s} -> {'?':<5s} {y[i]:<5.0f}")   # !s is str ...
    
ascii_table(X_all_test_scaled[0:5,:], y_all_test_scaled[0:5])

                                                    input  -> pred  des. 
[ 0.10873142  0.09862812  0.11468033  0.10581479 -0.1740224   1.42605144
  1.62786051 -1.13150724] -> ?     0    
[-0.5598951  -0.56032328 -0.55851038 -0.56070044  0.16939298  1.42605144
  1.62786051 -1.13150724] -> ?     0    
[-0.06304125 -0.06562582 -0.05666186 -0.0603943  -0.14430156  1.42605144
  1.62786051 -1.13150724] -> ?     0    
[-0.38140819 -0.38334934 -0.37838867 -0.38301901 -0.15568185  1.42605144
  1.62786051 -1.13150724] -> ?     0    
[-0.35091434 -0.34657195 -0.3458195  -0.34748272 -0.17470827  1.42605144
  1.62786051 -1.13150724] -> ?     1    


In [57]:
#
# let's see how it did on the extra test data (also the training data!)
#
ascii_table_for_classifier(X_all_test_scaled,
                           y_all_test_scaled,
                           nn_classifier,
                           scaler) 


correct predictions: 53380 out of 111716, which is  0.4778.


In [58]:
#
# for NNets, it's important to keep the feature values near 0, say -1. to 1. or so
#    This is done through the "StandardScaler" in scikit-learn
# 
USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# we "train the scaler"  (computes the mean and standard deviation)
if USE_SCALER == True:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_all)  # Scale with the training data! ave becomes 0; stdev becomes 1
else:
    # this one does no scaling!  We still create it to be consistent:
    scaler = StandardScaler(copy=True, with_mean=False, with_std=False) # no scaling
    scaler.fit(X_all)  # still need to fit, though it does not change...

scaler   # is now defined and ready to use...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Here are our scaled training and testing sets:

X_all_scaled = scaler.transform(X_all) # scale!
y_all_scaled = y_all  # the predicted/desired labels are not scaled


def ascii_table(X,y):
    """ print a table of binary inputs and outputs """
    print(f"{'input ':>58s} -> {'pred':<5s} {'des.':<5s}") 
    for i in range(len(y)):
        print(f"{X[i,:]!s:>58s} -> {'?':<5s} {y[i]:<5.0f}")   # !s is str ...
    
ascii_table(X_all_scaled[0:5,:], y_all_scaled[0:5])

                                                    input  -> pred  des. 
[ 0.03899406  0.03549253  0.04776377  0.04137425 -0.16919196 -1.41835118
 -1.62185841 -1.36060155] -> ?     1    
[-0.56650895 -0.56654721 -0.56551035 -0.5656374   0.53716061 -1.41835118
 -1.62185841 -1.36060155] -> ?     1    
[ 0.15528642  0.16120528  0.16379625  0.17222706 -0.10808076 -1.41835118
 -1.62185841 -1.36060155] -> ?     1    
[-0.30317384 -0.29743904 -0.29750362 -0.29190898 -0.17432285 -1.41835118
 -1.62185841 -1.36060155] -> ?     1    
[ 0.1888323   0.19988612  0.20058704  0.20577906 -0.13871294 -1.41835118
 -1.62185841 -1.36060155] -> ?     1    


In [59]:
#
# Here's where you can change the number of hidden layers
# and number of neurons!
#
nn_classifier_final = MLPClassifier(hidden_layer_sizes=(16,8,4),  # 8 input ->  -> 1 output
                    max_iter=100,      # how many times to train
                    activation="tanh", # the "activation function" input -> output
                    solver='sgd',      # the algorithm for optimizing weights
                    verbose=True,      # False to "mute" the training
                    shuffle=True,      # reshuffle the training epochs?
                    random_state=None, # set for reproduceability
                    learning_rate_init=.1,       # learning rate: % of error to backprop
                    learning_rate = 'adaptive')  # soften feedback as it converges

# documentation:
# scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html 
#     Try verbose / activation "relu" / other network sizes ...

print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_classifier_final.fit(X_all_scaled, y_all_scaled)
print("\n++++++++++  TRAINING:   end  +++++++++++++++")
print(f"The analog prediction error (the loss) is {nn_classifier.loss_}")



++++++++++  TRAINING:  begin  +++++++++++++++


Iteration 1, loss = 0.69023708
Iteration 2, loss = 0.68487740
Iteration 3, loss = 0.67920752
Iteration 4, loss = 0.67662052
Iteration 5, loss = 0.67513929
Iteration 6, loss = 0.67394997
Iteration 7, loss = 0.67301284
Iteration 8, loss = 0.67148129
Iteration 9, loss = 0.67113311
Iteration 10, loss = 0.67100568
Iteration 11, loss = 0.67057704
Iteration 12, loss = 0.67042704
Iteration 13, loss = 0.67008495
Iteration 14, loss = 0.67077283
Iteration 15, loss = 0.67068042
Iteration 16, loss = 0.67102886
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.020000
Iteration 17, loss = 0.66263190
Iteration 18, loss = 0.66103834
Iteration 19, loss = 0.66093542
Iteration 20, loss = 0.66165592
Iteration 21, loss = 0.66094598
Iteration 22, loss = 0.66092475
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.004000
Iteration 23, loss = 0.65

In [60]:
#
# let's see how it did on the extra test data 
#
ascii_table_for_classifier(X_all_test_scaled,
                           y_all_test_scaled,
                           nn_classifier_final,
                           scaler) 


correct predictions: 56830 out of 111716, which is  0.5087.


In [69]:
# save the classifier

with open('nn_classifier_final.pkl', 'wb') as fid:
    pickle.dump(nn_classifier_final, fid) 

### 10 Classes Experiment

In [5]:
#
# All of the columns need to be numeric, we'll drop Target and twoclass
ROW = 0
COLUMN = 1
df_model1 = df_tidy.drop(['Target', 'twoclass'], axis=COLUMN )
df_model1.head()

Unnamed: 0,Open,High,Low,Close,Volume,Year,Month,Day,multiclasses
0,2734.0,2755.0,2730.0,2742.0,31400,2017,1,4,5
1,568.0,576.0,563.0,571.0,2798500,2017,1,4,7
2,3150.0,3210.0,3140.0,3210.0,270800,2017,1,4,6
3,1510.0,1550.0,1510.0,1550.0,11300,2017,1,4,7
4,3270.0,3350.0,3270.0,3330.0,150800,2017,1,4,5


In [6]:
#
# let's convert our dataframe to a numpy array, named A
#
A = df_model1.to_numpy()   
print(A)

[[2.734e+03 2.755e+03 2.730e+03 ... 1.000e+00 4.000e+00 5.000e+00]
 [5.680e+02 5.760e+02 5.630e+02 ... 1.000e+00 4.000e+00 7.000e+00]
 [3.150e+03 3.210e+03 3.140e+03 ... 1.000e+00 4.000e+00 6.000e+00]
 ...
 [1.690e+03 1.690e+03 1.645e+03 ... 1.200e+01 3.000e+00 4.000e+00]
 [2.388e+03 2.396e+03 2.380e+03 ... 1.200e+01 3.000e+00 6.000e+00]
 [6.900e+02 7.110e+02 6.860e+02 ... 1.200e+01 3.000e+00 8.000e+00]]


In [7]:
#
# let's make sure it's all floating-point (here, it already is, but in other datasets it might not be)
#
A = A.astype('float64')  
print(A)

[[2.734e+03 2.755e+03 2.730e+03 ... 1.000e+00 4.000e+00 5.000e+00]
 [5.680e+02 5.760e+02 5.630e+02 ... 1.000e+00 4.000e+00 7.000e+00]
 [3.150e+03 3.210e+03 3.140e+03 ... 1.000e+00 4.000e+00 6.000e+00]
 ...
 [1.690e+03 1.690e+03 1.645e+03 ... 1.200e+01 3.000e+00 4.000e+00]
 [2.388e+03 2.396e+03 2.380e+03 ... 1.200e+01 3.000e+00 6.000e+00]
 [6.900e+02 7.110e+02 6.860e+02 ... 1.200e+01 3.000e+00 8.000e+00]]


In [8]:
print("+++ Start of data definitions +++\n")

#
# we could do this at the data-frame level, too!
#

X_all = A[:,0:8]  # X (features) ... is all rows except column 8
y_all = A[:,8]    # y (labels) ... is all rows, column 8 only

print(f"y_all (just the labels/species)   are \n {y_all}")
print(f"X_all (just the features, first few rows) are \n {X_all[0:5]}")

+++ Start of data definitions +++

y_all (just the labels/species)   are 
 [5. 7. 6. ... 4. 6. 8.]
X_all (just the features, first few rows) are 
 [[2.7340e+03 2.7550e+03 2.7300e+03 2.7420e+03 3.1400e+04 2.0170e+03
  1.0000e+00 4.0000e+00]
 [5.6800e+02 5.7600e+02 5.6300e+02 5.7100e+02 2.7985e+06 2.0170e+03
  1.0000e+00 4.0000e+00]
 [3.1500e+03 3.2100e+03 3.1400e+03 3.2100e+03 2.7080e+05 2.0170e+03
  1.0000e+00 4.0000e+00]
 [1.5100e+03 1.5500e+03 1.5100e+03 1.5500e+03 1.1300e+04 2.0170e+03
  1.0000e+00 4.0000e+00]
 [3.2700e+03 3.3500e+03 3.2700e+03 3.3300e+03 1.5080e+05 2.0170e+03
  1.0000e+00 4.0000e+00]]


In [9]:
# We next separate into test data and training data ... 
#    + We will train on the training data...
#    + We will _not_ look at the testing data to build the model
#
# Then, afterward, we will test on the testing data -- and see how well we do!
#

#
# a common convention:  train on 80%, test on 20%    Let's define the TEST_PERCENT
#

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows\n" )

print(f"Held-out data... (testing data: {len(y_test)})")
print(f"y_test: {y_test}\n")
print(f"X_test (few rows): {X_test[0:5,:]}")  # 5 rows
print()
print(f"Data used for modeling... (training data: {len(y_train)})")
print(f"y_train: {y_train}\n")
print(f"X_train (few rows): {X_train[0:5,:]}")  # 5 rows

training with 1859938 rows;  testing with 464985 rows

Held-out data... (testing data: 464985)
y_test: [2. 3. 9. ... 7. 6. 7.]

X_test (few rows): [[1.980e+03 1.980e+03 1.951e+03 1.956e+03 7.100e+04 2.019e+03 6.000e+00
  2.400e+01]
 [1.341e+03 1.378e+03 1.331e+03 1.363e+03 5.425e+05 2.018e+03 8.000e+00
  8.000e+00]
 [2.180e+03 2.195e+03 2.148e+03 2.184e+03 4.860e+04 2.021e+03 8.000e+00
  2.000e+01]
 [3.425e+03 3.445e+03 3.275e+03 3.295e+03 4.690e+05 2.021e+03 5.000e+00
  1.200e+01]
 [2.337e+03 2.438e+03 2.301e+03 2.438e+03 2.803e+05 2.020e+03 3.000e+00
  3.000e+01]]

Data used for modeling... (training data: 1859938)
y_train: [9. 9. 8. ... 1. 1. 5.]

X_train (few rows): [[3.6900e+02 3.8500e+02 3.6900e+02 3.8100e+02 8.9090e+05 2.0200e+03
  9.0000e+00 8.0000e+00]
 [2.0500e+03 2.0520e+03 1.9780e+03 1.9810e+03 1.2150e+05 2.0180e+03
  1.0000e+00 3.0000e+01]
 [8.5700e+02 8.5700e+02 8.4600e+02 8.5000e+02 1.4980e+05 2.0170e+03
  8.0000e+00 3.0000e+00]
 [4.6100e+02 4.6700e+02 4.6000e+02 4.6000e

In [10]:
# Also read the extra testing data

filename_test = 'stocks_cleaned_test.csv'
df_tidy_test = pd.read_csv(filename_test)      # encoding = "utf-8", "latin1"
print(f"{filename_test} : file read into a pandas dataframe.")

#
# All of the columns need to be numeric, we'll drop Target and twoclass
ROW = 0
COLUMN = 1
df_model1_test = df_tidy_test.drop(['Target', 'twoclass'], axis=COLUMN )
df_model1_test.head()

#
# let's convert our dataframe to a numpy array, named B
#
B = df_model1_test.to_numpy()   
print(B)

#
# let's make sure it's all floating-point (here, it already is, but in other datasets it might not be)
#
B = B.astype('float64')  
print(B)

#
# nice to have NUM_ROWS and NUM_COLS around
#
NUM_ROWS, NUM_COLS = B.shape
print(f"\nThe dataset has {NUM_ROWS} rows and {NUM_COLS} cols")

print("+++ Start of data definitions +++\n")

#
# we could do this at the data-frame level, too!
#

X_all_test = B[:,0:8]  # X (features) ... is all rows except column 8
y_all_test = B[:,8]    # y (labels) ... is all rows, column 8 only

print(f"y_all (just the labels/species)   are \n {y_all_test}")
print(f"X_all (just the features, first few rows) are \n {X_all_test[0:5]}")



stocks_cleaned_test.csv : file read into a pandas dataframe.
[[2.982e+03 2.982e+03 2.965e+03 ... 1.200e+01 6.000e+00 4.000e+00]
 [5.920e+02 5.990e+02 5.880e+02 ... 1.200e+01 6.000e+00 3.000e+00]
 [2.368e+03 2.388e+03 2.360e+03 ... 1.200e+01 6.000e+00 3.000e+00]
 ...
 [1.600e+03 1.622e+03 1.600e+03 ... 2.000e+00 2.800e+01 6.000e+00]
 [2.568e+03 2.568e+03 2.540e+03 ... 2.000e+00 2.800e+01 4.000e+00]
 [7.310e+02 7.370e+02 7.260e+02 ... 2.000e+00 2.800e+01 0.000e+00]]
[[2.982e+03 2.982e+03 2.965e+03 ... 1.200e+01 6.000e+00 4.000e+00]
 [5.920e+02 5.990e+02 5.880e+02 ... 1.200e+01 6.000e+00 3.000e+00]
 [2.368e+03 2.388e+03 2.360e+03 ... 1.200e+01 6.000e+00 3.000e+00]
 ...
 [1.600e+03 1.622e+03 1.600e+03 ... 2.000e+00 2.800e+01 6.000e+00]
 [2.568e+03 2.568e+03 2.540e+03 ... 2.000e+00 2.800e+01 4.000e+00]
 [7.310e+02 7.370e+02 7.260e+02 ... 2.000e+00 2.800e+01 0.000e+00]]

The dataset has 111716 rows and 9 cols
+++ Start of data definitions +++

y_all (just the labels/species)   are 
 [4. 3. 3

### KNN

In [10]:
#
# to do this, we use "cross validation"
#

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
#
# cross-validation splits the training set into two pieces:
#   + model-building and model-validation. We'll use "build" and "validate"
#
best_k = 84  # Not correct!
best_accuracy = 0.0  # also not correct...

# Note that we are cross-validating using only our TEST data!
for k in range(1,20):
    knn_cv_model = KNeighborsClassifier(n_neighbors=k)   # build knn_model for every k!
    cv_scores = cross_val_score( knn_cv_model, X_train, y_train, cv=5 )  # cv=5 means 80/20
    print(cv_scores)  # just to see the five scores... 
    average_cv_accuracy = cv_scores.mean()  # mean() is numpy's built-in average function 
    print(f"k: {k:2d}  cv accuracy: {average_cv_accuracy:7.4f}")

    
# assign best value of k to best_k
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_k = k      # at the moment this is incorrect   
# you'll need to use the loop above to find and remember the real best_k

print(f"best_k = {best_k}   yields the highest average cv accuracy.")  # print the best one

[0.11140293 0.11211114 0.11199316 0.11208456 0.11164996]
k:  1  cv accuracy:  0.1118
[0.11117712 0.1125789  0.1119152  0.11176197 0.11181664]
k:  2  cv accuracy:  0.1118
[0.11030882 0.11051701 0.11036407 0.10934253 0.11000742]
k:  3  cv accuracy:  0.1101
[0.11059915 0.11067024 0.11113292 0.11064903 0.11031926]
k:  4  cv accuracy:  0.1107
[0.11325512 0.11334774 0.11341525 0.1124663  0.11246183]
k:  5  cv accuracy:  0.1130
[0.11545141 0.11578062 0.11604169 0.11539382 0.11493236]
k:  6  cv accuracy:  0.1155
[0.11664498 0.11722152 0.11733206 0.11692075 0.11598886]
k:  7  cv accuracy:  0.1168
[0.11757242 0.11852533 0.11756594 0.11749335 0.11687868]
k:  8  cv accuracy:  0.1176
[0.11838695 0.11851995 0.11774336 0.11745034 0.11791367]
k:  9  cv accuracy:  0.1180
[0.11873105 0.11928342 0.11838048 0.11788046 0.11802389]
k: 10  cv accuracy:  0.1185
[0.11900525 0.11904147 0.11905255 0.11866275 0.11905351]
k: 11  cv accuracy:  0.1190
[0.11996226 0.11997968 0.11997731 0.11984021 0.1195132 ]
k: 12  c

In [12]:
#
# Ok!  We have tuned knn to use the "best" value of k...
#
# And, we should now use ALL available data to train our final predictive model:
#
from sklearn.neighbors import KNeighborsClassifier

knn_model_final = KNeighborsClassifier(n_neighbors=19)   # here, we use the best_k
knn_model_final.fit(X_all, y_all)                              # here we use ALL the data!
print(f"Created + trained a 'final' knn classifier, with a (best) k of {19}") 

Created + trained a 'final' knn classifier, with a (best) k of 19


In [13]:
# Test with extra data
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = knn_model_final.predict(X_all_test)
actual_labels = y_all_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total, which is {num_correct/total:7.4f}.\n\n")

Predicted labels: [4. 1. 9. ... 4. 6. 4.]
Actual labels: [4. 3. 3. ... 6. 4. 0.]

Results on test set:  13492 correct out of 111716 total, which is  0.1208.




### Decision Tree

In [11]:
#
# To compare different tree-depths, we use cross validation
#
from sklearn import tree      # for decision trees

best_d = 1
best_accuracy = 0.0

for d in range(1,20):
    cv_model = tree.DecisionTreeClassifier(max_depth=d)   # for each depth, d
    cv_scores = cross_val_score( cv_model, X_train, y_train, cv=5 ) # 5 means 80/20 split
    # print(cv_scores)  # we usually don't want to see the five individual scores 
    average_cv_accuracy = cv_scores.mean()  # more likely, only their average
    print(f"depth: {d:2d}  cv accuracy: {average_cv_accuracy:7.4f}")
    
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_d = d

    
    
# assign best value of d to best_depth
best_depth = best_d   # may have to hand-tune this, depending on what happens...
print()
print(f"best_depth = {best_depth} is our choice for an underfitting/overfitting balance.") 

depth:  1  cv accuracy:  0.1546
depth:  2  cv accuracy:  0.1546
depth:  3  cv accuracy:  0.1546
depth:  4  cv accuracy:  0.1585
depth:  5  cv accuracy:  0.1668
depth:  6  cv accuracy:  0.1705
depth:  7  cv accuracy:  0.1751
depth:  8  cv accuracy:  0.1794
depth:  9  cv accuracy:  0.1826
depth: 10  cv accuracy:  0.1872
depth: 11  cv accuracy:  0.1946
depth: 12  cv accuracy:  0.1993
depth: 13  cv accuracy:  0.2050
depth: 14  cv accuracy:  0.2093
depth: 15  cv accuracy:  0.2121
depth: 16  cv accuracy:  0.2123
depth: 17  cv accuracy:  0.2119
depth: 18  cv accuracy:  0.2103
depth: 19  cv accuracy:  0.2077

best_depth = 16 is our choice for an underfitting/overfitting balance.


In [14]:
#
# Ok!  We have tuned our DT to use the "best" depth...
#
# Now, we use ALL available data to train our final predictive model:
#

from sklearn import tree      # for decision trees

# we should have best_depth from our cv exploration
best_depth = 16
dtree_model_final = tree.DecisionTreeClassifier(max_depth=best_depth)

# we train the model (it's one line!)
dtree_model_final.fit(X_all, y_all)                              # yay!  trained!
print("Created and trained a 'final' DT classifier with max depth =", best_depth) 

Created and trained a 'final' DT classifier with max depth = 16


In [15]:
# Test with extra data
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = dtree_model_final.predict(X_all_test)
actual_labels = y_all_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total, which is {num_correct/total:7.4f}.\n\n")

Predicted labels: [7. 9. 9. ... 4. 4. 4.]
Actual labels: [4. 3. 3. ... 6. 4. 0.]

Results on test set:  12041 correct out of 111716 total, which is  0.1078.




In [18]:
print(dtree_model_final.feature_importances_)
print()

# let's see them with each feature name:
IMPs = dtree_model_final.feature_importances_

# enumerate is great when you want indices _and_ elements!
for i, importance in enumerate(IMPs):
    perc = importance*100
    print(f"Feature {df_model1.columns[i]:>12s} has {perc:>7.2f}% of the decision-making importance.")

[0.04586071 0.04827758 0.04796631 0.04621788 0.138748   0.04461912
 0.22200167 0.40630873]

Feature         Open has    4.59% of the decision-making importance.
Feature         High has    4.83% of the decision-making importance.
Feature          Low has    4.80% of the decision-making importance.
Feature        Close has    4.62% of the decision-making importance.
Feature       Volume has   13.87% of the decision-making importance.
Feature         Year has    4.46% of the decision-making importance.
Feature        Month has   22.20% of the decision-making importance.
Feature          Day has   40.63% of the decision-making importance.


### Random Forest

In [12]:
#
# So, to compare different parameters, let's use cv
#
from sklearn import ensemble  # for random forests, an ensemble classifier

best_d = 1
best_ntrees = 10   
best_accuracy = 0

for d in range(1,5):
    for ntrees in range(10,60,20):
        rforest_model = ensemble.RandomForestClassifier(max_depth=d, 
                                                        n_estimators=ntrees,
                                                        verbose=1)
        cv_scores = cross_val_score( rforest_model, X_train, y_train, cv=5 ) # 5 means 80/20 split
        average_cv_accuracy = cv_scores.mean()  # more likely, only their average
        print(f"depth: {d:2d} ntrees: {ntrees:3d} cv accuracy: {average_cv_accuracy:7.4f}")

        if average_cv_accuracy > best_accuracy:
            best_d = d
            best_ntrees = ntrees
            best_accuracy = average_cv_accuracy


best_depth = best_d   
best_num_trees = best_ntrees


print()
print(f"best_depth: {best_depth} and best_num_trees: {best_num_trees} are our choices.")  



  from numpy.core.umath_tests import inner1d
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   11.6s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   11.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   10.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


depth:  1 ntrees:  10 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   28.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   29.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   29.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   30.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   28.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.6s finished


depth:  1 ntrees:  30 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   48.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   47.4s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   49.2s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   48.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   47.4s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.9s finished


depth:  1 ntrees:  50 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   16.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   16.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   16.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


depth:  2 ntrees:  10 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   49.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   54.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   45.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   48.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   48.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished


depth:  2 ntrees:  30 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.5s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.1s finished


depth:  2 ntrees:  50 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   25.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   25.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   23.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   23.6s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   25.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.6s finished


depth:  3 ntrees:  10 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.3s finished


depth:  3 ntrees:  30 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.7s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.7s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.2s finished


depth:  3 ntrees:  50 cv accuracy:  0.1546


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   30.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   30.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   30.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   31.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   29.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


depth:  4 ntrees:  10 cv accuracy:  0.1553


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.4s finished


depth:  4 ntrees:  30 cv accuracy:  0.1553


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.5s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.6min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.4min finished


depth:  4 ntrees:  50 cv accuracy:  0.1566

best_depth: 4 and best_num_trees: 50 are our choices.


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.8s finished


In [20]:
#
# Ok!  We have tuned our RF to use the "best" parameters
#
# Now, we use ALL available data to train our final predictive model:
#
from sklearn import ensemble  # for random forests, an ensemble classifier


best_depth = 4
best_num_trees = 50

# we should have best_depth and best_num_trees
rforest_model_final = ensemble.RandomForestClassifier(max_depth=best_depth, 
                                                      n_estimators=best_num_trees)

# we train the model (it's one line!)
rforest_model_final.fit(X_all, y_all)              # yay!  trained!
print(f"Built an RF classifier with depth={best_depth} and ntrees={best_num_trees}") 

  from numpy.core.umath_tests import inner1d


Built an RF classifier with depth=4 and ntrees=50


In [32]:
# Test with extra data
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = rforest_model_final.predict(X_all_test)
actual_labels = y_all_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total, which is {num_correct/total:7.4f}.\n\n")

Predicted labels: [4. 4. 4. ... 4. 4. 4.]
Actual labels: [4. 3. 3. ... 6. 4. 0.]

Results on test set:  15934 correct out of 111716 total, which is  0.1426.




In [22]:
#
# feature importances are often even more "important" than predictions...
#
#    Random forests can provide a much "smoother" measure of feature importance, since
#                   they integrate over so many individual models (each tree)
#
#    That is, it's much less likely that a feature will have 0% importance, 
#             unless it never varies
#

print(rforest_model_final.feature_importances_)
print()

# let's see them with each feature name:
IMPs = rforest_model_final.feature_importances_

# enumerate is great when you want indices _and_ elements!
for i, importance in enumerate(IMPs):
    perc = importance*100
    print(f"Feature {df_model1.columns[i]:>12s} has {perc:>7.2f}% of the decision-making importance.")

[0.03443071 0.04541777 0.05099852 0.05017958 0.40259039 0.3206969
 0.04601593 0.04967019]

Feature         Open has    3.44% of the decision-making importance.
Feature         High has    4.54% of the decision-making importance.
Feature          Low has    5.10% of the decision-making importance.
Feature        Close has    5.02% of the decision-making importance.
Feature       Volume has   40.26% of the decision-making importance.
Feature         Year has   32.07% of the decision-making importance.
Feature        Month has    4.60% of the decision-making importance.
Feature          Day has    4.97% of the decision-making importance.


### Neural Networks

In [13]:
#
# for NNets, it's important to keep the feature values near 0, say -1. to 1. or so
#    This is done through the "StandardScaler" in scikit-learn
# 
USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# we "train the scaler"  (computes the mean and standard deviation)
if USE_SCALER == True:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)  # Scale with the training data! ave becomes 0; stdev becomes 1
else:
    # this one does no scaling!  We still create it to be consistent:
    scaler = StandardScaler(copy=True, with_mean=False, with_std=False) # no scaling
    scaler.fit(X_train)  # still need to fit, though it does not change...

scaler   # is now defined and ready to use...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Here are our scaled training and testing sets:

X_train_scaled = scaler.transform(X_train) # scale!
X_test_scaled = scaler.transform(X_test) # scale!

y_train_scaled = y_train  # the predicted/desired labels are not scaled
y_test_scaled = y_test  # not using the scaler

def ascii_table(X,y):
    """ print a table of binary inputs and outputs """
    print(f"{'input ':>58s} -> {'pred':<5s} {'des.':<5s}") 
    for i in range(len(y)):
        print(f"{X[i,:]!s:>58s} -> {'?':<5s} {y[i]:<5.0f}")   # !s is str ...
    
ascii_table(X_train_scaled[0:5,:],y_train_scaled[0:5])

                                                    input  -> pred  des. 
[-0.62228159 -0.61949894 -0.62053342 -0.6189016   0.05002699  0.71498637
  0.74171276 -0.902306  ] -> ?     9    
[-0.15200495 -0.15853714 -0.16484816 -0.17120035 -0.14541927 -0.70714375
 -1.6213479   1.61890759] -> ?     9    
[-0.48575869 -0.48898066 -0.48544214 -0.48766917 -0.13823038 -1.41820882
  0.44633018 -1.47530909] -> ?     8    
[-0.59654367 -0.59682416 -0.59476129 -0.59679635  2.02765657 -1.41820882
  0.74171276 -0.2147023 ] -> ?     8    
[ 0.02983909  0.02064898  0.03113331  0.02438913 -0.1761054  -0.70714375
  1.03709534 -0.67310477] -> ?     4    


In [14]:
from sklearn.neural_network import MLPClassifier

#
# Here's where you can change the number of hidden layers
# and number of neurons!
#
nn_classifier = MLPClassifier(hidden_layer_sizes=(16,8,4),  # 8 input ->  -> 1 output
                    max_iter=100,      # how many times to train
                    activation="tanh", # the "activation function" input -> output
                    solver='sgd',      # the algorithm for optimizing weights
                    verbose=True,      # False to "mute" the training
                    shuffle=True,      # reshuffle the training epochs?
                    random_state=None, # set for reproduceability
                    learning_rate_init=.1,       # learning rate: % of error to backprop
                    learning_rate = 'adaptive')  # soften feedback as it converges

# documentation:
# scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html 
#     Try verbose / activation "relu" / other network sizes ...

print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_classifier.fit(X_train_scaled, y_train_scaled)
print("\n++++++++++  TRAINING:   end  +++++++++++++++")
print(f"The analog prediction error (the loss) is {nn_classifier.loss_}")



++++++++++  TRAINING:  begin  +++++++++++++++


Iteration 1, loss = 2.25731746
Iteration 2, loss = 2.24704572
Iteration 3, loss = 2.24092065
Iteration 4, loss = 2.23563474
Iteration 5, loss = 2.23113552
Iteration 6, loss = 2.22588053
Iteration 7, loss = 2.22358513
Iteration 8, loss = 2.22269855
Iteration 9, loss = 2.22015837
Iteration 10, loss = 2.21979266
Iteration 11, loss = 2.21947905
Iteration 12, loss = 2.21847015
Iteration 13, loss = 2.21813813
Iteration 14, loss = 2.21960334
Iteration 15, loss = 2.21763524
Iteration 16, loss = 2.21622082
Iteration 17, loss = 2.21607233
Iteration 18, loss = 2.21562168
Iteration 19, loss = 2.21487446
Iteration 20, loss = 2.21742680
Iteration 21, loss = 2.21565625
Iteration 22, loss = 2.21629087
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.020000
Iteration 23, loss = 2.19851806
Iteration 24, loss = 2.19588128
Iteration 25, loss = 2.19518733
Iteration 26, loss = 2.19493336
Iteration 27

In [15]:
#
# how did it do on the testing data?
#

#
# which one do we want: classifier or regressor?
#

def ascii_table_for_classifier(Xsc,y,nn,scaler):
    """ a table including predictions using nn.predict """
    predictions = nn.predict(Xsc)            # all predictions
    prediction_probs = nn.predict_proba(Xsc) # all prediction probabilities
    Xpr = scaler.inverse_transform(Xsc)      # Xpr is the "X to print": unscaled data!
    # count correct
    num_correct = 0
    # printing
    # print(f"{'input ':>28s} -> {'pred':^6s} {'des.':^6s}") 
    for i in range(len(y)):
        pred = predictions[i]
        pred_probs = prediction_probs[i,:]
        desired = y[i]
        if pred != desired: result = "  incorrect: " + str(pred_probs)
        else: result = "  correct"; num_correct += 1
        # Xpr = Xsc  # if you want to see the scaled versions
        #print(f"{Xpr[i,:]!s:>28s} -> {pred:^6.0f} {desired:^6.0f} {result:^10s}") 
    print(f"\ncorrect predictions: {num_correct} out of {len(y)}, which is {num_correct/len(y):7.4f}.")
    


#
# let's see how it did on the test data (also the training data!)
#
ascii_table_for_classifier(X_train_scaled,
                           y_train_scaled,
                           nn_classifier,
                           scaler)   
#
# other things...
#
if False:  # do we want to see all of the parameters?
    nn = nn_classifier  # less to type?
    print("\n\n+++++ parameters, weights, etc. +++++\n")
    print(f"\nweights/coefficients:\n")
    for wts in nn.coefs_:
        print(wts)
    print(f"\nintercepts: {nn.intercepts_}")
    print(f"\nall parameters: {nn.get_params()}")


correct predictions: 361644 out of 1859938, which is  0.1944.


In [23]:
#
# for NNets, it's important to keep the feature values near 0, say -1. to 1. or so
#    This is done through the "StandardScaler" in scikit-learn
# 
USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# we "train the scaler"  (computes the mean and standard deviation)
if USE_SCALER == True:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_all)  # Scale with the training data! ave becomes 0; stdev becomes 1
else:
    # this one does no scaling!  We still create it to be consistent:
    scaler = StandardScaler(copy=True, with_mean=False, with_std=False) # no scaling
    scaler.fit(X_all)  # still need to fit, though it does not change...

scaler   # is now defined and ready to use...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Here are our scaled training and testing sets:

X_all_scaled = scaler.transform(X_all) # scale!
y_all_scaled = y_all  # the predicted/desired labels are not scaled


def ascii_table(X,y):
    """ print a table of binary inputs and outputs """
    print(f"{'input ':>58s} -> {'pred':<5s} {'des.':<5s}") 
    for i in range(len(y)):
        print(f"{X[i,:]!s:>58s} -> {'?':<5s} {y[i]:<5.0f}")   # !s is str ...
    
ascii_table(X_all_scaled[0:5,:], y_all_scaled[0:5])

                                                    input  -> pred  des. 
[ 0.03899406  0.03549253  0.04776377  0.04137425 -0.16919196 -1.41835118
 -1.62185841 -1.36060155] -> ?     5    
[-0.56650895 -0.56654721 -0.56551035 -0.5656374   0.53716061 -1.41835118
 -1.62185841 -1.36060155] -> ?     7    
[ 0.15528642  0.16120528  0.16379625  0.17222706 -0.10808076 -1.41835118
 -1.62185841 -1.36060155] -> ?     6    
[-0.30317384 -0.29743904 -0.29750362 -0.29190898 -0.17432285 -1.41835118
 -1.62185841 -1.36060155] -> ?     7    
[ 0.1888323   0.19988612  0.20058704  0.20577906 -0.13871294 -1.41835118
 -1.62185841 -1.36060155] -> ?     5    


In [25]:
from sklearn.neural_network import MLPClassifier

#
# Here's where you can change the number of hidden layers
# and number of neurons!
#
nn_classifier_final = MLPClassifier(hidden_layer_sizes=(16,8,4),  # 8 input ->  -> 1 output
                    max_iter=100,      # how many times to train
                    activation="tanh", # the "activation function" input -> output
                    solver='sgd',      # the algorithm for optimizing weights
                    verbose=True,      # False to "mute" the training
                    shuffle=True,      # reshuffle the training epochs?
                    random_state=None, # set for reproduceability
                    learning_rate_init=.1,       # learning rate: % of error to backprop
                    learning_rate = 'adaptive')  # soften feedback as it converges

# documentation:
# scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html 
#     Try verbose / activation "relu" / other network sizes ...

print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_classifier_final.fit(X_all_scaled, y_all_scaled)
print("\n++++++++++  TRAINING:   end  +++++++++++++++")
print(f"The analog prediction error (the loss) is {nn_classifier_final.loss_}")



++++++++++  TRAINING:  begin  +++++++++++++++


Iteration 1, loss = 2.25603226
Iteration 2, loss = 2.23932127
Iteration 3, loss = 2.22949449
Iteration 4, loss = 2.22317488
Iteration 5, loss = 2.21951116
Iteration 6, loss = 2.21758905
Iteration 7, loss = 2.21606429
Iteration 8, loss = 2.21477007
Iteration 9, loss = 2.21372226
Iteration 10, loss = 2.21279148
Iteration 11, loss = 2.21171109
Iteration 12, loss = 2.21100545
Iteration 13, loss = 2.20985326
Iteration 14, loss = 2.20952590
Iteration 15, loss = 2.20900038
Iteration 16, loss = 2.20844567
Iteration 17, loss = 2.20841703
Iteration 18, loss = 2.20791978
Iteration 19, loss = 2.20747097
Iteration 20, loss = 2.20719127
Iteration 21, loss = 2.20740549
Iteration 22, loss = 2.20694187
Iteration 23, loss = 2.20731050
Iteration 24, loss = 2.20712924
Iteration 25, loss = 2.20639371
Iteration 26, loss = 2.20633515
Iteration 27, loss = 2.20647400
Iteration 28, loss = 2.20619339
Iteration 29, loss = 2.20620025
Iteration 30, loss = 2.20751321

NameError: name 'nn_classifier' is not defined

In [27]:
def ascii_table_for_classifier(Xsc,y,nn,scaler):
    """ a table including predictions using nn.predict """
    predictions = nn.predict(Xsc)            # all predictions
    prediction_probs = nn.predict_proba(Xsc) # all prediction probabilities
    Xpr = scaler.inverse_transform(Xsc)      # Xpr is the "X to print": unscaled data!
    # count correct
    num_correct = 0
    # printing
    # print(f"{'input ':>28s} -> {'pred':^6s} {'des.':^6s}") 
    for i in range(len(y)):
        pred = predictions[i]
        pred_probs = prediction_probs[i,:]
        desired = y[i]
        if pred != desired: result = "  incorrect: " + str(pred_probs)
        else: result = "  correct"; num_correct += 1
        # Xpr = Xsc  # if you want to see the scaled versions
        #print(f"{Xpr[i,:]!s:>28s} -> {pred:^6.0f} {desired:^6.0f} {result:^10s}") 
    print(f"\ncorrect predictions: {num_correct} out of {len(y)}, which is {num_correct/len(y):7.4f}.")

In [29]:
#
# for NNets, it's important to keep the feature values near 0, say -1. to 1. or so
#    This is done through the "StandardScaler" in scikit-learn
# 
USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# we "train the scaler"  (computes the mean and standard deviation)
if USE_SCALER == True:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)  # Scale with the training data! ave becomes 0; stdev becomes 1
else:
    # this one does no scaling!  We still create it to be consistent:
    scaler = StandardScaler(copy=True, with_mean=False, with_std=False) # no scaling
    scaler.fit(X_train)  # still need to fit, though it does not change...

scaler   # is now defined and ready to use...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Here are our scaled training and testing sets:

X_all_test_scaled = scaler.transform(X_all_test) # scale!

y_all_test_scaled = y_all_test  # the predicted/desired labels are not scaled


def ascii_table(X,y):
    """ print a table of binary inputs and outputs """
    print(f"{'input ':>58s} -> {'pred':<5s} {'des.':<5s}") 
    for i in range(len(y)):
        print(f"{X[i,:]!s:>58s} -> {'?':<5s} {y[i]:<5.0f}")   # !s is str ...
    
ascii_table(X_all_test_scaled[0:5,:], y_all_test_scaled[0:5])

                                                    input  -> pred  des. 
[ 0.10873142  0.09862812  0.11468033  0.10581479 -0.1740224   1.42605144
  1.62786051 -1.13150724] -> ?     4    
[-0.5598951  -0.56032328 -0.55851038 -0.56070044  0.16939298  1.42605144
  1.62786051 -1.13150724] -> ?     3    
[-0.06304125 -0.06562582 -0.05666186 -0.0603943  -0.14430156  1.42605144
  1.62786051 -1.13150724] -> ?     3    
[-0.38140819 -0.38334934 -0.37838867 -0.38301901 -0.15568185  1.42605144
  1.62786051 -1.13150724] -> ?     1    
[-0.35091434 -0.34657195 -0.3458195  -0.34748272 -0.17470827  1.42605144
  1.62786051 -1.13150724] -> ?     5    


In [30]:
# Test with extra Data
#
# let's see how it did on the extra test data 
#
ascii_table_for_classifier(X_all_test_scaled,
                           y_all_test_scaled,
                           nn_classifier_final,
                           scaler) 


correct predictions: 18710 out of 111716, which is  0.1675.


### Two Classes
|          |   KNN   | Decision Tree | Random Forest| Neural Network |
| -------- | ------- |  -----------  |  ----------- |  -----------   |
| Training | 51.07 % |    66.96 %    |    53.46 %   |     58.45 %    |
| Test     | 51.87 % |    47.15 %    |    47.15 %   |     50.87 %    |


### Ten Classes
|          |   KNN   | Decision Tree | Random Forest| Neural Network |
| -------- | ------- |  -----------  |  ----------- |  -----------   |
| Training | 12.35 % |    20.23 %    |    15.66 %   |     19.44 %    |
| Test     | 12.08 % |    10.78 %    |    14.26 %   |     16.75 %    |

### Add 2022 Data into training data (Ten classes)

In [34]:
# Combine all data 
# Jan 2017 - Feb 2022

X_together = np.concatenate((X_all, X_all_test))
y_together = np.concatenate((y_all, y_all_test))

In [39]:
# We next separate into test data and training data ... 
#    + We will train on the training data...
#    + We will _not_ look at the testing data to build the model
#
# Then, afterward, we will test on the testing data -- and see how well we do!
#

#
# a common convention:  train on 80%, test on 20%    Let's define the TEST_PERCENT
#

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_together, y_together, test_size=0.2, random_state=42)

print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows\n" )

print(f"Held-out data... (testing data: {len(y_test)})")
print(f"y_test: {y_test}\n")
print(f"X_test (few rows): {X_test[0:5,:]}")  # 5 rows
print()
print(f"Data used for modeling... (training data: {len(y_train)})")
print(f"y_train: {y_train}\n")
print(f"X_train (few rows): {X_train[0:5,:]}")  # 5 rows

training with 1949311 rows;  testing with 487328 rows

Held-out data... (testing data: 487328)
y_test: [5. 0. 6. ... 3. 4. 7.]

X_test (few rows): [[1.7610e+03 1.7750e+03 1.7600e+03 1.7680e+03 1.0297e+06 2.0170e+03
  3.0000e+00 1.6000e+01]
 [4.8300e+02 4.8900e+02 4.8100e+02 4.8600e+02 2.7007e+06 2.0180e+03
  7.0000e+00 3.0000e+01]
 [5.4010e+03 5.4260e+03 5.3290e+03 5.3910e+03 1.2346e+06 2.0190e+03
  9.0000e+00 2.7000e+01]
 [9.1600e+02 9.2100e+02 9.0900e+02 9.1400e+02 1.7810e+05 2.0170e+03
  5.0000e+00 1.8000e+01]
 [2.5950e+03 2.6340e+03 2.5860e+03 2.5960e+03 1.0650e+05 2.0210e+03
  6.0000e+00 1.4000e+01]]

Data used for modeling... (training data: 1949311)
y_train: [6. 3. 0. ... 0. 1. 5.]

X_train (few rows): [[1.3940e+03 1.3970e+03 1.3850e+03 1.3860e+03 2.6000e+03 2.0170e+03
  6.0000e+00 2.2000e+01]
 [1.8160e+03 1.8550e+03 1.7780e+03 1.8330e+03 1.9140e+05 2.0190e+03
  3.0000e+00 1.0000e+00]
 [2.9380e+03 3.0150e+03 2.9350e+03 2.9800e+03 4.9338e+06 2.0180e+03
  1.0000e+01 1.1000e+01]
 [

### KNN

In [40]:
#
# to do this, we use "cross validation"
#

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
#
# cross-validation splits the training set into two pieces:
#   + model-building and model-validation. We'll use "build" and "validate"
#
best_k = 84  # Not correct!
best_accuracy = 0.0  # also not correct...

# Note that we are cross-validating using only our TEST data!
for k in range(1,20):
    knn_cv_model = KNeighborsClassifier(n_neighbors=k)   # build knn_model for every k!
    cv_scores = cross_val_score( knn_cv_model, X_train, y_train, cv=5 )  # cv=5 means 80/20
    print(cv_scores)  # just to see the five scores... 
    average_cv_accuracy = cv_scores.mean()  # mean() is numpy's built-in average function 
    print(f"k: {k:2d}  cv accuracy: {average_cv_accuracy:7.4f}")

    
# assign best value of k to best_k
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_k = k      # at the moment this is incorrect   
# you'll need to use the loop above to find and remember the real best_k

print(f"best_k = {best_k}   yields the highest average cv accuracy.")  # print the best one

[0.11206899 0.11167769 0.11145482 0.11174781 0.11143003]
k:  1  cv accuracy:  0.1117
[0.11245117 0.11233946 0.11138044 0.11213256 0.11134025]
k:  2  cv accuracy:  0.1119
[0.11066082 0.11113648 0.10990812 0.11051147 0.1097525 ]
k:  3  cv accuracy:  0.1104
[0.11070442 0.11182646 0.11053142 0.11067819 0.10990384]
k:  4  cv accuracy:  0.1107
[0.11413123 0.11368865 0.11279889 0.11288668 0.11219697]
k:  5  cv accuracy:  0.1131
[0.11618065 0.11557137 0.11506379 0.11546966 0.11486973]
k:  6  cv accuracy:  0.1154
[0.1177145  0.1170052  0.11647711 0.11680347 0.11676786]
k:  7  cv accuracy:  0.1170
[0.11815568 0.11743869 0.11664897 0.11731134 0.11699358]
k:  8  cv accuracy:  0.1173
[0.11833266 0.11726683 0.11721584 0.11732673 0.11719365]
k:  9  cv accuracy:  0.1175
[0.11885335 0.11797734 0.11761598 0.11762171 0.11732447]
k: 10  cv accuracy:  0.1179
[0.11927914 0.11826206 0.11842652 0.11798851 0.11808628]
k: 11  cv accuracy:  0.1184
[0.12015893 0.11924445 0.1190216  0.11914533 0.11901995]
k: 12  c

In [45]:
#
# With the best k, we build and train a new model:
#
# Now, we use best_k instead of the original, randomly-guessed value    
#
best_k = 19
from sklearn.neighbors import KNeighborsClassifier
knn_model_tuned = KNeighborsClassifier(n_neighbors=best_k)   # here, we use the best_k!

# we train the model (one line!)
knn_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print(f"Created + trained a knn classifier, now tuned with a (best) k of {best_k}")  

# How does it do?!  The next cell will show...

Created + trained a knn classifier, now tuned with a (best) k of 19


In [53]:
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = knn_model_tuned.predict(X_test)
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.\n\n")

Predicted labels: [4. 9. 4. ... 6. 3. 5.]
Actual labels: [5. 0. 6. ... 3. 4. 7.]

Results on test set:  60573 correct out of 487328 total.




In [52]:
#
# Predictions on data in 2022
#

predicted_labels = predicted_labels = knn_model_tuned.predict(X_test)
correct_2022 = 0
total_2022 = 0
for i in range(len(X_test)):
    if X_test[i][5] == 2022:
        if y_test[i] == predicted_labels[i]:
            correct_2022 += 1
        total_2022 += 1

print(f"\nResults on test set:  {correct_2022} correct out of {total_2022} total.\n\n")


Results on test set:  1834 correct out of 14774 total.




### Decision Tree

In [41]:
#
# To compare different tree-depths, we use cross validation
#
from sklearn import tree      # for decision trees

best_d = 1
best_accuracy = 0.0

for d in range(1,20):
    cv_model = tree.DecisionTreeClassifier(max_depth=d)   # for each depth, d
    cv_scores = cross_val_score( cv_model, X_train, y_train, cv=5 ) # 5 means 80/20 split
    # print(cv_scores)  # we usually don't want to see the five individual scores 
    average_cv_accuracy = cv_scores.mean()  # more likely, only their average
    print(f"depth: {d:2d}  cv accuracy: {average_cv_accuracy:7.4f}")
    
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_d = d

    
    
# assign best value of d to best_depth
best_depth = best_d   # may have to hand-tune this, depending on what happens...
print()
print(f"best_depth = {best_depth} is our choice for an underfitting/overfitting balance.") 

depth:  1  cv accuracy:  0.1539
depth:  2  cv accuracy:  0.1539
depth:  3  cv accuracy:  0.1539
depth:  4  cv accuracy:  0.1563
depth:  5  cv accuracy:  0.1625
depth:  6  cv accuracy:  0.1667
depth:  7  cv accuracy:  0.1708
depth:  8  cv accuracy:  0.1766
depth:  9  cv accuracy:  0.1806
depth: 10  cv accuracy:  0.1858
depth: 11  cv accuracy:  0.1917
depth: 12  cv accuracy:  0.1979
depth: 13  cv accuracy:  0.2042
depth: 14  cv accuracy:  0.2086
depth: 15  cv accuracy:  0.2114
depth: 16  cv accuracy:  0.2122
depth: 17  cv accuracy:  0.2116
depth: 18  cv accuracy:  0.2099
depth: 19  cv accuracy:  0.2071

best_depth = 16 is our choice for an underfitting/overfitting balance.


In [54]:
#
# Now, we re-create and re-run the  "Model-building and -training Cell"
#
# this time, with the best depth, best_d, found by cross-validation model tuning:
#
best_depth = 16
# we should have best_depth from our cv exploration
dtree_model_tuned = tree.DecisionTreeClassifier(max_depth=best_depth)

# we train the model (it's one line!)
dtree_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print("Created and trained a DT classifier with max depth =", best_depth) 

Created and trained a DT classifier with max depth = 16


In [55]:
#
# +++ This cell will "Model-testing Cell"
#
# Now, let's see how well our model does on our "held-out data" (the testing data)
#

# We run our test set:

# the function knn_model.predict is the instantiation of our model
# it's what runs the k-nearest-neighbors algorithm:
predicted_labels = dtree_model_tuned.predict(X_test)   
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual  labels  :", actual_labels)

# And, some overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total, which is {num_correct/total:7.4f}.")

Predicted labels: [4. 9. 8. ... 9. 4. 6.]
Actual  labels  : [5. 0. 6. ... 3. 4. 7.]

Results on test set:  102964 correct out of 487328 total, which is  0.2113.


In [56]:
#
# Predictions on data in 2022
#

predicted_labels = predicted_labels = dtree_model_tuned.predict(X_test)
correct_2022 = 0
total_2022 = 0
for i in range(len(X_test)):
    if X_test[i][5] == 2022:
        if y_test[i] == predicted_labels[i]:
            correct_2022 += 1
        total_2022 += 1

print(f"\nResults on test set:  {correct_2022} correct out of {total_2022} total.\n\n")


Results on test set:  3450 correct out of 14774 total.




In [66]:
print(dtree_model_tuned.feature_importances_)
print()

# let's see them with each feature name:
IMPs = dtree_model_tuned.feature_importances_

# enumerate is great when you want indices _and_ elements!
for i, importance in enumerate(IMPs):
    perc = importance*100
    print(f"Feature {COLUMNS[i]:>12s} has {perc:>7.2f}% of the decision-making importance.")

[0.04993694 0.05450757 0.05258089 0.05129034 0.14691065 0.05825783
 0.22482789 0.36168789]

Feature         Open has    4.99% of the decision-making importance.
Feature         High has    5.45% of the decision-making importance.
Feature          Low has    5.26% of the decision-making importance.
Feature        Close has    5.13% of the decision-making importance.
Feature       Volume has   14.69% of the decision-making importance.
Feature         Year has    5.83% of the decision-making importance.
Feature        Month has   22.48% of the decision-making importance.
Feature          Day has   36.17% of the decision-making importance.


### Random Forest

In [44]:
#
# So, to compare different parameters, let's use cv
#
from sklearn import ensemble  # for random forests, an ensemble classifier

best_d = 1
best_ntrees = 10   
best_accuracy = 0

for d in range(1,10):
    for ntrees in range(10,60,20):
        rforest_model = ensemble.RandomForestClassifier(max_depth=d, 
                                                        n_estimators=ntrees,
                                                        verbose=1)
        cv_scores = cross_val_score( rforest_model, X_train, y_train, cv=5 ) # 5 means 80/20 split
        average_cv_accuracy = cv_scores.mean()  # more likely, only their average
        print(f"depth: {d:2d} ntrees: {ntrees:3d} cv accuracy: {average_cv_accuracy:7.4f}")

        if average_cv_accuracy > best_accuracy:
            best_d = d
            best_ntrees = ntrees
            best_accuracy = average_cv_accuracy


best_depth = best_d   
best_num_trees = best_ntrees


print()
print(f"best_depth: {best_depth} and best_num_trees: {best_num_trees} are our choices.")  

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   10.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   10.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   11.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


depth:  1 ntrees:  10 cv accuracy:  0.1539


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   28.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   26.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   25.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   22.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   23.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.9s finished


depth:  1 ntrees:  30 cv accuracy:  0.1539


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   35.5s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   36.5s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   36.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   36.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   35.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished


depth:  1 ntrees:  50 cv accuracy:  0.1539


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   13.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   13.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


depth:  2 ntrees:  10 cv accuracy:  0.1539


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   38.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   39.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   39.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   39.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   38.6s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished


depth:  2 ntrees:  30 cv accuracy:  0.1539


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.6s finished


depth:  2 ntrees:  50 cv accuracy:  0.1539


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.6s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   19.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   19.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


depth:  3 ntrees:  10 cv accuracy:  0.1543


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   53.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   55.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   56.7s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   54.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   55.0s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished


depth:  3 ntrees:  30 cv accuracy:  0.1539


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.7s finished


depth:  3 ntrees:  50 cv accuracy:  0.1539


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   23.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   24.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   22.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   23.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   23.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


depth:  4 ntrees:  10 cv accuracy:  0.1560


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.1s finished


depth:  4 ntrees:  30 cv accuracy:  0.1558


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.8s finished


depth:  4 ntrees:  50 cv accuracy:  0.1567


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   28.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   28.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   28.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   28.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   29.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


depth:  5 ntrees:  10 cv accuracy:  0.1598


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished


depth:  5 ntrees:  30 cv accuracy:  0.1592


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.0s finished


depth:  5 ntrees:  50 cv accuracy:  0.1601


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   34.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   34.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   33.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   35.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   35.6s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


depth:  6 ntrees:  10 cv accuracy:  0.1615


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.2s finished


depth:  6 ntrees:  30 cv accuracy:  0.1615


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.6s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.0min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.1s finished


depth:  6 ntrees:  50 cv accuracy:  0.1609


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   39.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   41.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   41.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   43.7s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   42.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


depth:  7 ntrees:  10 cv accuracy:  0.1645


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.3s finished


depth:  7 ntrees:  30 cv accuracy:  0.1644


KeyboardInterrupt: 

In [57]:
#
# Now, we re-create and re-run the  "Model-building and -training Cell"
#

best_depth = 7
best_num_trees = 10
# we should have best_depth and best_num_trees
rforest_model_tuned = ensemble.RandomForestClassifier(max_depth=best_depth, 
                                                      n_estimators=best_num_trees)

# we train the model (it's one line!)
rforest_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print(f"Built an RF classifier with depth={best_depth} and ntrees={best_num_trees}") 

Built an RF classifier with depth=7 and ntrees=10


In [58]:
#
# +++ This is our "Model-testing Cell"
#
# Now, let's see how well we did on our "held-out data" (the testing data)
#

# We run our test set!
predicted_labels = rforest_model_tuned.predict(X_test)
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual  labels  :", actual_labels)

# And, some overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.")

Predicted labels: [4. 4. 4. ... 9. 4. 4.]
Actual  labels  : [5. 0. 6. ... 3. 4. 7.]

Results on test set:  81443 correct out of 487328 total.


In [59]:
#
# Predictions on data in 2022
#

predicted_labels = rforest_model_tuned.predict(X_test)
correct_2022 = 0
total_2022 = 0
for i in range(len(X_test)):
    if X_test[i][5] == 2022:
        if y_test[i] == predicted_labels[i]:
            correct_2022 += 1
        total_2022 += 1

print(f"\nResults on test set:  {correct_2022} correct out of {total_2022} total.\n\n")


Results on test set:  2414 correct out of 14774 total.




In [65]:
#
# feature importances are often even more "important" than predictions...
#
#    Random forests can provide a much "smoother" measure of feature importance, since
#                   they integrate over so many individual models (each tree)
#
#    That is, it's much less likely that a feature will have 0% importance, 
#             unless it never varies
#

print(rforest_model_tuned.feature_importances_)
print()

# let's see them with each feature name:
IMPs = rforest_model_tuned.feature_importances_

# enumerate is great when you want indices _and_ elements!
for i, importance in enumerate(IMPs):
    perc = importance*100
    print(f"Feature {COLUMNS[i]:>12s} has {perc:>7.2f}% of the decision-making importance.")

[0.02704365 0.03163143 0.05063644 0.04105016 0.36604529 0.30715154
 0.07965982 0.09678167]

Feature         Open has    2.70% of the decision-making importance.
Feature         High has    3.16% of the decision-making importance.
Feature          Low has    5.06% of the decision-making importance.
Feature        Close has    4.11% of the decision-making importance.
Feature       Volume has   36.60% of the decision-making importance.
Feature         Year has   30.72% of the decision-making importance.
Feature        Month has    7.97% of the decision-making importance.
Feature          Day has    9.68% of the decision-making importance.


### Neural Network

In [42]:
#
# for NNets, it's important to keep the feature values near 0, say -1. to 1. or so
#    This is done through the "StandardScaler" in scikit-learn
# 
USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# we "train the scaler"  (computes the mean and standard deviation)
if USE_SCALER == True:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)  # Scale with the training data! ave becomes 0; stdev becomes 1
else:
    # this one does no scaling!  We still create it to be consistent:
    scaler = StandardScaler(copy=True, with_mean=False, with_std=False) # no scaling
    scaler.fit(X_train)  # still need to fit, though it does not change...

scaler   # is now defined and ready to use...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Here are our scaled training and testing sets:

X_train_scaled = scaler.transform(X_train) # scale!
X_test_scaled = scaler.transform(X_test) # scale!

y_train_scaled = y_train  # the predicted/desired labels are not scaled
y_test_scaled = y_test  # not using the scaler

def ascii_table(X,y):
    """ print a table of binary inputs and outputs """
    print(f"{'input ':>58s} -> {'pred':<5s} {'des.':<5s}") 
    for i in range(len(y)):
        print(f"{X[i,:]!s:>58s} -> {'?':<5s} {y[i]:<5.0f}")   # !s is str ...
    
ascii_table(X_train_scaled[0:5,:],y_train_scaled[0:5])

                                                    input  -> pred  des. 
[-0.33353667 -0.33759165 -0.33084057 -0.33566615 -0.17773678 -1.42468379
 -0.12196947  0.7022332 ] -> ?     6    
[-0.21692752 -0.21250959 -0.22088545 -0.21210996 -0.12889012 -0.07870281
 -0.98203158 -1.71050173] -> ?     3    
[ 0.09310911  0.10429214  0.10282464  0.10493468  1.09807162 -0.7516933
  1.02478001 -0.56158033] -> ?     0    
[-0.16332048 -0.17017832 -0.16017232 -0.16594914 -0.16847454 -0.07870281
  1.59815476  0.35755678] -> ?     4    
[ 0.89224096  0.90039304  0.9100015   0.90652968 -0.08601994  1.26727816
 -1.55540633  0.24266464] -> ?     2    


In [43]:
from sklearn.neural_network import MLPClassifier

#
# Here's where you can change the number of hidden layers
# and number of neurons!
#
nn_classifier = MLPClassifier(hidden_layer_sizes=(16,8,4),  # 8 input ->  -> 1 output
                    max_iter=100,      # how many times to train
                    activation="tanh", # the "activation function" input -> output
                    solver='sgd',      # the algorithm for optimizing weights
                    verbose=True,      # False to "mute" the training
                    shuffle=True,      # reshuffle the training epochs?
                    random_state=None, # set for reproduceability
                    learning_rate_init=.1,       # learning rate: % of error to backprop
                    learning_rate = 'adaptive')  # soften feedback as it converges

# documentation:
# scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html 
#     Try verbose / activation "relu" / other network sizes ...

print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_classifier.fit(X_train_scaled, y_train_scaled)
print("\n++++++++++  TRAINING:   end  +++++++++++++++")
print(f"The analog prediction error (the loss) is {nn_classifier.loss_}")



++++++++++  TRAINING:  begin  +++++++++++++++


Iteration 1, loss = 2.25806829
Iteration 2, loss = 2.24424242
Iteration 3, loss = 2.23463387
Iteration 4, loss = 2.22816392
Iteration 5, loss = 2.22408751
Iteration 6, loss = 2.22111155
Iteration 7, loss = 2.21915945
Iteration 8, loss = 2.21800190
Iteration 9, loss = 2.21666596
Iteration 10, loss = 2.21611800
Iteration 11, loss = 2.21587675
Iteration 12, loss = 2.21496308
Iteration 13, loss = 2.21428994
Iteration 14, loss = 2.21376214
Iteration 15, loss = 2.21473252
Iteration 16, loss = 2.21389073
Iteration 17, loss = 2.21323910
Iteration 18, loss = 2.21373462
Iteration 19, loss = 2.21329928
Iteration 20, loss = 2.21259369
Iteration 21, loss = 2.21199246
Iteration 22, loss = 2.21406178
Iteration 23, loss = 2.21207935
Iteration 24, loss = 2.21187909
Iteration 25, loss = 2.21138981
Iteration 26, loss = 2.21194595
Iteration 27, loss = 2.21163981
Iteration 28, loss = 2.21046230
Iteration 29, loss = 2.21273963
Iteration 30, loss = 2.21327557

In [67]:
#
# how did it do on the training data?
#

#
# which one do we want: classifier or regressor?
#

def ascii_table_for_classifier(Xsc,y,nn,scaler):
    """ a table including predictions using nn.predict """
    predictions = nn.predict(Xsc)            # all predictions
    prediction_probs = nn.predict_proba(Xsc) # all prediction probabilities
    Xpr = scaler.inverse_transform(Xsc)      # Xpr is the "X to print": unscaled data!
    # count correct
    num_correct = 0
    # printing
    # print(f"{'input ':>28s} -> {'pred':^6s} {'des.':^6s}") 
    for i in range(len(y)):
        pred = predictions[i]
        pred_probs = prediction_probs[i,:]
        desired = y[i]
        if pred != desired: result = "  incorrect: " + str(pred_probs)
        else: result = "  correct"; num_correct += 1
        # Xpr = Xsc  # if you want to see the scaled versions
        #print(f"{Xpr[i,:]!s:>28s} -> {pred:^6.0f} {desired:^6.0f} {result:^10s}") 
    print(f"\ncorrect predictions: {num_correct} out of {len(y)}, which is {num_correct/len(y):7.4f}.")
    


#
# let's see how it did on the test data (also the training data!)
#
ascii_table_for_classifier(X_train_scaled,
                           y_train_scaled,
                           nn_classifier,
                           scaler)   
#
# other things...
#
if False:  # do we want to see all of the parameters?
    nn = nn_classifier  # less to type?
    print("\n\n+++++ parameters, weights, etc. +++++\n")
    print(f"\nweights/coefficients:\n")
    for wts in nn.coefs_:
        print(wts)
    print(f"\nintercepts: {nn.intercepts_}")
    print(f"\nall parameters: {nn.get_params()}")


correct predictions: 377789 out of 1949311, which is  0.1938.


In [68]:
#
# let's see how it did on the testing data 
#
ascii_table_for_classifier(X_test_scaled,
                           y_test_scaled,
                           nn_classifier,
                           scaler)   
#
# other things...
#
if False:  # do we want to see all of the parameters?
    nn = nn_classifier  # less to type?
    print("\n\n+++++ parameters, weights, etc. +++++\n")
    print(f"\nweights/coefficients:\n")
    for wts in nn.coefs_:
        print(wts)
    print(f"\nintercepts: {nn.intercepts_}")
    print(f"\nall parameters: {nn.get_params()}")


correct predictions: 93860 out of 487328, which is  0.1926.


In [69]:
#
# Predictions on data in 2022
#

predicted_labels = nn_classifier.predict(X_test_scaled)
correct_2022 = 0
total_2022 = 0
for i in range(len(X_test)):
    if X_test[i][5] == 2022:
        if y_test[i] == predicted_labels[i]:
            correct_2022 += 1
        total_2022 += 1

print(f"\nResults on test set:  {correct_2022} correct out of {total_2022} total.\n\n")


Results on test set:  2652 correct out of 14774 total.




### Training with 2022 data (ten classes)

|          |   KNN   | Decision Tree | Random Forest| Neural Network |
| -------- | ------- |  -----------  |  ----------- |  -----------   |
| Training | 12.27 % |    21.22 %    |    16.45 %   |     19.38 %    |
| Testing  | 12.43 % |    21.13 %    |    16.71 %   |     19.26 %    |
|   2022   | 12.41 % |    23.35 %    |    16.34 %   |     17.95 %    |
| 2022 (original) | 12.08 % |    10.78 %    |    14.26 %   |     16.75 %    |