> __Purpose__: To run various regression ML algorithms, find best performers, and further optimize by tuning the hyperparameters of successful models.
<br>

> To Do: 
 -  Read through the assumptions of each model to determine which should be most applicable
 - Grid/random search?
 - Functionalize current code so it can easily scale up for more data

In [1]:
import pandas as pd
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time
import datetime
import os

# Global matplotlib font sizes: default text and axes titles use the larger
# size; axis labels, tick labels and legend entries use the smaller one.
title_font_size = 30
label_font_size = 20

plt.rcParams.update({
    'font.size': title_font_size,        # default text size
    'axes.titlesize': title_font_size,   # axes title
    'axes.labelsize': label_font_size,   # x and y axis labels
    'xtick.labelsize': label_font_size,  # x tick labels
    'ytick.labelsize': label_font_size,  # y tick labels
    'legend.fontsize': label_font_size,  # legend entries
})

In [2]:
# This really ought to be functionalized, especially as more data comes in

# Raw .mat recordings
mat_94b = loadmat('data_94b.mat')
mat_95q = loadmat('data_95q.mat')
# Regression labels stored alongside them
y_train_reg95 = np.load(os.path.join('Labels', '95_reg.npy'))
y_train_reg94b = np.load(os.path.join('Labels', '94b_reg.npy'))

# Parallel lists: all_mats[i] is the recording whose label file prefix is mat_names[i]
all_mats = [mat_94b, mat_95q]
mat_names = ["94b", "95"]
num_mats = len(all_mats)

# Per-recording containers, filled in by the loop below
num_vessels_lst = [0] * num_mats
m_rICT = [0] * num_mats
t = [0] * num_mats
labels = [0] * num_mats

# Largest number of rows seen in any rICT matrix (used later for zero padding)
running_max = 0
for i, (mat, mat_name) in enumerate(zip(all_mats, mat_names)):
    num_vessels_lst[i] = mat['names'].shape[1]

    m_rICT[i] = mat['rICT']
    running_max = max(running_max, m_rICT[i].shape[0])
    # mat['ROI'] is intentionally not loaded; it is not needed for now

    # Flatten the time array to 1-D using the length of its second axis
    m_t = mat['t']
    t[i] = m_t.reshape(m_t.shape[1])

    # Kept in `labels` because later cells index labels per recording
    y_train_reg = np.load(os.path.join('Labels', mat_name + '_reg.npy'))
    labels[i] = y_train_reg

In [3]:
# y_train_reg is whatever the final iteration of the loading loop left behind,
# i.e. the labels for the last entry of mat_names only.
print(y_train_reg.shape)

(7,)


In [4]:
# Build the model input (one row per rICT column, zero-padded to a common
# length) and the matching regression label vector.
padded_frames = []
label_chunks = [np.array([])]  # float seed so the concatenated labels stay float64

for i in range(len(num_vessels_lst)):
    # Zero pad along the rows of rICT up to the longest recording
    pad_rows = running_max - m_rICT[i].shape[0]
    if pad_rows > 0:
        zp_mat = np.zeros((pad_rows, num_vessels_lst[i]))
        zp_rict = np.concatenate((m_rICT[i], zp_mat))
    else:
        zp_rict = m_rICT[i]

    # Transpose so every column of rICT becomes one row of the final frame
    padded_frames.append(pd.DataFrame(np.transpose(zp_rict)))
    # Labels of the same recording, kept in the same order as the rows above
    label_chunks.append(labels[i])

    # Mean spacing of the first 25 time steps (assumes at least 26 samples)
    print(f"{i}: delta t is {(t[i][25] - t[i][0])/25}")

# Concatenate once instead of growing the frame inside the loop
if padded_frames:
    rict_df = pd.concat(padded_frames).reset_index(drop=True)
else:
    rict_df = pd.DataFrame()
reg_labels_npy = np.concatenate(label_chunks)

0: delta t is 0.14110399999998663
1: delta t is 0.00736


In [5]:
# Rows come from the transposed rICT columns of every recording; columns are
# the zero-padded sample positions.
print(rict_df.shape)
rict_df.head()

(14, 4000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
0,1.061179,1.036253,1.073106,0.970642,0.957931,0.929883,0.968243,0.960815,0.965519,1.005626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.017361,0.968635,1.013525,0.959467,0.932092,0.962371,0.985138,0.994388,0.998228,1.001761,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.114077,1.059771,1.07842,0.993976,0.951927,0.97959,1.005449,1.010853,1.028106,1.053987,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.096296,1.021505,1.078318,0.988432,0.953285,0.96124,0.990814,0.992204,1.000505,1.047153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.147843,1.105764,1.058356,1.011027,0.93599,0.957186,0.957919,0.988637,0.96237,1.0265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Make training df

In [6]:
# Independent, transposed copy of rict_df (rows and columns swapped).
# A later cell transposes it back into x_train_t before splitting.
x_train = rict_df.copy(deep=True).transpose()
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.061179,1.017361,1.114077,1.096296,1.147843,1.121544,1.10988,1.104496,1.065481,1.096309,1.066983,1.033309,1.0509,1.02068
1,1.036253,0.968635,1.059771,1.021505,1.105764,1.083047,1.038855,1.070832,1.069529,1.11627,1.065764,0.971102,1.073872,1.043329
2,1.073106,1.013525,1.07842,1.078318,1.058356,1.08118,1.069371,1.021953,1.009902,0.998641,1.011728,0.99218,1.027361,1.014699
3,0.970642,0.959467,0.993976,0.988432,1.011027,1.045294,0.96151,1.018994,0.991405,1.013847,1.043583,0.970549,1.035323,1.03549
4,0.957931,0.932092,0.951927,0.953285,0.93599,0.973522,0.945293,1.026853,1.005429,1.053537,1.05256,1.00033,1.069433,1.061703


## Make train-test split


In [7]:
# Swap rows and columns of x_train again, restoring the rict_df orientation
x_train_t = x_train.T
y_train_reg = reg_labels_npy

# Decimal places used when rounding the reported scores
num_dps = 3

# Maps a model's display name to its rounded score
acc_log = {}

In [8]:
# Need more data before I can actually do this

from sklearn.model_selection import train_test_split

## TRAIN / TEST
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable.
# Stratify might be good to ensure that all classes are represented, I'm not sure if it'll do that by default
# NOTE(review): the targets are regression values, so stratifying would presumably need binned labels — TODO confirm
X_train, X_test, y_train, y_test = train_test_split(
    x_train_t, y_train_reg, test_size=0.3, random_state=42)

## TRAIN / VAL
# Split the remaining training rows once more, holding out 30% of them for validation.
# X_train / y_train are rebound here, so re-running only this statement shrinks them again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42)

# NOW UPDATE THE REST OF THE CODE TO REFLECT THIS
# NOTE(review): the model cells visible below fit and score on X_train only; X_val / X_test are not used yet.

## ML Modeling

In [9]:
# Machine learning
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

In [10]:
def fit_ml_algo(algo, x_train, y_train, cv):
    '''Fit an estimator and report its training and cross-validated scores.

    Parameters
    ----------
    algo : scikit-learn estimator (classifier or regressor); fitted in place.
    x_train : feature matrix passed to fit / score / cross_val_predict.
    y_train : target values aligned with x_train.
    cv : cross-validation spec forwarded to cross_val_predict.

    Returns
    -------
    train_pred : out-of-fold predictions from cross_val_predict.
    acc : algo.score on the training data, times 100, rounded to 2 decimals.
    acc_cv : cross-validated score times 100, rounded to 2 decimals.
        Accuracy for classifiers, R^2 for every other estimator.
    '''
    # Local import so the function does not rely on another cell importing it
    from sklearn.base import is_classifier

    model = algo.fit(x_train, y_train)
    acc = round(model.score(x_train, y_train) * 100, 2)

    # Out-of-fold predictions for every training row
    train_pred = model_selection.cross_val_predict(algo,
                                                  x_train,
                                                  y_train,
                                                  cv=cv,
                                                  n_jobs=-1)

    # accuracy_score rejects continuous predictions, so only use it for
    # classifiers; regressors are scored with R^2, matching model.score above.
    if is_classifier(algo):
        cv_score = metrics.accuracy_score(y_train, train_pred)
    else:
        cv_score = metrics.r2_score(y_train, train_pred)
    acc_cv = round(cv_score * 100, 2)

    return train_pred, acc, acc_cv

Optional preprocessing (standardizing the inputs and targets with `StandardScaler`); not sure yet whether it is worth it / beneficial.

In [11]:
#from sklearn.preprocessing import StandardScaler
#scalerX = StandardScaler().fit(X_train)
#scalery = StandardScaler().fit(y_train)

#X_train = scalerX.transform(X_train)
#y_train = scalery.transform(y_train)
#X_test = scalerX.transform(X_test)
#y_test = scalery.transform(y_test)

## Regression
> https://medium.com/analytics-vidhya/5-regression-algorithms-you-need-to-know-theory-implementation-37993382122d
1. Linear Regression
2. Neural Network Regression --> Use a linear activation function on the last layer
3. Decision Tree Regression
4. LASSO Regression --> Good for data that shows heavy multicollinearity (heavy correlation of features with each other)
5. Ridge Regression --> Also good for datasets that have an abundant amount of features which are not independent (collinearity) from one another
6. ElasticNet Regression
> https://www.jigsawacademy.com/popular-regression-algorithms-ml/
1. Random Forest
2. SVM
3. Gaussian Regression
4. Polynomial Regression
> https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
- Metrics we care about ^^
> https://towardsdatascience.com/cyclical-features-encoding-its-about-time-ce23581845ca
- Could do this to encode the time series with the data...

## Decision Tree
> Should also try gradient boosting these
 - Decision trees tend to overfit on data with a large number of features. Getting the right ratio of samples to number of features is important, since a tree with few samples in high dimensional space is very likely to overfit.
 - Visualize your tree as you are training by using the export function. Use max_depth=3 as an initial tree depth to get a feel for how the tree is fitting to your data, and then increase the depth.

In [12]:
# https://scikit-learn.org/stable/modules/tree.html

from sklearn import tree

# Fixed seed: the tree permutes features at each split, so an unseeded fit
# is not guaranteed to be repeatable between runs.
clf = tree.DecisionTreeRegressor(random_state=42)
clf = clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
# score() on a regressor is R^2; it is computed on the training rows here
# and reported under the "Accuracy" label.
model_acc = round(clf.score(X_train, y_train) * 100, num_dps)

print("ML Predictions:")
print(train_pred)
print("\nGround Truth:")
print(y_train)
print(f"\nAccuracy: {model_acc}")
acc_log["Decision Tree"] = model_acc

ML Predictions:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 100.0


## Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed: bootstrapping and feature sampling are random, so without it
# the predictions and score change on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
# score() on a regressor is R^2; it is computed on the training rows here
# and reported under the "Accuracy" label.
model_acc = round(model.score(X_train, y_train) * 100, num_dps)

print("ML Predictions:")
print(train_pred)
print("\nGround Truth:")
print(y_train)
print(f"\nAccuracy: {model_acc}")
acc_log["Random Forest"] = model_acc

ML Predictions:
[234.52   0.     3.43   2.63  23.21   5.26   0.   268.69   6.86]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 95.23


## LASSO Regression

In [14]:
from sklearn.linear_model import LassoCV

# Fit LASSO with its built-in cross-validated alpha search, then predict the training rows
model = LassoCV().fit(X_train, y_train)
train_pred = model.predict(X_train)
# Training-set R^2 as a percentage, reported under the "Accuracy" label
model_acc = round(100 * model.score(X_train, y_train), num_dps)

print("ML Predictions:", train_pred, "\nGround Truth:", y_train,
      f"\nAccuracy: {model_acc}", sep="\n")
acc_log["LASSO"] = model_acc

ML Predictions:
[ 2.62618930e+02  1.43827135e+01  4.44118564e+01  3.34240140e+01
  3.45940254e+01 -1.59049589e+01  1.62432751e-01  2.34177811e+02
 -1.86682403e+00]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 88.637


## Ridge Regression

In [15]:
from sklearn.linear_model import RidgeCV

# Fit ridge regression with its built-in cross-validated alpha search, then predict the training rows
model = RidgeCV().fit(X_train, y_train)
train_pred = model.predict(X_train)
# Training-set R^2 as a percentage, reported under the "Accuracy" label
model_acc = round(100 * model.score(X_train, y_train), num_dps)

print("ML Predictions:", train_pred, "\nGround Truth:", y_train,
      f"\nAccuracy: {model_acc}", sep="\n")
acc_log["Ridge"] = model_acc

ML Predictions:
[295.12321349   9.34227956   5.10550917  23.80465993  58.79141652
 -16.74191385 -19.34138687 257.61982448  -7.70360244]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 90.979


## SVR
 - Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data. For example, scale each attribute on the input vector X to [0,1] or [-1,+1], or standardize it to have mean 0 and variance 1. Note that the same scaling must be applied to the test vector to obtain meaningful results. This can be done easily by using a Pipeline:

In [31]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
# Kernel type, degree and a few boolean options can all be varied.

from sklearn.svm import SVR

# The default rbf kernel produced a negative score on this data,
# so the linear kernel is used instead.
model = SVR(kernel='linear').fit(X_train, y_train)
train_pred = model.predict(X_train)
# Training-set R^2 as a percentage, reported under the "Accuracy" label
model_acc = round(100 * model.score(X_train, y_train), num_dps)

print("ML Predictions:", train_pred, "\nGround Truth:", y_train,
      f"\nAccuracy: {model_acc}", sep="\n")
acc_log["Unscaled SVR"] = model_acc

ML Predictions:
[ 2.62899905e+02  6.31339663e+00 -1.00104373e-01  2.80410491e+01
  6.20746551e+01 -4.33190214e+00  1.00004206e-01  1.86873209e+02
  1.00325637e-01]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 80.088


In [32]:
# Same linear SVR, but with the features standardized inside a pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

regr = make_pipeline(
    StandardScaler(),
    SVR(kernel='linear', C=1.0, epsilon=0.2),
).fit(X_train, y_train)
train_pred = regr.predict(X_train)
# Training-set R^2 as a percentage, reported under the "Accuracy" label
model_acc = round(100 * regr.score(X_train, y_train), num_dps)

print("ML Predictions:", train_pred, "\nGround Truth:", y_train,
      f"\nAccuracy: {model_acc}", sep="\n")
acc_log["StandardScaler SVR"] = model_acc

ML Predictions:
[ 2.63199961e+02  1.99673919e-01  1.99594743e-01  2.00131431e-01
  1.99748822e-01 -1.99374933e-01 -1.99334112e-01  3.42800127e+02
 -2.00409471e-01]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 100.0


The poly, rbf, and sigmoid kernels all have negative accuracy (i.e. a negative `score`). The precomputed kernel requires additional inputs (a precomputed kernel matrix) to run.

## Gaussian Process Regressor

In [19]:
# https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html

from sklearn.datasets import make_friedman2
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

# Synthetic-data alternative, left disabled:
#X, y = make_friedman2(n_samples=500, noise=0, random_state=0)

# Dot-product kernel plus a white-noise term
kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
gpr.fit(X_train, y_train)
# Training-set score, displayed as the cell output (not stored in acc_log)
gpr.score(X_train, y_train)

-0.025626513253798278

In [20]:
# Predict on the training rows; return_std=True makes the result a
# (predicted mean, predicted standard deviation) pair of arrays.
gpr.predict(X_train, return_std=True)

(array([44.53574221, 44.11849288, 44.1791453 , 42.63530398, 44.02534796,
        42.45209289, 41.95527614, 44.85606303, 43.84890574]),
 array([140.42385404, 140.89889304, 141.0084064 , 142.42194456,
        140.5486791 , 143.39458686, 143.39339661, 140.42315312,
        140.87141146]))

## KNRegressor

In [21]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

from sklearn.neighbors import KNeighborsRegressor

# With a single neighbour, a training row is typically its own nearest
# neighbour, so a perfect training score is expected here.
model = KNeighborsRegressor(n_neighbors=1).fit(X_train, y_train)
train_pred = model.predict(X_train)
# Training-set R^2 as a percentage, reported under the "Accuracy" label
model_acc = round(100 * model.score(X_train, y_train), num_dps)

print("ML Predictions:", train_pred, "\nGround Truth:", y_train,
      f"\nAccuracy: {model_acc}", sep="\n")
acc_log["K-Nearest Regressor (1)"] = model_acc

ML Predictions:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 100.0


In [22]:
# Two neighbours (scikit-learn's default for n_neighbors is 5, not 2)

model = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)
train_pred = model.predict(X_train)
# Training-set R^2 as a percentage, reported under the "Accuracy" label
model_acc = round(100 * model.score(X_train, y_train), num_dps)

print("ML Predictions:", train_pred, "\nGround Truth:", y_train,
      f"\nAccuracy: {model_acc}", sep="\n")
acc_log["K-Nearest Regressor (2)"] = model_acc

ML Predictions:
[303.    0.    0.    0.    0.    0.    0.  171.5   0. ]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 78.761


In [23]:
# Three neighbours
model = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
train_pred = model.predict(X_train)
# Training-set R^2 as a percentage, reported under the "Accuracy" label
model_acc = round(100 * model.score(X_train, y_train), num_dps)

print("ML Predictions:", train_pred, "\nGround Truth:", y_train,
      f"\nAccuracy: {model_acc}", sep="\n")
acc_log["K-Nearest Regressor (3)"] = model_acc

ML Predictions:
[202.   0.   0.   0.   0.   0.   0. 202.   0.]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 83.836


In [24]:
# Use every training row as a neighbour. With the default uniform weights
# each prediction is then the mean of y_train, which gives an R^2 of 0.
max_neighbors = X_train.shape[0]

model = KNeighborsRegressor(n_neighbors=max_neighbors).fit(X_train, y_train)
train_pred = model.predict(X_train)
# Training-set R^2 as a percentage, reported under the "Accuracy" label
model_acc = round(100 * model.score(X_train, y_train), num_dps)

print("ML Predictions:", train_pred, "\nGround Truth:", y_train,
      f"\nAccuracy: {model_acc}", sep="\n")
acc_log[f"K-Nearest Regressor ({max_neighbors})"] = model_acc

ML Predictions:
[67.33333333 67.33333333 67.33333333 67.33333333 67.33333333 67.33333333
 67.33333333 67.33333333 67.33333333]

Ground Truth:
[263.   0.   0.   0.   0.   0.   0. 343.   0.]

Accuracy: 0.0


## ElasticNet Regression
>Does not converge, do not run

In [25]:
#from sklearn.linear_model import ElasticNetCV
#model = ElasticNetCV()
#model.fit(X_train, y_train)
#train_pred = model.predict(X_train)
#model_acc = round(model.score(X_train, y_train) * 100, num_dps)

#print("ML Predictions:")
#print(train_pred)
#print("\nGround Truth:")
#print(y_train)
#print(f"\nAccuracy: {round(model.score(X_train, y_train) * 100, 2)}")

# Accuracy Evaluation

In [28]:
# One row per logged model: the dict keys become the 'Algorithm' column and
# the logged scores the 'Accuracy' column.
acc_df = (
    pd.DataFrame.from_dict(acc_log, orient='index', columns=['Accuracy'])
    .reset_index()
    .rename(columns={'index': 'Algorithm'})
)
acc_df.head(100)

Unnamed: 0,Algorithm,Accuracy
0,Random Forest,95.23
1,LASSO,88.637
2,Ridge,90.979
3,K-Nearest Regressor (1),100.0
4,K-Nearest Regressor (2),78.761
5,K-Nearest Regressor (3),83.836
6,K-Nearest Regressor (9),0.0
