

# Crossvalidation and hyperparameter selection

In [1]:

import pandas as pd
import numpy as np
from sklearn import tree 
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes

# Calculate Generalized Error on Linear Regression with k-fold Cross Validation

In [2]:
df_X, s_y = load_diabetes(return_X_y = True, as_frame = True)
print(df_X.columns)

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')


## create a linear least squares regression model 


In [3]:
def get_linear_regression_model( df_X, s_y ):
    X = df_X.values
    X = np.insert(X,0,1,axis = 1)
    X = pd.DataFrame(X)
    beta_hat = np.linalg.lstsq(X,s_y)
    beta_hat = beta_hat[0]
    return beta_hat

In [4]:
# code to check beta_hat
np.random.seed(23)
beta_hat = get_linear_regression_model( pd.DataFrame(np.random.random((34,4))), pd.Series(np.random.random(34)*10.0) )
beta_hat

  beta_hat = np.linalg.lstsq(X,s_y)


array([ 4.18818425,  1.77890808,  0.74032569, -1.3506416 ,  0.14535984])

##  partition the dataframe and series data into dictionaries


In [5]:

def partition_data( df_X, s_y, k ):
    # your code here
    dict_x = {}
    dict_y = {}

    column = df_X.shape[0]
    indices = np.arange(column)
    np.random.shuffle(indices)
    
    df_X = df_X[indices]
    s_y = s_y[indices]
    fold_x = np.array_split(df_X,k)
    fold_y = np.array_split(s_y,k)
    
    for i in range(k):
        dict_x[i] = fold_x[i]
        dict_y[i] = fold_y[i]
    
    return (dict_x, dict_y)


In [6]:
df_X = df_X.to_numpy()
s_y = s_y.to_numpy()
(dict_k_df_X, dict_k_s_y) = partition_data( df_X, s_y, 5 )

In [7]:
# Check fold sizes
length = len(dict_k_df_X.values())
for x in range(length):
    print("Fold length of dataframe is " + str(len(dict_k_df_X[x])) + " and length of series is " + str(len(dict_k_s_y[x])))


Fold length of dataframe is 89 and length of series is 89
Fold length of dataframe is 89 and length of series is 89
Fold length of dataframe is 88 and length of series is 88
Fold length of dataframe is 88 and length of series is 88
Fold length of dataframe is 88 and length of series is 88


## calculate a regression metric



In [8]:
def get_mae( s_y, s_y_hat):
    # your code here
    n = len(s_y)
    MAE = 0
    for i in range(n):
        temp = abs(s_y[i] - s_y_hat[i]) / n
        MAE += temp
    return MAE


In [9]:
# Test it 
x = np.array([1,2,3])
y = np.array([2,2,3])
get_mae(x,y)

0.3333333333333333

## Calculate the $MAE$ for each fold


In [10]:
mae = np.array([])
for k in dict_k_df_X.keys():

    #set the current key as the test data
    outer_fold_x = dict_k_df_X[k]
    outer_fold_y = dict_k_s_y[k]
    
    #set the folds in each other key as the train data
    for j in dict_k_df_X.keys():
        if j != k:
            inner_fold_x = dict_k_df_X[j]
            inner_fold_y = dict_k_s_y[j]
            #print(inner_fold_x)
            #pd.DataFrame(inner_fold_x)
            for i in dict_k_df_X.keys():
                if (i != j and i !=k):

                    inner_fold_x = np.append(inner_fold_x, dict_k_df_X[i], axis = 0)
                    inner_fold_y = np.append(inner_fold_y, dict_k_s_y[i], axis = 0)
    
    #calculate beta hat, create matrix, then multiply for s_y_hat
    inner_fold_x = pd.DataFrame(inner_fold_x)
    inner_fold_y = pd.DataFrame(inner_fold_y)
    beta_hat = get_linear_regression_model(inner_fold_x, inner_fold_y)
    X = dict_k_df_X[k]
    X = np.insert(X, 0, 1, axis = 1)

    s_y_hat = np.matmul(X, beta_hat)
    
    mae = np.append( mae, get_mae(outer_fold_y,s_y_hat) )  

  beta_hat = np.linalg.lstsq(X,s_y)


In [11]:
print("The min MAE is {:.2f}, the max MAE is {:.2f}, and the mean MAE is {:.2f}".format(mae.min(),mae.max(),mae.mean()))

The min MAE is 41.87, the max MAE is 47.03, and the mean MAE is 44.40


# Find the best hyperparameter to use in a Decision Tree 

In [12]:
df_X, s_y = load_iris(return_X_y = True, as_frame=True)
print(len(df_X))

150


## Partition `df_X` and `s_y` 

In [13]:
df_X = df_X.to_numpy()
s_y = s_y.to_numpy()
(dict_k_df_X, dict_k_s_y) = partition_data(df_X, s_y, 5)
print("Rows in X: ")
for x in dict_k_df_X.values():
    print(len(x))
print("Rows in Y:")
for y in dict_k_s_y.values():
    print(len(y))
print("Data Points")
points = 0
for i in df_X:
    for j in i:
        points += 1
points

Rows in X: 
30
30
30
30
30
Rows in Y:
30
30
30
30
30
Data Points


600

## Define accuracy


In [14]:
def get_acc( s_1, s_2 ):
    # your code here
    s_1 = s_1.tolist()
    s_2 = s_2.tolist()
    
    length = len(s_1)
    acc = 0
    for i in range(length):
        if s_1[i] == s_2[i]:
            acc += 1
    return (acc/length)

In [15]:
get_acc(s_y,np.ones(len(s_y)))

0.3333333333333333

## Using Nested Cross validation, find the best hyperparameter


In [16]:
possible_min_impurity_decrease = np.array([0.1,0.25,0.3,0.4])

# Outer loop
outer_accuracy = np.array([])
for k in dict_k_df_X.keys():

    # your code here
    outer_x = dict_k_df_X[k]
    outer_y = dict_k_s_y[k]
    
    for j in dict_k_df_X.keys():
        if j != k:
            inner_x = dict_k_df_X[j]
            inner_y = dict_k_s_y[j]
            for i in dict_k_df_X.keys():
                if (i != j and i != k):
                    inner_x= np.append(inner_x, dict_k_df_X[i], axis = 0)
                    inner_y= np.append(inner_y, dict_k_s_y[i], axis = 0) 

    train_x, train_y = partition_data(inner_x, inner_y , 4)
    min_accuracy = 0
    min_impurity = 0
    # Inner loop cross validation code here (use 4 folds, where the fold does not include k)
    for pos_min_impurity in possible_min_impurity_decrease:
        print(pos_min_impurity)
        model = tree.DecisionTreeClassifier(criterion='gini', min_impurity_decrease = pos_min_impurity)
        accuracy_list = []
        for k in train_x.keys():
            test_x = train_x[k]
            test_y = train_y[k]
            for j in train_x.keys():
                if j != k:
                    train_array_X = train_x[j]
                    train_array_Y = train_y[j]
                    
                    for i in train_x.keys():
                        if (i != j and i !=k):
                            train_array_X= np.append(train_array_X, train_x[j], axis = 0)
                            train_array_Y= np.append(train_array_Y, train_y[j], axis = 0)

            model.fit(train_array_X, train_array_Y)
            predicted_y = model.predict(test_x)
            accuracy_list.append(get_acc(test_y,predicted_y))
        accuracy_avg = sum(accuracy_list)/len(accuracy_list)
        # Use best min impurity decrease to train model

        print("Average accuracy over 4 folds is:", accuracy_avg, 2)
        
        if accuracy_avg > min_accuracy:
            min_accuracy = accuracy_avg
            min_impurity = pos_min_impurity
            
    
    # outer accuracy calculation 
    o_tree = tree.DecisionTreeClassifier(criterion='gini', min_impurity_decrease = min_impurity)
    o_tree.fit(inner_x, inner_y)
    predict_out = o_tree.predict(outer_x)
    o_accurracy = get_acc(outer_y, predict_out)

    outer_accuracy = np.append(outer_accuracy,o_accurracy) # make sure and calculate this_acc in your loop

0.1
Average accuracy over 4 folds is: 0.9416666666666667 2
0.25
Average accuracy over 4 folds is: 0.9416666666666667 2
0.3
Average accuracy over 4 folds is: 0.9416666666666667 2
0.4
Average accuracy over 4 folds is: 0.325 2
0.1
Average accuracy over 4 folds is: 0.9583333333333334 2
0.25
Average accuracy over 4 folds is: 0.875 2
0.3
Average accuracy over 4 folds is: 0.875 2
0.4
Average accuracy over 4 folds is: 0.3416666666666667 2
0.1
Average accuracy over 4 folds is: 0.95 2
0.25
Average accuracy over 4 folds is: 0.95 2
0.3
Average accuracy over 4 folds is: 0.6083333333333333 2
0.4
Average accuracy over 4 folds is: 0.2916666666666667 2
0.1
Average accuracy over 4 folds is: 0.9416666666666667 2
0.25
Average accuracy over 4 folds is: 0.85 2
0.3
Average accuracy over 4 folds is: 0.4833333333333333 2
0.4
Average accuracy over 4 folds is: 0.25 2
0.1
Average accuracy over 4 folds is: 0.9500000000000001 2
0.25
Average accuracy over 4 folds is: 0.6916666666666667 2
0.3
Average accuracy over 4 

## Show the generalized performance of the classifier 


In [17]:
print("Minimum accuracy of outer fold:", min(outer_accuracy))
print("Maximum accuracy of outer fold:", max(outer_accuracy))
print("Mean accuracy of outer fold:", sum(outer_accuracy)/len(outer_accuracy))

Minimum accuracy of outer fold: 0.8666666666666667
Maximum accuracy of outer fold: 0.9666666666666667
Mean accuracy of outer fold: 0.9333333333333333
