In [52]:
import os
import random
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix
from scipy.stats import spearmanr
import itertools
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
import shap 
import seaborn as sns
from sklearn.linear_model import LinearRegression
sns.set_style('darkgrid')

# Define file paths

In [53]:
# Switch between predicting the raw task score (False) and the
# min-max normalised task score (True).
normalize = False

# Input CSV files (relative paths).
MEASURES_FILE = "MdRQA_measures.csv"
SHUFF_MEASURES_FILE = "shuff_MdRQA_measures.csv"
LABELS_FILE = "team_block_outcomes.csv"

# Root folder for all outputs. Alternative root noted by the author: "results_no_RR/"
RESULTS = "results/"

# One output sub-folder per predicted outcome; the task-score folder
# depends on whether the normalised target is used.
TASK_SCORE_RESULTS = RESULTS + ("task_score_norm/" if normalize else "task_score/")
SUBJ_OUTCOME_RESULTS = RESULTS + "subjective_outcome/"
VALENCE_RESULTS = RESULTS + "valence/"

# Load data

In [54]:
# Read the three input tables.
dfMeasures = pd.read_csv(MEASURES_FILE)
dfShuffMeasures = pd.read_csv(SHUFF_MEASURES_FILE)
dfLabels = pd.read_csv(LABELS_FILE)

# Report "<file> shape: (rows, columns)" for each table as a quick sanity check.
for loaded_name, loaded_frame in (
    (MEASURES_FILE, dfMeasures),
    (SHUFF_MEASURES_FILE, dfShuffMeasures),
    (LABELS_FILE, dfLabels),
):
    print("%s shape: %s" % (loaded_name, loaded_frame.shape))


MdRQA_measures.csv shape: (271, 11)
shuff_MdRQA_measures.csv shape: (271, 11)
team_block_outcomes.csv shape: (271, 7)


In [55]:
# Measure columns used as model inputs.
features = [
    'REC', 'DET', 'ADL', 'MDL', 'DENTR',
    'LAM', 'AVL', 'MVL', 'VENTR',
]


def get_group_data(dfData, GROUPID_list, features_list, label_names):
    """Collect the rows of the requested teams and split them into features and labels.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Table containing a ``GROUPID`` column plus the feature and label columns.
    GROUPID_list : iterable
        Team ids to select. Rows are returned team by team in this order (rows of
        one team keep their relative order from ``dfData``); an id listed twice
        contributes its rows twice.
    features_list : list of str
        Columns returned as the feature table.
    label_names : str or list of str
        Column(s) returned as the target (a Series when a single name is given).

    Returns
    -------
    list
        ``[data, labels, dfGroupsData]``: the feature columns, the label column(s)
        and the full selected rows, all sharing a fresh 0..n-1 index.
    """
    group_ids = dfData['GROUPID']
    per_team_rows = [dfData.loc[group_ids == GROUPID, :] for GROUPID in GROUPID_list]

    if per_team_rows:
        # One concat over all slices instead of re-concatenating inside the loop:
        # avoids quadratic copying and never concatenates with an empty seed frame.
        dfGroupsData = pd.concat(per_team_rows, ignore_index=True)
    else:
        # No team requested: return an empty frame that still has every column,
        # so the column selections below work instead of raising KeyError.
        dfGroupsData = dfData.iloc[0:0].reset_index(drop=True)

    data = dfGroupsData.loc[:, features_list]
    labels = dfGroupsData.loc[:, label_names]

    return [data, labels, dfGroupsData]

def min_max_scaling(series):
    """Linearly rescale ``series`` so its minimum maps to 0 and its maximum to 1.

    The result is ``(series - min) / (max - min)``; the division is not guarded,
    so a series whose minimum equals its maximum is divided by zero.

    Reference: https://datagy.io/pandas-normalize-column/
    """
    lowest = series.min()
    value_range = series.max() - lowest
    return (series - lowest) / value_range


# Experiment Set Up

In [56]:
# Experiment switches
#   shuffled : use the shuffled-measures table instead of the real measures
#   chance   : keep the real measures but permute the label columns
#   (if both are True, `shuffled` takes precedence)
shuffled = False
chance = False
num_iters = 25
num_folds = 10
model = "RFR"

# Join measures and outcomes on team and block; only the measures table
# depends on the experiment condition.
dfData = pd.merge(dfShuffMeasures if shuffled else dfMeasures,
                  dfLabels, on=['GROUPID', 'block'], how='inner')

if shuffled:
    # Drop rows with NaN for ADL or AVL
    dfData = dfData.dropna()
elif chance:
    # Shuffle labels: permute the label rows with a fixed seed, then reset the
    # index so the permuted rows are written back by position.
    label_cols = ['CPS_and_ITN_mean', 'Valence', 'num_gold', 'num_silver', 'task_score']
    dfData.loc[:, label_cols] = shuffle(dfData.loc[:, label_cols], random_state=12).reset_index(drop=True)

# Add on binary task score: 0 at or below the median, 1 above it
median = dfData['task_score'].median()
dfData['task_score_bin'] = np.where(dfData['task_score'] <= median, 0, 1)

# Add on normalized task score (min-max scaled)
dfData['norm_task_score'] = min_max_scaling(dfData['task_score'])

print("dfData shape: ", dfData.shape)
display(dfData)


dfData shape:  (271, 18)


Unnamed: 0,GROUPID,block,REC,DET,ADL,MDL,DENTR,LAM,AVL,MVL,VENTR,CPS_and_ITN_mean,Valence,num_gold,num_silver,task_score,task_score_bin,norm_task_score
0,1010,ExpBlock1,0.111890,46.765002,2.580148,12,8.031479,63.476117,3.202765,8,8.276899,-0.304697,3.666667,1,1,3,1,0.200000
1,1010,ExpBlock2,0.221818,51.941830,2.801763,13,8.693110,71.437617,3.567544,14,8.801583,-0.304697,3.666667,2,1,5,1,0.333333
2,10100,ExpBlock1,0.469005,68.603876,3.236574,27,9.502080,83.376781,4.480822,28,9.315484,0.486795,4.000000,0,0,0,0,0.000000
3,10100,ExpBlock2,0.328632,63.319890,3.373809,34,8.995863,78.390610,4.336822,35,8.899755,0.545526,4.666667,0,3,3,1,0.200000
4,10102,ExpBlock1,0.275806,40.010880,2.781595,16,8.650819,60.603101,3.351069,15,8.895810,0.454631,4.333333,3,2,8,1,0.533333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,2070,ExpBlock2,0.128570,35.135661,2.392715,9,7.967704,59.342180,2.886733,9,8.447403,0.302205,4.000000,0,4,4,1,0.266667
267,2070,Warmup,0.411115,49.651775,2.839304,42,9.178902,70.710415,3.726037,43,9.143787,0.489594,4.000000,3,0,6,1,0.400000
268,2071,ExpBlock1,0.291773,45.826191,2.544979,11,8.971522,68.343464,3.244426,12,9.145826,0.032318,2.333333,0,0,0,0,0.000000
269,2071,ExpBlock2,0.188310,46.079278,2.572276,16,8.522221,66.710627,3.169010,17,8.746838,0.117616,4.333333,1,4,6,1,0.400000


## Prep for team-level cross validation

In [57]:
# Names of the test / train partitions of every fold: "Fold1_test", "Fold1_train", ...
test_folds = ["Fold" + str(j) + "_test" for j in range(1, num_folds + 1)]
train_folds = ["Fold" + str(j) + "_train" for j in range(1, num_folds + 1)]

# One {fold name -> team ids} dictionary per iteration
folds_dict_list = []

# Split teams into num_folds groups
# TODO: Need to use dfLabels to match the same folds as the CNN models
teams = pd.unique(dfMeasures.GROUPID)

for i in range(1, num_iters + 1):
    print("Iteration: ", i)
    # Each iteration reshuffles the team order left by the previous iteration,
    # seeded with the iteration number, then cuts it into num_folds groups.
    teams = shuffle(teams, random_state=i)
    groups = np.array_split(teams, num_folds)

    fold_groups = {}
    for j, (train_fold, test_fold) in enumerate(zip(train_folds, test_folds)):
        # Group j is held out for testing ...
        fold_groups[test_fold] = groups[j]
        # ... and every team of the remaining groups is used for training.
        fold_groups[train_fold] = [team
                                   for k, group in enumerate(groups) if k != j
                                   for team in group]

    ## Confirm that for each fold, there is no team overlap between train and test set
    for j in range(1, num_folds + 1):
        held_out = set(fold_groups['Fold' + str(j) + '_test'])
        fitted_on = set(fold_groups['Fold' + str(j) + '_train'])
        assert held_out.isdisjoint(fitted_on), "There is overlap in train and test set " + str(j)

    print("* No team overlap *")

    folds_dict_list.append(fold_groups)


# Informational
print("\nNumber of iterations: ", len(folds_dict_list))

# Second pass over the stored dictionaries, repeating the same disjointness check
print("\nIterating through folds_dict_list to check for overlap...")
for i, dicti in enumerate(folds_dict_list):
    for j in range(1, num_folds + 1):
        held_out = set(dicti['Fold' + str(j) + '_test'])
        fitted_on = set(dicti['Fold' + str(j) + '_train'])
        assert held_out.isdisjoint(fitted_on), "There is overlap in train and test set " + str(j)

print("* No team overlap *")



Iteration:  1
* No team overlap *
Iteration:  2
* No team overlap *
Iteration:  3
* No team overlap *
Iteration:  4
* No team overlap *
Iteration:  5
* No team overlap *
Iteration:  6
* No team overlap *
Iteration:  7
* No team overlap *
Iteration:  8
* No team overlap *
Iteration:  9
* No team overlap *
Iteration:  10
* No team overlap *
Iteration:  11
* No team overlap *
Iteration:  12
* No team overlap *
Iteration:  13
* No team overlap *
Iteration:  14
* No team overlap *
Iteration:  15
* No team overlap *
Iteration:  16
* No team overlap *
Iteration:  17
* No team overlap *
Iteration:  18
* No team overlap *
Iteration:  19
* No team overlap *
Iteration:  20
* No team overlap *
Iteration:  21
* No team overlap *
Iteration:  22
* No team overlap *
Iteration:  23
* No team overlap *
Iteration:  24
* No team overlap *
Iteration:  25
* No team overlap *

Number of iterations:  25

Iterating through folds_dict_list to check for overlap...
* No team overlap *


# Regression

## Predict Task Score

In [58]:
# Resources
## https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

# Column the model is trained on and evaluated against. It is used for the
# true-vs-predicted table as well, so the reported error always refers to the
# same target the model saw.
target_col = 'norm_task_score' if normalize else 'task_score'

# Output sub-folder and file-name tag of the current experiment condition
if shuffled:
    run_dir, run_tag = "RAW_SHUFFLED/", "_SHUFF"
elif chance:
    run_dir, run_tag = "RAW_CHANCE/", "_CHANCE"
else:
    run_dir, run_tag = "RAW/", ""

# Store metrics for all iterations
maes = []    # MAE (mean absolute errors)
mses = []    # MSE (mean squared errors)
rmses = []   # RMSE (root mean squared errors)
corrs = []   # spearman correlations
ps = []      # spearman correlation p-values

all_y_test_task_score = [[] for i in range(num_iters)]
predictions_task_score = [[] for i in range(num_iters)]
# Not filled by this regression cell; kept so the name stays defined.
predict_proba_task_score = [[] for i in range(num_iters)]

#-----------------------------------------------#
#      k-fold team level cross-validation       #
#      (k = num_folds)                          #
#-----------------------------------------------#

# For each iteration
for i in range(num_iters):
    print("Iteration: ", i+1)

    # Cumulative test rows, labels and predictions for this iteration
    test_frames = []
    all_y_test = []
    predictions = []
    imp_list = []

    # Create model for task_score prediction (the same instance is refit on every fold)
    if model == "RFR":
        model_task_score = RandomForestRegressor(n_estimators=100, random_state=1, max_features='sqrt')
    elif model == "SVR":
        model_task_score = SVR()
    elif model == "LR":
        model_task_score = LinearRegression()
    else:
        # Fail loudly instead of silently reusing an estimator left over from an earlier run
        raise ValueError("Unknown model type: %r (expected 'RFR', 'SVR' or 'LR')" % (model,))

    # Get fold groups
    fold_groups = folds_dict_list[i]

    # For each fold
    for j, (test_fold, train_fold) in enumerate(zip(test_folds, train_folds)):
        # Get data for teams in test set
        test_data_list = get_group_data(dfData, fold_groups[test_fold], features, target_col)
        X_test = test_data_list[0]
        y_test = test_data_list[1]
        all_y_test.extend(y_test.tolist())
        test_frames.append(test_data_list[2])

        # Get data for teams in train set
        train_data_list = get_group_data(dfData, fold_groups[train_fold], features, target_col)
        X_train = train_data_list[0]
        y_train = train_data_list[1]

        # Train model
        model_task_score.fit(X_train, y_train)

        # Test model
        y_pred = model_task_score.predict(X_test)
        predictions.extend(y_pred.tolist())

        if model == "RFR":
            # Impurity-based importances of the forest fitted on this fold, plus their
            # spread over the individual trees. `std` / `stds` are not used further in
            # this cell; retained in case other cells read them -- TODO confirm.
            importances = model_task_score.feature_importances_
            std = np.std([tree.feature_importances_ for tree in model_task_score.estimators_], axis=0)
            imps = pd.Series(importances, index=features)
            stds = pd.Series(std, index=features)
            imp_list.append(imps)

    # ----- END OF FOLDS

    # All held-out rows of the iteration, in the order the predictions were produced
    dfFullTest = pd.concat(test_frames, ignore_index=True)

    # Save feature importances for iteration (one row per fold). Only the random
    # forest has them, so nothing is written for the other model types.
    if model == "RFR":
        dfIterFeatureImportances = pd.DataFrame(imp_list, columns=features)
        importances_dir = TASK_SCORE_RESULTS + run_dir + model + "/feature_importances/"
        os.makedirs(importances_dir, exist_ok=True)
        dfIterFeatureImportances.to_csv(importances_dir + model + "_feature_importances" + run_tag + "_" + str(i+1) + ".csv", index=False)

    # Save labels and predictions for iteration
    all_y_test = np.array(all_y_test)
    all_y_test_task_score[i] = all_y_test

    predictions = np.array(predictions)
    predictions_task_score[i] = predictions

    # Metrics pooled over all folds of the iteration
    mae = metrics.mean_absolute_error(all_y_test, predictions)
    mse = metrics.mean_squared_error(all_y_test, predictions)
    rmse = np.sqrt(mse)
    corr, p = spearmanr(all_y_test, predictions)
    maes.append(mae)
    mses.append(mse)
    rmses.append(rmse)
    corrs.append(corr)
    ps.append(p)

    # Actual labels vs. predictions for iteration. `.copy()` makes the table
    # independent of dfFullTest before new columns are attached.
    dfTruevPred = dfFullTest.loc[:, ['GROUPID', 'block', target_col]].copy()
    dfTruevPred['prediction'] = predictions
    dfTruevPred = dfTruevPred.sort_values(['GROUPID', 'block', target_col], ignore_index=True)
    dfTruevPred['error'] = (dfTruevPred['prediction'] - dfTruevPred[target_col]).abs()

    # Saving of the per-iteration table is currently switched off:
#     dfTruevPred.to_csv(TASK_SCORE_RESULTS + run_dir + model + "/" + model + "_TaskScore_True_vs_Pred" + run_tag + "_" + str(i+1) + ".csv", index=False)


# ----- END OF ITERATIONS

print("\n =========== ALL ITERATIONS RESULTS SUMMARY ===========")
dfMetrics = pd.DataFrame({'iteration': list(range(1, num_iters+1)),
                          'mae': maes, 'mse': mses, 'rmse': rmses, 'corrs': corrs, 'ps': ps})

display(dfMetrics)

# Saving of the metrics table is currently switched off:
# dfMetrics.to_csv(TASK_SCORE_RESULTS + run_dir + model + "/" + model + "_TaskScore_Metrics" + run_tag + ".csv", index=False)


print("Average over all iterations:")
print("%6s %.2f" % ("MAE:", np.mean(dfMetrics['mae'])))
print("%6s %.2f" % ("MSE:", np.mean(dfMetrics['mse'])))
print("%6s %.2f" % ("RMSE:", np.mean(dfMetrics['rmse'])))
print("%6s %.2f" % ("Corr:", np.mean(dfMetrics['corrs'])))
print("%6s %.7f" % ("p-val:", np.mean(dfMetrics['ps'])))

# Get median iteration: index of the iteration whose correlation sits in the
# middle of the sorted correlations
med_corr_idx = np.argsort(corrs)[len(corrs)//2]
print("\nMedian iteration number: ", med_corr_idx+1)



Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25



Unnamed: 0,iteration,mae,mse,rmse,corrs,ps
0,1,2.579742,11.066096,3.326574,0.334312,1.689014e-08
1,2,2.614945,11.344875,3.368215,0.324717,4.506974e-08
2,3,2.625166,11.252868,3.354529,0.320615,6.786916e-08
3,4,2.558339,10.993112,3.315586,0.347191,4.285814e-09
4,5,2.603321,11.406511,3.377353,0.343481,6.40351e-09
5,6,2.621476,11.433815,3.381393,0.320041,7.183319e-08
6,7,2.582214,11.129983,3.336163,0.346721,4.510882e-09
7,8,2.626494,11.6977,3.42019,0.310086,1.887857e-07
8,9,2.617085,11.408055,3.377581,0.326068,3.933127e-08
9,10,2.611587,11.476111,3.387641,0.31292,1.439053e-07


Average over all iterations:
  MAE: 2.61
  MSE: 11.29
 RMSE: 3.36
 Corr: 0.33
p-val: 0.0000001

Median iteration number:  16


## (LR ONLY) Get linear regression coefficients for the median model

In [35]:
# TODO
# Linear-regression coefficients of the median fold of the median iteration.
# Everything lives inside the `model == "LR"` branch because `coef_` is read from
# a LinearRegression fit; for other model types the cell only prints a notice.
if model == "LR":

    # Median iteration chosen by the task-score cell. It is copied into its own
    # name and `med_corr_idx` is never reassigned here, so re-running this cell
    # always starts from the same iteration.
    median_iter_idx = med_corr_idx
    print("Median iteration number: ", median_iter_idx+1)

    # Same target column and output naming as the task-score cell
    target_col = 'norm_task_score' if normalize else 'task_score'
    if shuffled:
        run_dir, run_tag = "RAW_SHUFFLED/", "_SHUFF"
    elif chance:
        run_dir, run_tag = "RAW_CHANCE/", "_CHANCE"
    else:
        run_dir, run_tag = "RAW/", ""

    # Per-fold metrics of the median iteration (kept separate from the
    # per-iteration lists `maes`, `mses`, ... of the task-score cell)
    fold_maes = []    # MAE (mean absolute errors)
    fold_mses = []    # MSE (mean squared errors)
    fold_rmses = []   # RMSE (root mean squared errors)
    fold_corrs = []   # spearman correlations
    fold_ps = []      # spearman correlation p-values

    # Get fold groups
    fold_groups = folds_dict_list[median_iter_idx]

    # Fresh estimator so the cell does not depend on whatever model object an
    # earlier cell left behind
    model_task_score = LinearRegression()

    # For each fold
    for j, (test_fold, train_fold) in enumerate(zip(test_folds, train_folds)):
        print("\tFold: ", j+1)
        # Get data for teams in test set
        test_data_list = get_group_data(dfData, fold_groups[test_fold], features, target_col)
        X_test = test_data_list[0]
        y_test = test_data_list[1]

        # Get data for teams in train set
        train_data_list = get_group_data(dfData, fold_groups[train_fold], features, target_col)
        X_train = train_data_list[0]
        y_train = train_data_list[1]

        # Train model
        model_task_score.fit(X_train, y_train)

        # Test model
        y_pred = model_task_score.predict(X_test)

        # Calculate evaluation metrics of the fold
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        corr, p = spearmanr(y_test, y_pred)
        fold_maes.append(mae)
        fold_mses.append(mse)
        fold_rmses.append(rmse)
        fold_corrs.append(corr)
        fold_ps.append(p)

    dfFoldMetrics = pd.DataFrame({'fold': list(range(1, len(test_folds)+1)),
                                  'mae': fold_maes, 'mse': fold_mses, 'rmse': fold_rmses,
                                  'corrs': fold_corrs, 'ps': fold_ps})

    display(dfFoldMetrics)

    print("Average over all folds:")
    print("%6s %.2f" % ("MAE:", np.mean(dfFoldMetrics['mae'])))
    print("%6s %.2f" % ("MSE:", np.mean(dfFoldMetrics['mse'])))
    print("%6s %.2f" % ("RMSE:", np.mean(dfFoldMetrics['rmse'])))
    print("%6s %.2f" % ("Corr:", np.mean(dfFoldMetrics['corrs'])))
    print("%6s %.7f" % ("p-val:", np.mean(dfFoldMetrics['ps'])))

    # Get median fold: the fold whose correlation sits in the middle of the sorted list
    fold_idx = np.argsort(fold_corrs)[len(fold_corrs)//2]
    print("\nMedian fold number: ", fold_idx+1)

    print("\n\n***** Now running median fold ......")

    test_fold = test_folds[fold_idx]
    train_fold = train_folds[fold_idx]

    print("\tFold: ", fold_idx+1)

    # Get data for teams in test set
    test_data_list = get_group_data(dfData, fold_groups[test_fold], features, target_col)
    X_test = test_data_list[0]
    y_test = test_data_list[1]

    # Get data for teams in train set
    train_data_list = get_group_data(dfData, fold_groups[train_fold], features, target_col)
    X_train = train_data_list[0]
    y_train = train_data_list[1]

    # Train model on the median fold and read one coefficient per feature
    model_task_score.fit(X_train, y_train)

    coeff_df = pd.DataFrame(model_task_score.coef_, X_train.columns, columns=['Coefficient'])
    display(coeff_df)

    # Save coefficients (the index, i.e. the feature names, is written too)
    coeffs_dir = TASK_SCORE_RESULTS + run_dir + model + "/"
    os.makedirs(coeffs_dir, exist_ok=True)
    coeff_df.to_csv(coeffs_dir + model + "_Median_iter_fold_LR_coeffs" + run_tag + ".csv")

else:
    print("Skipping: coefficients are only computed when model == \"LR\" (current model: %s)" % (model,))

        
        

Median iteration number:  6
	Fold:  1
	Fold:  2
	Fold:  3
	Fold:  4
	Fold:  5
	Fold:  6
	Fold:  7
	Fold:  8
	Fold:  9
	Fold:  10


Unnamed: 0,fold,mae,mse,rmse,corrs,ps
0,1,2.916833,12.460025,3.529876,-0.122763,0.525802
1,2,2.713845,11.907378,3.450707,-0.044513,0.818653
2,3,3.00078,12.818883,3.580347,0.034082,0.858101
3,4,3.057397,13.492748,3.673248,0.135815,0.499382
4,5,2.231021,6.437665,2.537255,-0.080026,0.697564
5,6,2.869123,13.556508,3.681916,0.348407,0.081108
6,7,2.412263,8.390828,2.896693,-0.203891,0.328281
7,8,3.229915,18.392158,4.288608,-0.039042,0.846692
8,9,2.586288,10.148909,3.185735,-0.032661,0.876832
9,10,3.242146,14.438409,3.799791,0.133655,0.506293


Average over all folds:
  MAE: 2.83
  MSE: 12.20
 RMSE: 3.46
 Corr: 0.01
p-val: 0.6038708

Median fold number:  9


***** Now running median fold ......
	Fold:  9


Unnamed: 0,Coefficient
DET,0.122843
ADL,2.052089
MDL,-0.573609
DENTR,8.340108
LAM,-0.259254
AVL,-2.46483
MVL,0.556438
VENTR,-7.613451


## Predict Subjective Outcome (combined CPS and ITN score)

In [57]:
# Resources
## https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

# Target column of this cell
target_col = 'CPS_and_ITN_mean'

# Output folder and file-name tag of the current experiment condition
if shuffled:
    run_dir, run_tag = "RAW_SHUFFLED/", "_SHUFF"
elif chance:
    run_dir, run_tag = "RAW_CHANCE/", "_CHANCE"
else:
    run_dir, run_tag = "RAW/", ""
subj_out_dir = SUBJ_OUTCOME_RESULTS + run_dir + model + "/"
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(subj_out_dir, exist_ok=True)

# Store metrics for all iterations
maes = []    # MAE (mean absolute errors)
mses = []    # MSE (mean squared errors)
rmses = []   # RMSE (root mean squared errors)
corrs = []   # spearman correlations
ps = []      # spearman correlation p-values

all_y_test_subj_out = [[] for i in range(num_iters)]
predictions_subj_out = [[] for i in range(num_iters)]

# Placeholders for SHAP values across all folds; not populated in this cell
subj_out_shap_values_0 = None
subj_out_shap_values_1 = None
subj_out_full_X_test = pd.DataFrame()

#-----------------------------------------------#
#      k-fold team level cross-validation       #
#      (k = num_folds)                          #
#-----------------------------------------------#

# For each iteration
for i in range(num_iters):
    print("Iteration: ", i+1)

    # Cumulative test rows, labels and predictions for this iteration
    test_frames = []
    all_y_test = []
    predictions = []

    # Create model for subjective outcome prediction (the same instance is refit on every fold)
    if model == "RFR":
        model_subj_out = RandomForestRegressor(n_estimators=100, random_state=1,
                                               max_features='sqrt')
    elif model == "SVR":
        model_subj_out = SVR()
    elif model == "LR":
        model_subj_out = LinearRegression()
    else:
        # Fail loudly instead of silently reusing an estimator left over from an earlier run
        raise ValueError("Unknown model type: %r (expected 'RFR', 'SVR' or 'LR')" % (model,))

    # Get fold groups
    fold_groups = folds_dict_list[i]

    # For each fold
    for j, (test_fold, train_fold) in enumerate(zip(test_folds, train_folds)):
        # Get data for teams in test set
        test_data_list = get_group_data(dfData, fold_groups[test_fold], features, target_col)
        X_test = test_data_list[0]
        y_test = test_data_list[1]
        all_y_test.extend(y_test.tolist())
        test_frames.append(test_data_list[2])

        # Get data for teams in train set
        train_data_list = get_group_data(dfData, fold_groups[train_fold], features, target_col)
        X_train = train_data_list[0]
        y_train = train_data_list[1]

        # Train model
        model_subj_out.fit(X_train, y_train)

        # Test model
        y_pred = model_subj_out.predict(X_test)
        predictions.extend(y_pred.tolist())

    # ----- END OF FOLDS

    # All held-out rows of the iteration, in the order the predictions were produced
    dfFullTest = pd.concat(test_frames, ignore_index=True)

    # Metrics pooled over all folds of the iteration
    predictions = np.array(predictions)
    all_y_test = np.array(all_y_test)
    mae = metrics.mean_absolute_error(all_y_test, predictions)
    mse = metrics.mean_squared_error(all_y_test, predictions)
    rmse = np.sqrt(mse)
    corr, p = spearmanr(all_y_test, predictions)
    maes.append(mae)
    mses.append(mse)
    rmses.append(rmse)
    corrs.append(corr)
    ps.append(p)

    # Save iteration subjective outcome truth labels, and predictions for stats across all iterations
    all_y_test_subj_out[i] = all_y_test
    predictions_subj_out[i] = predictions

    # Actual labels vs. predictions for iteration. `.copy()` makes the table
    # independent of dfFullTest before new columns are attached.
    dfTruevPred = dfFullTest.loc[:, ['GROUPID', 'block', target_col]].copy()
    dfTruevPred['prediction'] = predictions
    dfTruevPred = dfTruevPred.sort_values(['GROUPID', 'block', target_col], ignore_index=True)
    dfTruevPred['error'] = (dfTruevPred['prediction'] - dfTruevPred[target_col]).abs()

    dfTruevPred.to_csv(subj_out_dir + model + "_SubjOut_True_vs_Pred" + run_tag + "_" + str(i+1) + ".csv", index=False)


# ----- END OF ITERATIONS

print("\n =========== ALL ITERATIONS RESULTS SUMMARY ===========")
dfMetrics = pd.DataFrame({'iteration': list(range(1, num_iters+1)),
                          'mae': maes, 'mse': mses, 'rmse': rmses, 'corrs': corrs, 'ps': ps})
display(dfMetrics)

dfMetrics.to_csv(subj_out_dir + model + "_SubjOut_Metrics" + run_tag + ".csv", index=False)

print("Average over all iterations:")
print("%6s %.2f" % ("MAE:", np.mean(dfMetrics['mae'])))
print("%6s %.2f" % ("MSE:", np.mean(dfMetrics['mse'])))
print("%6s %.2f" % ("RMSE:", np.mean(dfMetrics['rmse'])))
print("%6s %.2f" % ("Corr:", np.mean(dfMetrics['corrs'])))
print("%6s %.7f" % ("p-val:", np.mean(dfMetrics['ps'])))



Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25



Unnamed: 0,iteration,mae,mse,rmse,corrs,ps
0,1,0.430072,0.30122,0.548835,0.033603,0.581785
1,2,0.427457,0.29498,0.543121,0.081537,0.180802
2,3,0.438217,0.30954,0.556364,0.013585,0.823838
3,4,0.432567,0.298368,0.54623,0.005599,0.926906
4,5,0.432642,0.304228,0.551569,0.021564,0.723799
5,6,0.431831,0.302675,0.550159,0.058414,0.338067
6,7,0.434939,0.305555,0.55277,0.02187,0.720035
7,8,0.434984,0.30449,0.551806,0.020602,0.735655
8,9,0.432846,0.303944,0.551311,0.025261,0.67888
9,10,0.429685,0.294212,0.542413,0.061532,0.312869


Average over all iterations:
  MAE: 0.43
  MSE: 0.30
 RMSE: 0.55
 Corr: 0.02
p-val: 0.6681020


## Predict Valence

In [63]:
# Resources
## https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

# Store metrics for all iterations
# aurocs = []
maes = []    # MAE (mean absolute errors)
mses = []    # MSE (mean squared errors)
rmses = []   # RMSE (root mean squared errors)
corrs = []   # spearman correlations
ps = []      # spearman correlation p-values

all_y_test_valence = [[] for i in range(num_iters)]
predictions_valence = [[] for i in range(num_iters)]

# For storing shap values across all folds
valence_shap_values_0 = None
valence_shap_values_1 = None
valence_full_X_test = pd.DataFrame()

#-----------------------------------------------#
#      k-fold team level cross-validation       #
#-----------------------------------------------#

# For each iteration 
for i in range(num_iters):
    print("Iteration: ", i+1)
    
    # Lists for cumulative test set and predictions for iteration
    dfFullTest = pd.DataFrame()
    all_y_test = []
    predictions = []
    
    # Create model for task_score prediction
    if model == "RFR":
        model_valence = RandomForestRegressor(n_estimators=100, random_state=1, \
                                           max_features='sqrt') 
    elif model == "SVR":
        model_valence = SVR()
        
    elif model == "LR":
        model_valence = LinearRegression()
    
    # Get fold groups
    fold_groups = folds_dict_list[i]
    
    # For each fold
    for j, (test_fold, train_fold) in enumerate(zip(test_folds, train_folds)):
#         print("\tFold: ", j+1)
        # Get data for teams in test set
        test_data_list = get_group_data(dfData, fold_groups[test_fold], features, 'Valence')
        X_test = test_data_list[0]
        y_test = test_data_list[1]
        all_y_test.extend(y_test.tolist())
        dfFullTest = pd.concat([dfFullTest, test_data_list[2]], ignore_index=True)
        
        # Get data for teams in train set
        train_data_list = get_group_data(dfData, fold_groups[train_fold], features, 'Valence')
        X_train = train_data_list[0]
        y_train = train_data_list[1]
        

        # Train model
        model_valence.fit(X_train, y_train)

        # Test model
        y_pred = model_valence.predict(X_test)
        predictions.extend(y_pred.tolist())
        
#         ## Get SHAP values
#         explainer = shap.TreeExplainer(model_valencerfc_valence)
#         shap_values = explainer.shap_values(X_test)
        
#         if j==0:
#             valence_shap_values_0 = shap_values[0]
#             valence_shap_values_1 = shap_values[1]
#         else:
#             valence_shap_values_0 = np.vstack([valence_shap_values_0, shap_values[0]])
#             valence_shap_values_1 = np.vstack([valence_shap_values_1, shap_values[1]])
#         valence_full_X_test = pd.concat([valence_full_X_test, X_test], ignore_index=True)
#         ## End of get SHAP values

# ----- END OF FOLDS

    # Calculate the absolute errors (MAE) of the iteration
    predictions = np.array(predictions)
    all_y_test = np.array(all_y_test)
    mae = metrics.mean_absolute_error(all_y_test, predictions)
    mse = metrics.mean_squared_error(all_y_test, predictions)
    rmse = np.sqrt(metrics.mean_squared_error(all_y_test, predictions))
    corr, p = spearmanr(all_y_test, predictions)
    maes.append(mae)
    mses.append(mse)
    rmses.append(rmse)
    corrs.append(corr)
    ps.append(p)
    
    # Save iteration valence truth labels, and predictions for stats across all iterations
    all_y_test_subj_out[i] = all_y_test
    predictions_subj_out[i] = predictions

    # Save actual labels and predictions for iteration
    dfTruevPred = dfFullTest.loc[:, ['GROUPID', 'block', 'Valence']]
    dfTruevPred['prediction'] = predictions
    dfTruevPred.sort_values(['GROUPID', 'block', 'Valence'], ignore_index=True, inplace=True)
    dfTruevPred['error'] = abs(dfTruevPred['prediction'] - dfTruevPred['Valence'])
#     print("MAE from df: ", round(np.mean(dfTruevPred['error']), 2))
    
    if shuffled:
        dfTruevPred.to_csv(VALENCE_RESULTS + "RAW_SHUFFLED/" + model + "/" + model + "_Valence_True_vs_Pred_SHUFF_" + str(i+1) + ".csv", index=False)
    elif chance:
        dfTruevPred.to_csv(VALENCE_RESULTS + "RAW_CHANCE/" + model + "/" + model + "_Valence_True_vs_Pred_CHANCE_" + str(i+1) + ".csv", index=False)
    else:
        dfTruevPred.to_csv(VALENCE_RESULTS + "RAW/" + model + "/" + model + "_Valence_True_vs_Pred_" + str(i+1) + ".csv", index=False)
    
#     # FOR PLOTTING ACTUAL vs. PREDICTED   
#     dfTruevPred.plot(y=['Valence', 'prediction'], title='Actual vs. Predicted Valence', \
#                      style=['b-', 'ro'], figsize=(20, 5))
#     # END PLOTTING
    
#     if model == "RFR":
#         ### Compute Feature Importances for last fold iteration ###
#         # Impurity-based importances
#         importances = model_valence.feature_importances_
#         std = np.std([tree.feature_importances_ for tree in model_valence.estimators_], axis=0)
#         imps = pd.Series(importances, index=features)
#         stds = pd.Series(std, index=features)
#         ## Plot feature importances
#         fig1, ax1 = plt.subplots()
#         imps.plot.bar(yerr=std, ax=ax1)
#         ax1.set_title("Valence Feature importances using MDI")
#         ax1.set_ylabel("Mean decrease in impurity")
#         plt.show()
#         ## END plotting feature importances
    
    
# ----- END OF ITERATIONS

# Gather the per-iteration regression metrics into one table and show it.
print("\n =========== ALL ITERATIONS RESULTS SUMMARY ===========")
dfMetrics = pd.DataFrame({
    'iteration': list(range(1, num_iters + 1)),
    'mae': maes,
    'mse': mses,
    'rmse': rmses,
    'corrs': corrs,
    'ps': ps,
})
display(dfMetrics)

# Pick the destination that matches the run type, then write the table once.
if shuffled:
    metrics_csv = VALENCE_RESULTS + "RAW_SHUFFLED/" + model + "/" + model + "_Valence_Metrics_SHUFF.csv"
elif chance:
    metrics_csv = VALENCE_RESULTS + "RAW_CHANCE/" + model + "/" + model + "_Valence_Metrics_CHANCE.csv"
else:
    metrics_csv = VALENCE_RESULTS + "RAW/" + model + "/" + model + "_Valence_Metrics.csv"
dfMetrics.to_csv(metrics_csv, index=False)

# Mean of every metric column across iterations (p-values get more decimals).
print("Average over all iterations:")
for stat_label, stat_column, stat_format in (
    ("MAE:", 'mae', "%6s %.2f"),
    ("MSE:", 'mse', "%6s %.2f"),
    ("RMSE:", 'rmse', "%6s %.2f"),
    ("Corr:", 'corrs', "%6s %.2f"),
    ("p-val:", 'ps', "%6s %.7f"),
):
    print(stat_format % (stat_label, np.mean(dfMetrics[stat_column])))



Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25



Unnamed: 0,iteration,mae,mse,rmse,corrs,ps
0,1,0.579971,0.505046,0.710666,0.173524,0.004168
1,2,0.577333,0.502446,0.708834,0.172725,0.004348
2,3,0.569325,0.488178,0.698698,0.201573,0.000846
3,4,0.576357,0.4997,0.706894,0.177389,0.00339
4,5,0.574005,0.497723,0.705495,0.177634,0.003345
5,6,0.574468,0.502187,0.708652,0.163698,0.006921
6,7,0.576187,0.498469,0.706023,0.178442,0.003202
7,8,0.577076,0.503183,0.709354,0.151913,0.012287
8,9,0.573536,0.499623,0.70684,0.179149,0.003081
9,10,0.582156,0.511069,0.714891,0.149972,0.013457


Average over all iterations:
  MAE: 0.58
  MSE: 0.50
 RMSE: 0.71
 Corr: 0.17
p-val: 0.0062404


## without cross-validation

In [None]:
features = ['REC', 'DET', 'ADL', 'MDL', 'DENTR', 'LAM', 'AVL', 'MVL', 'VENTR']

# Feature matrix and continuous target taken straight from dfData
X = dfData[features]
y = dfData['task_score']

# Single random 80/20 row-level split (no team-level folds in this cell)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for split_label, split_target in (("y_train: ", y_train), ("y_test: ", y_test)):
    print(split_label, split_target.shape)

# Fit on the training rows and predict the held-out rows
model_task_score = RandomForestRegressor(n_estimators=100, random_state=1, max_features='sqrt')
model_task_score.fit(X_train, y_train)
y_pred = model_task_score.predict(X_test)

# Error metrics on the held-out rows; RMSE is the square root of the MSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

for metric_label, metric_value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse)):
    print("%5s %.2f" % (metric_label, metric_value))






# Classification

## Predict Task Score 

### Binary

In [10]:
# Resources
## https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

# Per-iteration metric lists; one value is appended per CV iteration
aurocs, precision, recall = [], [], []

# One slot per iteration for the pooled truth labels, hard predictions
# and class-1 probabilities (each slot is overwritten inside the loop)
all_y_test_task_score = [[] for _ in range(num_iters)]
predictions_task_score = [[] for _ in range(num_iters)]
predict_proba_task_score = [[] for _ in range(num_iters)]

# SHAP accumulators, referenced by the commented-out SHAP snippet in the fold loop
task_score_shap_values_0 = None
task_score_shap_values_1 = None
task_score_full_X_test = pd.DataFrame()

#-----------------------------------------------#
#      5-fold team level cross-validation       #
#-----------------------------------------------#

# Repeated team-level cross-validation for the binary task-score label.
#
# Only the random-forest classifier is wired up for this task. Fail fast for any
# other `model` value: otherwise `model_task_score` would be undefined or, worse,
# a stale estimator left in the kernel by another cell would be reused and its
# results written under the wrong model name.
if model != "RFC":
    raise ValueError("Binary task-score classification only supports model == 'RFC', got %r" % (model,))

# For each iteration
for i in range(num_iters):
    print("Iteration: ", i+1)

    # Test rows, labels and predictions pooled over all folds of this iteration
    fold_test_frames = []
    all_y_test = []
    predictions = []
    predict_proba = []

    # Create model for task_score prediction (re-fitted from scratch on every fold)
    model_task_score = RandomForestClassifier(n_estimators=100, random_state=1, max_features='sqrt')

    # Get fold groups (fold name -> team ids) for this iteration
    fold_groups = folds_dict_list[i]

    # For each fold
    for j, (test_fold, train_fold) in enumerate(zip(test_folds, train_folds)):
#         print("\tFold: ", j+1)
        # Get data for teams in test set; the result is indexed as
        # [0] feature matrix, [1] labels, [2] rows kept for the true-vs-pred table
        test_data_list = get_group_data(dfData, fold_groups[test_fold], features, 'task_score_bin')
        X_test = test_data_list[0]
        y_test = test_data_list[1]
        all_y_test.extend(y_test.tolist())
        fold_test_frames.append(test_data_list[2])

        # Get data for teams in train set
        train_data_list = get_group_data(dfData, fold_groups[train_fold], features, 'task_score_bin')
        X_train = train_data_list[0]
        y_train = train_data_list[1]

        # Train model
        model_task_score.fit(X_train, y_train)

        # Test model: hard labels plus the probability of the second class
        y_pred = model_task_score.predict(X_test)
        predictions.extend(y_pred.tolist())

        y_pp = model_task_score.predict_proba(X_test)[:, 1]
        predict_proba.extend(y_pp.tolist())

#         ## Get SHAP values
#         explainer = shap.TreeExplainer(model_task_score)
#         shap_values = explainer.shap_values(X_test)

#         if j==0:
#             task_score_shap_values_0 = shap_values[0]
#             task_score_shap_values_1 = shap_values[1]
#         else:
#             task_score_shap_values_0 = np.vstack([task_score_shap_values_0, shap_values[0]])
#             task_score_shap_values_1 = np.vstack([task_score_shap_values_1, shap_values[1]])
#         task_score_full_X_test = pd.concat([task_score_full_X_test, X_test], ignore_index=True)
#         ## End of get SHAP values

# ----- END OF FOLDS

    # Build the pooled test frame once instead of growing it fold by fold
    dfFullTest = pd.concat(fold_test_frames, ignore_index=True)

    all_y_test = np.array(all_y_test)
    predictions = np.array(predictions)
    predict_proba = np.array(predict_proba)

    # Keep this iteration's pooled labels/predictions for the cross-iteration summary
    all_y_test_task_score[i] = all_y_test
    predictions_task_score[i] = predictions
    predict_proba_task_score[i] = predict_proba

    # Metrics of the iteration: AUROC from probabilities, precision/recall from hard labels
    auroc = roc_auc_score(all_y_test, predict_proba)
    prec = precision_score(all_y_test, predictions)
    rec = recall_score(all_y_test, predictions)

    aurocs.append(auroc)
    precision.append(prec)
    recall.append(rec)

    # Save actual labels and predictions for iteration.
    # .copy() makes the slice an independent frame so adding columns cannot raise
    # SettingWithCopyWarning; predictions line up positionally with dfFullTest rows,
    # so they must be attached before sorting.
    dfTruevPred = dfFullTest.loc[:, ['GROUPID', 'block', 'task_score', 'task_score_bin']].copy()
    dfTruevPred['prediction'] = predictions
    dfTruevPred['predict_proba'] = predict_proba
    dfTruevPred = dfTruevPred.sort_values(['GROUPID', 'block', 'task_score'], ignore_index=True)

    # FOR PLOTTING ACTUAL vs. PREDICTED: https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea
#     cf_matrix = confusion_matrix(dfTruevPred['task_score_bin'], dfTruevPred['prediction'])
#     sns.heatmap(cf_matrix, annot=True)
    # END PLOTTING

    # Destination depends on the run type (shuffled / chance / real data)
    if shuffled:
        true_v_pred_csv = TASK_SCORE_RESULTS + "RAW_SHUFFLED/" + model + "/" + model + "_TaskScore_True_vs_Pred_SHUFF_" + str(i+1) + ".csv"
    elif chance:
        true_v_pred_csv = TASK_SCORE_RESULTS + "RAW_CHANCE/" + model + "/" + model + "_TaskScore_True_vs_Pred_CHANCE_" + str(i+1) + ".csv"
    else:
        true_v_pred_csv = TASK_SCORE_RESULTS + "RAW/" + model + "/" + model + "_TaskScore_True_vs_Pred_" + str(i+1) + ".csv"
    dfTruevPred.to_csv(true_v_pred_csv, index=False)

#     if (model == "RFR") or (model == "RFC"):
#         ### Compute Feature Importances for last fold iteration ###
#         # Impurity-based importances
#         importances = model_task_score.feature_importances_
#         std = np.std([tree.feature_importances_ for tree in model_task_score.estimators_], axis=0)
#         imps = pd.Series(importances, index=features)
#         stds = pd.Series(std, index=features)
#         ## Plot feature importances
#         fig1, ax1 = plt.subplots()
#         imps.plot.bar(yerr=std, ax=ax1)
#         ax1.set_title("Task Score Feature importances using MDI")
#         ax1.set_ylabel("Mean decrease in impurity")
#         plt.show()
#         ## END plotting feature importances


# ----- END OF ITERATIONS

# Collect the per-iteration classification metrics into one table and show it.
print("\n =========== ALL ITERATIONS RESULTS SUMMARY ===========")
dfMetrics = pd.DataFrame({
    'iteration': list(range(1, num_iters + 1)),
    'auroc': aurocs,
    'precision': precision,
    'recall': recall,
})

display(dfMetrics)

# Choose the output file for this run type, then write the table once.
if shuffled:
    metrics_csv = TASK_SCORE_RESULTS + "RAW_SHUFFLED/" + model + "/" + model + "_TaskScore_Metrics_SHUFF.csv"
elif chance:
    metrics_csv = TASK_SCORE_RESULTS + "RAW_CHANCE/" + model + "/" + model + "_TaskScore_Metrics_CHANCE.csv"
else:
    metrics_csv = TASK_SCORE_RESULTS + "RAW/" + model + "/" + model + "_TaskScore_Metrics.csv"
dfMetrics.to_csv(metrics_csv, index=False)

print("Averages: ")
for stat_label, stat_column in (("AUROC:", 'auroc'), ("Precision:", 'precision'), ("Recall:", 'recall')):
    print("%12s %.2f" % (stat_label, np.mean(dfMetrics[stat_column])))


print("%6s %.2f" % ("Med AUROC:", np.median(dfMetrics['auroc'])))

# Get median iterations: argsort()[n // 2] selects an actual iteration
# (the upper of the two middle ones when the number of iterations is even)
med_auroc = np.median(aurocs)
auroc_order = np.argsort(aurocs)
med_auroc_idx = auroc_order[len(aurocs)//2]

# Plot confusion matrix of median iteration
labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Zero', 'One']
cf_matrix = confusion_matrix(all_y_test_task_score[med_auroc_idx], predictions_task_score[med_auroc_idx])
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')


    

Iteration:  1
Y_PP:  [[0.35 0.65]
 [0.65 0.35]
 [0.88 0.12]
 [0.44 0.56]
 [0.21 0.79]
 [0.98 0.02]
 [0.23 0.77]
 [0.54 0.46]
 [0.73 0.27]
 [0.37 0.63]
 [0.67 0.33]
 [0.83 0.17]
 [0.35 0.65]
 [0.19 0.81]
 [0.69 0.31]
 [0.36 0.64]
 [0.43 0.57]
 [0.21 0.79]
 [0.45 0.55]
 [0.79 0.21]
 [0.57 0.43]
 [0.1  0.9 ]
 [0.43 0.57]
 [0.95 0.05]
 [0.34 0.66]
 [0.26 0.74]
 [0.47 0.53]
 [0.3  0.7 ]
 [0.53 0.47]
 [0.11 0.89]]
Y_PP:  [[0.36 0.64]
 [0.48 0.52]
 [0.59 0.41]
 [0.49 0.51]
 [0.91 0.09]
 [0.3  0.7 ]
 [1.   0.  ]
 [0.11 0.89]
 [0.97 0.03]
 [0.47 0.53]
 [0.73 0.27]
 [0.49 0.51]
 [0.92 0.08]
 [0.93 0.07]
 [0.42 0.58]
 [0.98 0.02]
 [0.89 0.11]
 [0.76 0.24]
 [0.45 0.55]
 [0.34 0.66]
 [0.66 0.34]
 [0.91 0.09]
 [0.63 0.37]
 [0.83 0.17]
 [0.44 0.56]
 [0.83 0.17]
 [0.36 0.64]
 [0.18 0.82]
 [0.76 0.24]]
Y_PP:  [[0.84 0.16]
 [0.87 0.13]
 [0.49 0.51]
 [0.52 0.48]
 [0.24 0.76]
 [0.58 0.42]
 [0.93 0.07]
 [0.45 0.55]
 [0.53 0.47]
 [0.52 0.48]
 [0.37 0.63]
 [0.52 0.48]
 [0.76 0.24]
 [0.51 0.49]
 [0.3  0.7 ]
 

ValueError: y should be a 1d array, got an array of shape (271, 2) instead.

In [15]:
# Locate the iteration whose AUROC sits in the middle of the sorted AUROCs
med_auroc = np.median(aurocs)
auroc_order = np.argsort(aurocs)
med_auroc_idx = auroc_order[len(aurocs)//2]

# Sanity check: pooled labels and predictions of that iteration must have matching shapes
for pooled_per_iteration in (all_y_test_task_score, predictions_task_score):
    print(pooled_per_iteration[med_auroc_idx].shape)

# # Plot confusion matrix of median iteration
# cf_matrix = confusion_matrix(all_y_test[med_auroc_idx], predictions_task_score[med_auroc_idx])
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(cf_matrix, 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')



(271,)
(271,)


### OHE labels

### Categorical

In [None]:
features = ['REC', 'DET', 'ADL', 'MDL', 'DENTR', 'LAM', 'AVL', 'MVL', 'VENTR']

# Feature matrix and categorical task-score label taken straight from dfData
X = dfData[features]
y = dfData['task_score_cat']

# y_bin = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])

# Single random 80/20 row-level split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for split_label, split_target in (("y_train: ", y_train), ("y_test: ", y_test)):
    print(split_label, split_target.shape)

# Class-weighted forest so that rare categories are not ignored during training
forest_settings = dict(n_estimators=100, random_state=1,
                       max_features='sqrt', class_weight='balanced')
model_task_score = RandomForestClassifier(**forest_settings)

# Train model, then predict the held-out rows
model_task_score.fit(X_train, y_train)
y_pred = model_task_score.predict(X_test)

# display(pd.DataFrame({"y_test": y_test, "y_pred": y_pred}))

print(classification_report(y_test, y_pred, zero_division=1))

# Heatmap of the multi-class confusion matrix
cf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix, annot=True)


# #roc auc score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
# y_pp = model_task_score.predict_proba(X_test) 
# print("\ny_train info")
# print("total len: ", len(y_train))
# print("unique vals: ", len(set(y_train)))
# print("\ny_test info")
# print("total len: ", len(y_test))
# print("unique vals: ", len(set(y_test)))
# print("\ny_pp info")
# print("y_pp shape: ", y_pp.shape)

# auroc = roc_auc_score(y_test, y_pp, multi_class='ovo', average='weighted')
# print(auroc)



# Hyperparameter Tuning

## Random Search CV

### Classification

In [None]:
features = ['REC', 'DET', 'ADL', 'MDL', 'DENTR', 'LAM', 'AVL', 'MVL', 'VENTR']

X = dfData.loc[:, features]  
y = dfData.loc[:, 'task_score_cat']

# Single random 80/20 row-level split; the search below only sees the training part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is not used: it was deprecated in scikit-learn 1.1 and removed in 1.3
# (candidates using it fail to fit), and for classifiers it was just an alias of
# 'sqrt'. None means "consider all features", giving a real alternative to 'sqrt'.
max_features = ['sqrt', None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
print("\n")


## Use the random grid to search for best hyperparameters

# First create the base model to tune. It is seeded so that the candidate scores,
# and therefore best_params_, are reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=1)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

pprint(rf_random.best_params_)


# Reference model: the fixed configuration used in the rest of the notebook
base_model = RandomForestClassifier(n_estimators=100, random_state=1, max_features='sqrt') 
base_model.fit(X_train, y_train)
base_predictions = base_model.predict(X_test)

print('Base Model Performance')
print(classification_report(y_test, base_predictions, zero_division=1))


# Best candidate found by the search, evaluated on the same held-out rows
best_random = rf_random.best_estimator_
random_predictions = best_random.predict(X_test)

print('Random Model Performance')
print(classification_report(y_test, random_predictions, zero_division=1))



### Regression

In [None]:
# Resources
## https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Split data
X_train, X_test, y_train, y_test = train_test_split(dfData.loc[:, features], dfData.loc[:, 'task_score'], \
                                                    test_size=0.2, random_state=42)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test).reshape(-1, 1)

# Set by the branch that matches `model`; stays None when there is nothing to tune
tuned_search = None

if model == "RFR":
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split.
    # None ("all features") replaces 'auto': for regressors 'auto' meant exactly
    # that, but the string was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features = [None, 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    pprint(random_grid)
    print("\n")
    
    ## Use the random grid to search for best hyperparameters
    
    # First create the base model to tune (seeded so best_params_ is reproducible)
    rf = RandomForestRegressor(random_state=1)
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X_train, y_train)

    pprint(rf_random.best_params_)

    tuned_search = rf_random
    # Reference model: the fixed configuration used in the rest of the notebook
    base_model = RandomForestRegressor(n_estimators=100, random_state=1, max_features='sqrt') 
    report_title = "===== Random Forest Regessor ====="
    
elif model == "SVR":
    kernel = ['linear', 'poly', 'rbf', 'sigmoid']
    
    degree = [1, 2, 3, 4, 5]
    
    gamma = ['scale', 'auto', 0.1]
    
    # Create the random grid
    random_grid = {'kernel': kernel,
                   'degree': degree,
                   'gamma': gamma}
    pprint(random_grid)
    print("\n")
    
    ## Use the random grid to search for best hyperparameters
    
    # The grid only holds len(kernel) * len(degree) * len(gamma) combinations, so
    # cap n_iter at that size instead of requesting more candidates than exist
    # (scikit-learn would warn and fall back to the full grid anyway).
    n_search_iter = min(100, len(kernel) * len(degree) * len(gamma))

    # First create the base model to tune
    sv = SVR()
    # Random search of parameters, using 3 fold cross validation, 
    # and use all available cores
    sv_random = RandomizedSearchCV(estimator = sv, param_distributions = random_grid, n_iter = n_search_iter, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    # Fit the random search model
    sv_random.fit(X_train, y_train)

    pprint(sv_random.best_params_)

    tuned_search = sv_random
    # Reference model: SVR with default hyperparameters
    base_model = SVR() 
    report_title = "===== Support Vector Machine Regessor ====="

else:
    # Previously this case did nothing at all, which is easy to miss in Run All
    print("No regression search space defined for model %r; nothing to tune." % (model,))


if tuned_search is not None:
    # Held-out MAE of the untuned reference model
    base_model.fit(X_train, y_train)
    base_predictions = base_model.predict(X_test)
    base_mae = metrics.mean_absolute_error(y_test, base_predictions)

    print(report_title)
    print('Base Model Performance')
    print('MAE: ', np.round(base_mae, 2))


    # Held-out MAE of the best candidate found by the search
    best_random = tuned_search.best_estimator_
    random_predictions = best_random.predict(X_test)
    random_mae = metrics.mean_absolute_error(y_test, random_predictions)

    print('Random Model Performance')
    print('MAE: ', np.round(random_mae, 2))


    # MAE is an error, so improvement is the relative *reduction*:
    # positive when the tuned model has the lower MAE.
    print('Improvement of {:0.2f}%.'.format( 100 * (base_mae - random_mae) / base_mae))



In [8]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Plot a confusion matrix as an annotated Seaborn heatmap on a new matplotlib figure.

    Arguments
    ---------
    cf:            confusion matrix to be passed in (2-D array-like, rows = true labels,
                   columns = predicted labels, as produced by sklearn's confusion_matrix).
    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells in cf.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    percent:       If True, show each cell as a percentage of all observations. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure (accuracy; plus precision,
                   recall and F1 of class 1 for a 2x2 matrix). Default is True.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html
                   
    title:         Title for the heatmap. Default is None.

    Returns nothing; the figure is drawn through matplotlib/seaborn. Any ratio whose
    denominator is zero (e.g. precision when class 1 is never predicted) is reported
    as 0 instead of nan.
    '''

    def _safe_ratio(numerator, denominator):
        # 0.0 instead of nan/inf (and a NumPy RuntimeWarning) for a zero denominator
        return float(numerator) / float(denominator) if denominator else 0.0

    # Accept nested lists as well as ndarrays
    cf = np.asarray(cf)
    total = float(np.sum(cf))


    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for _ in range(cf.size)]

    if group_names and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        # Share of all observations per cell; all zeros for an empty matrix
        shares = cf.flatten() / total if total else np.zeros(cf.size)
        group_percentages = ["{0:.2%}".format(value) for value in shares]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])


    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        #Accuracy is sum of diagonal divided by total observations
        accuracy = _safe_ratio(np.trace(cf), total)

        #if it is a binary confusion matrix, show some more stats
        if len(cf) == 2:
            #Metrics for Binary Confusion Matrices (positive class = index 1)
            precision = _safe_ratio(cf[1,1], np.sum(cf[:,1]))
            recall    = _safe_ratio(cf[1,1], np.sum(cf[1,:]))
            f1_score  = _safe_ratio(2*precision*recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy,precision,recall,f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""


    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        #Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        #Do not show categories if xyticks is False
        categories = False


    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        # Summary statistics are appended to the x-axis label so they render below the plot
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)
    
    if title:
        plt.title(title)

In [None]:
# # Define fold names
# train_folds = []
# test_folds = []
# set_type = "test"
# for j in range(1,num_folds+1): 
#     col_name = "Fold" + str(j) + "_" + set_type
#     test_folds.append(col_name)
#     set_type = "train"  
#     col_name = "Fold" + str(j) + "_" + set_type
#     train_folds.append(col_name)
#     set_type = "test"

# # print("Train fold names: ", train_folds)
# # print("Test fold names: ",test_folds)


# folds_dict_list = []

# # Split teams into 5 groups
# teams = pd.unique(dfMeasures.GROUPID)

# # For every iteration
# for i in range(1,num_iters+1):
#     print("\n\n=============Iteration: ", i)
#     random.Random(i).shuffle(teams)

# #     teams = shuffle(teams, random_state=i)

#     groups = np.array_split(teams, num_folds)
#     print("\ngroups: ")
#     for k,grp in enumerate(groups):
#         print(k, grp)
    
#     # Define groups for each fold
#     fold_groups = {}
#     for j, (train_fold, test_fold) in enumerate(zip(train_folds, test_folds)):
#         # make the current group the test group
#         fold_groups[test_fold] = groups[j]
#         # make all other groups the train group
#         train_group = groups[:j] + groups[j+1:]
#         train_group = [team for group in train_group for team in group]
#         fold_groups[train_fold] = train_group
        
#     ## Confirm that for each fold, there is no team overlap bewteen train and test set
#     for j in range(1,num_folds+1):
#         assert set(fold_groups['Fold'+str(j)+'_test']).isdisjoint(set(fold_groups['Fold'+str(j)+'_train'])), "There is overlap in train and test set " + str(j)
    
#     print("* No team overlap *")
    
#     print("\n!!!!! FOLD GROUPS BEFORE IT GOES BAD")
#     for kei in fold_groups:
#         print("\n", kei)
#         print(fold_groups[kei])

        
#     if i == 2:
#         print("\n BEFORE APPEND folds_dict_list[i-2] index = ", i-2)
#         folds_dict = folds_dict_list[i-2]
#         for key in folds_dict:
#             print("\n", key)
#             print(folds_dict[key])
      
#     # Add fold groups to dictionary
#     folds_dict_list.append(fold_groups)
    
#     if i == 2:
#         print("\n AFTER APPEND folds_dict_list[i-2] index = ", i-2)
#         folds_dict = folds_dict_list[i-2]
#         for key in folds_dict:
#             print("\n", key)
#             print(folds_dict[key])   

# # Informational
# # print("\nNumber of iterations: ", len(folds_dict_list))

# print("\n*~*~*~*~*~* iterate through folds_dict_list: ")
# for i,dicti in enumerate(folds_dict_list):
#     print("\n list index: ", i)
#     for j in range(1,num_folds+1):
#         assert set(dicti['Fold'+str(j)+'_test']).isdisjoint(set(dicti['Fold'+str(j)+'_train'])), "There is overlap in train and test set " + str(j)
    
    
