In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix
import itertools
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
import seaborn as sns
import shap 
sns.set_style('darkgrid')

# Define file paths

In [2]:
# CSV of MdRQA measures; read into dfMeasures by the load-data cell.
MEASURES_FILE = "MdRQA_measures_prelim.csv"
# "shuff_" counterpart of the measures CSV; read into dfShuffMeasures.
# NOTE(review): presumably the shuffled-baseline version of the same measures - confirm.
SHUFF_MEASURES_FILE = "shuff_MdRQA_measures_prelim.csv"
# CSV of outcome labels keyed by GROUPID and block; read into dfLabels.
LABELS_FILE = "team_block_outcomes.csv"


# Load data

In [10]:
# Read the three input tables named in the file-path cell.
dfMeasures = pd.read_csv(MEASURES_FILE)
dfShuffMeasures = pd.read_csv(SHUFF_MEASURES_FILE)
dfLabels = pd.read_csv(LABELS_FILE)


# dfTaskScore = dfLabels.loc[:, ['GROUPID', 'block', 'task_score']]
# dfSubjectiveOutcomes = dfLabels.loc[:, ['GROUPID', 'block', 'CPS_and_ITN_mean']]
# dfValence = dfLabels.loc[:, ['GROUPID', 'block', 'Valence']]

# Report (rows, columns) of each table next to the file it came from.
for csv_name, loaded_frame in (
    (MEASURES_FILE, dfMeasures),
    (SHUFF_MEASURES_FILE, dfShuffMeasures),
    (LABELS_FILE, dfLabels),
):
    print("{} shape: {}".format(csv_name, loaded_frame.shape))


MdRQA_measures_prelim.csv shape: (271, 11)
shuff_MdRQA_measures_prelim.csv shape: (271, 11)
team_block_outcomes.csv shape: (274, 7)


# Experiment Set Up

In [19]:
# Experiment configuration
shuffled = False  # True -> use the shuffled measures as the feature table
num_iters = 2     # number of repeated cross-validation runs
num_folds = 5     # number of folds per run

# Feature table selected by the `shuffled` switch.
dfData = dfShuffMeasures if shuffled else dfMeasures

# Keep only the label rows whose (GROUPID, block) key also has a row in the
# feature table that is actually used. The previous version filtered against
# the *other* measures table in each branch, so labels and features were only
# aligned when both CSVs happened to contain identical keys.
# Merging on the key columns alone also avoids suffixed duplicate columns.
key_cols = ['GROUPID', 'block']
label_cols = key_cols + ['CPS_and_ITN_mean', 'Valence', 'task_score']
dfLabels = pd.merge(dfLabels, dfData[key_cols], on=key_cols, how='inner')[label_cols]

print("dfData shape: ", dfData.shape)
print("dfLabels shape: ", dfLabels.shape)


dfData shape:  (271, 11)
dfLabels shape:  (271, 5)


## Prep for team-level cross validation

In [9]:
# Define fold names: one "Fold<k>_train" / "Fold<k>_test" key per fold
train_folds = ["Fold" + str(i) + "_train" for i in range(1, num_folds + 1)]
test_folds = ["Fold" + str(i) + "_test" for i in range(1, num_folds + 1)]

print("Train fold names: ", train_folds)
print("\nTest fold names: ",test_folds)


# One {fold name -> team ids} dictionary per iteration
folds_dict_list = []

# Split teams into num_folds groups
teams = pd.unique(dfMeasures.GROUPID)

for j in range(num_iters):
    # Seeded in-place shuffle, so every iteration gets a different but
    # reproducible team order.
    random.Random(j).shuffle(teams)

    # np.array_split returns *views* into `teams`. Without the copy, the
    # shuffle of the next iteration would rewrite the test groups already
    # stored for this iteration (after their overlap check had passed) and
    # leak training teams into earlier iterations' test sets.
    groups = [group.copy() for group in np.array_split(teams, num_folds)]

    # Define groups for each fold
    fold_groups = {}
    for i, (train_fold, test_fold) in enumerate(zip(train_folds, test_folds)):
        # make the current group the test group
        fold_groups[test_fold] = groups[i]
        # make all other groups the train group (flattened into one list)
        fold_groups[train_fold] = [team
                                   for k, group in enumerate(groups) if k != i
                                   for team in group]

    ## Confirm that for each fold, there is no team overlap between train and test set
    for i in range(1,num_folds+1):
        assert set(fold_groups['Fold'+str(i)+'_test']).isdisjoint(set(fold_groups['Fold'+str(i)+'_train'])), "There is overlap in train and test set " + str(i)

    print("* No team overlap *")

    # Add fold groups to dictionary
    folds_dict_list.append(fold_groups)

# Re-check every stored iteration once all shuffles are done: this is the
# check that catches groups being mutated by a later iteration.
for iter_idx, stored_groups in enumerate(folds_dict_list):
    for train_fold, test_fold in zip(train_folds, test_folds):
        assert set(stored_groups[test_fold]).isdisjoint(set(stored_groups[train_fold])), \
            "Stored folds overlap in iteration " + str(iter_idx + 1) + " (" + test_fold + ")"

# Informational
print("Number of iterations: ", len(folds_dict_list))
# for fold_groups in folds_dict_list:
#     for key in fold_groups:
#         print("\n", key)
#         print(fold_groups[key])
#         print("Num groups: ", len(fold_groups[key]))
    
    

Train fold names:  ['Fold1_train', 'Fold2_train', 'Fold3_train', 'Fold4_train', 'Fold5_train']

Test fold names:  ['Fold1_test', 'Fold2_test', 'Fold3_test', 'Fold4_test', 'Fold5_test']
* No team overlap *
* No team overlap *
Number of iterations:  2

 Fold1_test
[1059 1035 1080 1086 1044 1039 1050 1038 1037 2036 2049 2039 1020 2035
 2046 2027 1010 1078 1051]
Num groups:  19

 Fold1_train
[2054, 1051, 2029, 1047, 2071, 2046, 1069, 2016, 2070, 2014, 2015, 1035, 2044, 1098, 2018, 2037, 1077, 1039, 2048, 2034, 10104, 1045, 1040, 1046, 1062, 2039, 1096, 2052, 2036, 2033, 1052, 1086, 2069, 1010, 2063, 2062, 10103, 1041, 2068, 2066, 1059, 1061, 1070, 2057, 1054, 1076, 1099, 2056, 2038, 2019, 1078, 1034, 2045, 1075, 1043, 2035, 1067, 1037, 2049, 1071, 1042, 2030, 1056, 2043, 1084, 2020, 1074, 1093, 2027, 2031, 1068, 1020, 1097, 1090]
Num groups:  74

 Fold2_test
[ 1090  2063  1036 10104  1097  2052  2071  1046  2031  2069  2048  2030
  1069  2066  1096  2054  1056  2034  1099]
Num groups:  19


In [None]:
# Names of the measure columns used as model inputs
features = ['REC', 'DET', 'ADL', 'MDL', 'DENTR', 'LAM', 'AVL', 'MVL', 'VENTR']

def get_group_data(dfData, dfLabels, GROUPID_list, features_list, label_name):
    """Return the feature rows and labels belonging to the given teams.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Feature table; must contain 'GROUPID', 'block' and every column in
        ``features_list``.
    dfLabels : pandas.DataFrame
        Label table; must contain 'GROUPID', 'block' and ``label_name``.
    GROUPID_list : iterable
        Team ids to select. Rows are returned team by team, in this order.
    features_list : list of str
        Feature columns to return.
    label_name : str
        Label column to return.

    Returns
    -------
    list
        ``[data, labels]``: a DataFrame with the ``features_list`` columns and
        a Series with the ``label_name`` column, row-aligned and re-indexed
        from 0. Both are empty when ``GROUPID_list`` is empty or matches
        nothing.
    """
    # Attach each feature row to its label via the (GROUPID, block) key
    data_to_labels = pd.merge(dfData, dfLabels, on=['GROUPID', 'block'], how='inner')

    # Select *rows* of the merged table per team. The mask must come from the
    # merged table itself: a mask built from dfData is not guaranteed to line
    # up with it, and applying it on the column axis raises.
    group_frames = [data_to_labels.loc[data_to_labels['GROUPID'] == GROUPID]
                    for GROUPID in GROUPID_list]

    if group_frames:
        # Single concat instead of growing a frame inside the loop
        dfGroupsData = pd.concat(group_frames, ignore_index=True)
    else:
        # pd.concat rejects an empty list; return an empty slice that still
        # carries the merged columns
        dfGroupsData = data_to_labels.iloc[0:0]

    data = dfGroupsData.loc[:, features_list]
    labels = dfGroupsData.loc[:, label_name]
    return [data, labels]


# Predict Task Score

In [None]:
# Store AUROCS for all iterations
aurocs = []
# Per-iteration pooled truth labels, hard predictions and class-1 probabilities
all_y_test_task_score = [[] for i in range(num_iters)]
predictions_task_score = [[] for i in range(num_iters)]
predict_proba_task_score = [[] for i in range(num_iters)]

# One row of impurity importances / per-tree std per iteration; turned into
# DataFrames (columns=features) after the loop
impurity_importance_rows = []
importance_std_rows = []

# SHAP values pooled over every fold of every iteration, kept row-aligned
# with task_score_full_X_test (same folds, same order)
task_score_shap_values_0 = None
task_score_shap_values_1 = None
task_score_full_X_test = pd.DataFrame()
shap_blocks_0 = []
shap_blocks_1 = []
x_test_blocks = []

#-----------------------------------------------#
#      5-fold team level cross-validation       #
#-----------------------------------------------#

# For each iteration
for i in range(num_iters):
    print("Iteration: ", i+1)

    # Lists for cumulative test set and predictions for iteration
    all_y_test = []
    predictions = []
    predict_proba = []

    # Create RandomForestClassifier for task_score prediction
    # (one estimator object per iteration, refit from scratch on every fold)
    rfc_task_score = RandomForestClassifier(n_estimators=100, random_state=1,
                                            max_features='sqrt', class_weight='balanced')

    # Get fold groups
    fold_groups = folds_dict_list[i]

    # For each fold
    for test_fold, train_fold in zip(test_folds, train_folds):
        # Get data for teams in test set
        X_test, y_test = get_group_data(dfData, dfLabels, fold_groups[test_fold], features, 'task_score')
        all_y_test.extend(y_test.tolist())

        print("X_test shape: ", X_test.shape)
        display(X_test.head())
        print("y_test shape: ", y_test.shape)
        display(y_test.head())


        # Get data for teams in train set
        X_train, y_train = get_group_data(dfData, dfLabels, fold_groups[train_fold], features, 'task_score')

        print("X_train shape: ", X_train.shape)
        display(X_train.head())
        print("y_train shape: ", y_train.shape)
        display(y_train.head())


        # Train model
        rfc_task_score.fit(X_train, y_train)

        # Test model
        y_pred = rfc_task_score.predict(X_test)
        predictions.extend(y_pred.tolist())

        # Probability of the second class in rfc_task_score.classes_
        y_pp = rfc_task_score.predict_proba(X_test)[:, 1]
        predict_proba.extend(y_pp.tolist())

        ## Get SHAP values
        explainer = shap.TreeExplainer(rfc_task_score)
        shap_values = explainer.shap_values(X_test)

        # NOTE(review): depending on the installed shap version, classifier
        # output is either a list with one (n_samples, n_features) array per
        # class or a single (n_samples, n_features, n_classes) array - confirm
        # against the pinned version.
        if isinstance(shap_values, list):
            fold_shap_0, fold_shap_1 = shap_values[0], shap_values[1]
        else:
            shap_values = np.asarray(shap_values)
            if shap_values.ndim != 3:
                raise ValueError("Unexpected SHAP output shape: %s" % (shap_values.shape,))
            fold_shap_0, fold_shap_1 = shap_values[..., 0], shap_values[..., 1]

        # Collect per-fold blocks; they are stacked once after the loop so the
        # SHAP rows and the X_test rows always cover exactly the same folds
        shap_blocks_0.append(fold_shap_0)
        shap_blocks_1.append(fold_shap_1)
        x_test_blocks.append(X_test)
        ## End of get SHAP values


    # Get AUROC of iteration (pooled over the test sets of all folds)
    auroc = roc_auc_score(all_y_test, predict_proba)
    aurocs.append(auroc)
    print("AUROC: ", auroc)
    print("\n")

    # Save iteration task score truth labels, and predictions for micro auroc
    all_y_test_task_score[i] = all_y_test
    predictions_task_score[i] = predictions
    predict_proba_task_score[i] = predict_proba

    #-----------------------------------------------#
    #   Compute Feature Importances for iteration   #
    #-----------------------------------------------#
    # Impurity-based importances. The estimator was refit on every fold, so
    # these describe the model fitted on the last fold of this iteration.
    importances = rfc_task_score.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rfc_task_score.estimators_], axis=0)
    imps = pd.Series(importances, index=features)
    stds = pd.Series(std, index=features)
    impurity_importance_rows.append(imps)
    importance_std_rows.append(stds)

#     # Plot feature importances
#     fig1, ax1 = plt.subplots()
#     imps.plot.bar(yerr=std, ax=ax1)
#     ax1.set_title("Task Score Feature importances using MDI")
#     ax1.set_ylabel("Mean decrease in impurity")
#     plt.show()


# Stack the pooled SHAP values / test rows (left at their initial values when
# no fold was run)
if x_test_blocks:
    task_score_shap_values_0 = np.vstack(shap_blocks_0)
    task_score_shap_values_1 = np.vstack(shap_blocks_1)
    task_score_full_X_test = pd.concat(x_test_blocks, ignore_index=True)

# One row per iteration, one column per feature
impurity_importances_task_score = pd.DataFrame(impurity_importance_rows, columns=features)
importances_std_task_score = pd.DataFrame(importance_std_rows, columns=features)

# Get mean AUROC over all iterations
avg_auroc = np.mean(aurocs)
print("Mean AUROC over %d iterations: %f" % (len(aurocs), avg_auroc))

# Get median AUROC over all iterations.
# With an even number of iterations np.median averages the two middle AUROCs,
# while the "median iteration" below is the upper-middle one.
med_auroc = np.median(aurocs)
task_score_med_auroc_idx = np.argsort(aurocs)[len(aurocs)//2]
print("Median AUROC over %d iterations: %f" % (len(aurocs), med_auroc))
print("Median iteration number: ", task_score_med_auroc_idx+1)
print("All AUROCS: ", aurocs)

# Save median iteration labels and predictions
med_y_test_task_score = all_y_test_task_score[task_score_med_auroc_idx]
med_predictions_task_score = predictions_task_score[task_score_med_auroc_idx]
med_predict_proba_task_score = predict_proba_task_score[task_score_med_auroc_idx]