# Aggregate Dataset for PSP



In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

In [2]:
dir_root = '../../data/modified_psp_data/'
dir_out = '../../data/modified_psp_data/'

path_train = os.path.join(dir_root, "df_agg_20230426_001200_01.csv")


In [3]:
def get_null_cols(df_data, num_null=0):
    display(df_data[df_data.isnull().any(axis=1)])
    
    dict_col_nulls = dict(df_data.isnull().sum(axis = 0))
    display(dict_col_nulls)
    
    list_null_cols = []
    for col, count_nulls in dict_col_nulls.items():
        if count_nulls > num_null:
            list_null_cols.append(col)
    
    display(list_null_cols)
    return list_null_cols
        

In [4]:
def read_data(csv_path):
    df_agg = pd.read_csv(csv_path)
    
    list_null_cols = get_null_cols(df_agg)
    
    list_all_cols = list(df_agg.columns)
    list_selected_cols = [x for x in list_all_cols if x not in list_null_cols]
    
    print(f"Removed NULL columns: {len(list_null_cols)}")
    # Alternatively, we could assign a specific value for each column having NULL. 
    # Needs more domain knowledge.
    
    df_agg = df_agg[list_selected_cols]
    
    return df_agg, list_null_cols
    

In [5]:
df_agg, list_null_cols = read_data(path_train)

df_agg.head()

Unnamed: 0,ith_sess,session_id,navigate_click:0-4,cutscene_click:0-4,person_click:0-4,object_click:0-4,notification_click:0-4,observation_click:0-4,object_hover:0-4,map_hover:0-4,...,q13,q14,q15,q16,q17,q18,num_correct,notebook_click:0-4,notebook_click:5-12,notebook_click:13-22
0,0,20090312431273200,81,28,22,11,8,4.0,4.0,4.0,...,0,1,1,0,1,1,16,,,
3,3,20090314363702160,66,32,18,11,6,,8.0,2.0,...,0,1,0,0,1,1,15,,,6.0
4,4,20090314441803444,37,29,18,6,5,,5.0,4.0,...,0,1,1,1,1,1,14,,,
5,5,20090315081004164,78,36,18,35,9,2.0,6.0,1.0,...,1,1,1,1,1,1,14,,2.0,
10,10,20090317111400710,124,29,22,30,11,7.0,9.0,3.0,...,1,1,1,0,1,1,12,,16.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11768,11768,22100212552203824,147,39,34,37,8,,4.0,4.0,...,0,1,0,1,0,0,7,,22.0,60.0
11770,11770,22100213133089136,62,32,18,10,6,,3.0,2.0,...,0,1,1,0,1,1,15,,3.0,2.0
11771,11771,22100215032067016,74,45,21,15,5,,2.0,2.0,...,0,1,1,1,1,1,14,,2.0,
11772,11772,22100215190998610,96,28,22,21,9,,2.0,6.0,...,0,0,0,0,0,0,3,,,53.0


{'ith_sess': 0,
 'session_id': 0,
 'navigate_click:0-4': 0,
 'cutscene_click:0-4': 0,
 'person_click:0-4': 0,
 'object_click:0-4': 0,
 'notification_click:0-4': 0,
 'observation_click:0-4': 3843,
 'object_hover:0-4': 934,
 'map_hover:0-4': 2062,
 'map_click:0-4': 0,
 'checkpoint:0-4': 1,
 'person_click:5-12': 0,
 'navigate_click:5-12': 0,
 'object_click:5-12': 0,
 'object_hover:5-12': 934,
 'cutscene_click:5-12': 0,
 'map_hover:5-12': 934,
 'notification_click:5-12': 0,
 'map_click:5-12': 0,
 'observation_click:5-12': 0,
 'checkpoint:5-12': 0,
 'navigate_click:13-22': 0,
 'person_click:13-22': 0,
 'cutscene_click:13-22': 0,
 'object_click:13-22': 0,
 'map_hover:13-22': 934,
 'object_hover:13-22': 934,
 'notification_click:13-22': 0,
 'map_click:13-22': 0,
 'observation_click:13-22': 0,
 'checkpoint:13-22': 0,
 'elapsed_time:0-4': 1,
 'event_index:0-4': 1,
 'elapsed_time:5-12': 0,
 'event_index:5-12': 0,
 'elapsed_time:13-22': 0,
 'event_index:13-22': 0,
 'q1': 0,
 'q2': 0,
 'q3': 0,
 '

['observation_click:0-4',
 'object_hover:0-4',
 'map_hover:0-4',
 'checkpoint:0-4',
 'object_hover:5-12',
 'map_hover:5-12',
 'map_hover:13-22',
 'object_hover:13-22',
 'elapsed_time:0-4',
 'event_index:0-4',
 'notebook_click:0-4',
 'notebook_click:5-12',
 'notebook_click:13-22']

Removed NULL columns: 13


Unnamed: 0,ith_sess,session_id,navigate_click:0-4,cutscene_click:0-4,person_click:0-4,object_click:0-4,notification_click:0-4,map_click:0-4,person_click:5-12,navigate_click:5-12,...,q10,q11,q12,q13,q14,q15,q16,q17,q18,num_correct
0,0,20090312431273200,81,28,22,11,8,2,104,103,...,1,1,1,0,1,1,0,1,1,16
1,1,20090312433251036,49,36,18,15,5,3,97,115,...,0,0,1,0,1,0,1,0,1,10
2,2,20090314121766812,64,26,19,19,8,2,98,131,...,1,1,1,0,1,1,1,0,1,12
3,3,20090314363702160,66,32,18,11,6,2,114,99,...,1,1,1,0,1,0,0,1,1,15
4,4,20090314441803444,37,29,18,6,5,2,93,76,...,1,0,1,0,1,1,1,1,1,14


In [42]:
list_id = [
    'ith_sess',
    'session_id'
]

list_agg = [
    'navigate_click:0-4',
    'cutscene_click:0-4',
    'person_click:0-4',
    'object_click:0-4',
    'notification_click:0-4',
    'observation_click:0-4',
    'object_hover:0-4',
    'map_hover:0-4',
    'map_click:0-4',
    'checkpoint:0-4',
    'person_click:5-12',
    'navigate_click:5-12',
    'object_click:5-12',
    'object_hover:5-12',
    'cutscene_click:5-12',
    'map_hover:5-12',
    'notification_click:5-12',
    'map_click:5-12',
    'observation_click:5-12',
    'checkpoint:5-12',
    'navigate_click:13-22',
    'person_click:13-22',
    'cutscene_click:13-22',
    'object_click:13-22',
    'map_hover:13-22',
    'object_hover:13-22',
    'notification_click:13-22',
    'map_click:13-22',
    'observation_click:13-22',
    'checkpoint:13-22',
    'notebook_click:0-4',
    'notebook_click:5-12',
    'notebook_click:13-22'
]
list_agg.sort()

list_checkpoint_agg = [
    'elapsed_time:0-4',
    'event_index:0-4',
    'elapsed_time:5-12',
    'event_index:5-12',
    'elapsed_time:13-22',
    'event_index:13-22'
]
list_checkpoint_agg.sort()

list_q = [f"q{x}" for x in range(1, 19)]

list_num_correct = ['num_correct']


list_agg = [x for x in list_agg if x not in list_null_cols]
list_checkpoint_agg = [x for x in list_checkpoint_agg if x not in list_null_cols]

list_cols = list_id + list_agg + list_checkpoint_agg + list_q + list_num_correct


In [43]:
len(list_cols)

47

In [17]:
df_agg = df_agg[list_cols]

display(df_agg.shape)

df_agg.head()


(11779, 47)

Unnamed: 0,ith_sess,session_id,checkpoint:13-22,checkpoint:5-12,cutscene_click:0-4,cutscene_click:13-22,cutscene_click:5-12,map_click:0-4,map_click:13-22,map_click:5-12,...,q10,q11,q12,q13,q14,q15,q16,q17,q18,num_correct
0,0,20090312431273200,1,1,28,60,12,2,6,8,...,1,1,1,0,1,1,0,1,1,16
1,1,20090312433251036,1,1,36,65,11,3,45,16,...,0,0,1,0,1,0,1,0,1,10
2,2,20090314121766812,1,1,26,58,14,2,7,9,...,1,1,1,0,1,1,1,0,1,12
3,3,20090314363702160,1,1,32,76,11,2,12,7,...,1,1,1,0,1,0,0,1,1,15
4,4,20090314441803444,1,1,29,57,11,2,6,6,...,1,0,1,0,1,1,1,1,1,14


# Split into train and test data

In [18]:
import random

def create_train_test_splits(dir_out, df_data, test_split=0.2):
    list_unique_sessions = list(df_data['session_id'].unique())
    print(f"list_unique_sessions: {len(list_unique_sessions)}")
    
    len_data = len(list_unique_sessions)
    len_test = int(test_split * len_data)
    
    list_random_sessions = random.sample(list_unique_sessions, len_data)
    
    list_train_sessions = list_random_sessions[:-len_test]
    list_test_sessions = list_random_sessions[-len_test:]
    
    df_train = pd.DataFrame()
    df_train["session_id"] = list_train_sessions
    df_train["split_type"] = "train"
    
    df_test = pd.DataFrame()
    df_test["session_id"] = list_test_sessions
    df_test["split_type"] = "test"
    
    print(f"Train split: {df_train.shape}, Test split: {df_test.shape}")
    
    df_train_test_splits = pd.concat([df_train, df_test], axis=0)
    print(f"Combined len: {df_train_test_splits.shape}")
    
    split_out_path = os.path.join(dir_out, "df_train_test_splits.csv")
    
    df_train_test_splits.to_csv(split_out_path, index=False)
    print(f"Saved df_train_test_splits at: {split_out_path}")
    
    return df_train_test_splits
    

    
def get_train_test_splits_splits(df_agg):
    print(f"Combined len: {df_agg.shape}")
    
    split_out_path = os.path.join(dir_out, "df_train_test_splits.csv")
    df_train_test_splits = pd.read_csv(split_out_path)
    
    list_train_sessions = list(df_train_test_splits[df_train_test_splits["split_type"] == "train"]['session_id'])
    list_test_sessions = list(df_train_test_splits[df_train_test_splits["split_type"] == "test"]['session_id'])
 
    df_agg_train = df_agg[df_agg['session_id'].isin(list_train_sessions)].copy()
    df_agg_test = df_agg[df_agg['session_id'].isin(list_test_sessions)].copy()
    
    print(f"Train split: {df_agg_train.shape}, Test split: {df_agg_test.shape}")
    
    
    return df_agg_train, df_agg_test
    


In [19]:
# df_1 = create_train_test_splits(dir_out, df_agg)

df_agg_train, df_agg_test = get_train_test_splits_splits(df_agg)


Combined len: (11779, 47)
Train split: (9424, 47), Test split: (2355, 47)


# Statistical Analysis

Reference:
1. https://www.statology.org/statsmodels-linear-regression-p-value/
2. https://www.statsmodels.org/stable/api.html

In [20]:
# !pip install statsmodels

In [21]:
import statsmodels.api as sm


In [28]:
# df_temp = df_agg_train.head(100)
df_temp = df_agg_train.copy()

list_cols_x = list_agg + list_checkpoint_agg
list_cols_y = list_q
col_y = list_cols_y[-1]


In [29]:

# define predictor and response variables
y = df_temp[col_y]
x = df_temp[list_cols_x]

# add constant to predictor variables
x = sm.add_constant(x)

# fit linear regression model
model = sm.OLS(y, x).fit()

# view model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            num_correct   R-squared:                       0.296
Model:                            OLS   Adj. R-squared:                  0.294
Method:                 Least Squares   F-statistic:                     151.9
Date:                Wed, 26 Apr 2023   Prob (F-statistic):               0.00
Time:                        15:41:07   Log-Likelihood:                -22382.
No. Observations:                9424   AIC:                         4.482e+04
Df Residuals:                    9397   BIC:                         4.501e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [30]:

# define predictor and response variables
y = df_temp[col_y]
x = df_temp[list_cols_x]

# add constant to predictor variables
x = sm.add_constant(x)

# fit linear regression model
model = sm.GLM(y, x).fit()

# view model summary
print(model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:            num_correct   No. Observations:                 9424
Model:                            GLM   Df Residuals:                     9397
Model Family:                Gaussian   Df Model:                           26
Link Function:               identity   Scale:                          6.7875
Method:                          IRLS   Log-Likelihood:                -22382.
Date:                Wed, 26 Apr 2023   Deviance:                       63782.
Time:                        15:59:04   Pearson chi2:                 6.38e+04
No. Iterations:                     3   Pseudo R-squ. (CS):             0.3424
Covariance Type:            nonrobust                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [36]:
def get_list_significant_features(model):
    dict_pvals = dict(model.pvalues)
    display(dict_pvals)

    # Set significance level to reject NULL Hypothesis
    significance_level = 0.05
    significant_vars = {var: pval for var, pval in dict_pvals.items() if pval <= significance_level}

    display(significant_vars)

    list_significant_features = [var for var, pval in dict_pvals.items() if pval <= significance_level and var != 'const']
    
    return list_significant_features


In [37]:
# Use only pvalue based selected features
list_significant_features = get_list_significant_features(model)

# define predictor and response variables
y = df_temp[col_y]
x = df_temp[list_significant_features]

# add constant to predictor variables
x = sm.add_constant(x)

# fit linear regression model
model_sig = sm.GLM(y, x).fit()

# view model summary
print(model_sig.summary())

{'const': 8.36039767106188e-118,
 'checkpoint:13-22': 0.11018513256599263,
 'checkpoint:5-12': 0.00197183323009216,
 'cutscene_click:0-4': 8.283730792546626e-08,
 'cutscene_click:13-22': 4.461391483652825e-57,
 'cutscene_click:5-12': 0.6122578979854034,
 'map_click:0-4': 7.302012726292696e-27,
 'map_click:13-22': 3.48317372970659e-20,
 'map_click:5-12': 0.49423227967568295,
 'navigate_click:0-4': 0.004914382333259275,
 'navigate_click:13-22': 0.00010561815181218273,
 'navigate_click:5-12': 0.031193272458366325,
 'notification_click:0-4': 3.127094744760756e-13,
 'notification_click:13-22': 0.0004998655828829641,
 'notification_click:5-12': 7.461671198253791e-05,
 'object_click:0-4': 1.1847383677223368e-08,
 'object_click:13-22': 0.0021357586904013092,
 'object_click:5-12': 8.615878113886844e-07,
 'observation_click:13-22': 2.4124434833445135e-05,
 'observation_click:5-12': 6.732589835725251e-06,
 'person_click:0-4': 0.0001163872612828369,
 'person_click:13-22': 6.577066155704464e-16,
 '

{'const': 8.36039767106188e-118,
 'checkpoint:5-12': 0.00197183323009216,
 'cutscene_click:0-4': 8.283730792546626e-08,
 'cutscene_click:13-22': 4.461391483652825e-57,
 'map_click:0-4': 7.302012726292696e-27,
 'map_click:13-22': 3.48317372970659e-20,
 'navigate_click:0-4': 0.004914382333259275,
 'navigate_click:13-22': 0.00010561815181218273,
 'navigate_click:5-12': 0.031193272458366325,
 'notification_click:0-4': 3.127094744760756e-13,
 'notification_click:13-22': 0.0004998655828829641,
 'notification_click:5-12': 7.461671198253791e-05,
 'object_click:0-4': 1.1847383677223368e-08,
 'object_click:13-22': 0.0021357586904013092,
 'object_click:5-12': 8.615878113886844e-07,
 'observation_click:13-22': 2.4124434833445135e-05,
 'observation_click:5-12': 6.732589835725251e-06,
 'person_click:0-4': 0.0001163872612828369,
 'person_click:13-22': 6.577066155704464e-16,
 'person_click:5-12': 3.9498561910923746e-22,
 'event_index:13-22': 0.000436103846997721,
 'event_index:5-12': 0.004798302269949

                 Generalized Linear Model Regression Results                  
Dep. Variable:            num_correct   No. Observations:                 9424
Model:                            GLM   Df Residuals:                     9402
Model Family:                Gaussian   Df Model:                           21
Link Function:               identity   Scale:                          6.7866
Method:                          IRLS   Log-Likelihood:                -22384.
Date:                Wed, 26 Apr 2023   Deviance:                       63808.
Time:                        16:09:26   Pearson chi2:                 6.38e+04
No. Iterations:                     3   Pseudo R-squ. (CS):             0.3422
Covariance Type:            nonrobust                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [39]:
x

Unnamed: 0,const,checkpoint:5-12,cutscene_click:0-4,cutscene_click:13-22,map_click:0-4,map_click:13-22,navigate_click:0-4,navigate_click:13-22,navigate_click:5-12,notification_click:0-4,...,object_click:0-4,object_click:13-22,object_click:5-12,observation_click:13-22,observation_click:5-12,person_click:0-4,person_click:13-22,person_click:5-12,event_index:13-22,event_index:5-12
0,1.0,1,28,60,2,6,81,170,103,8,...,11,20,28,3,1,22,123,104,931,470
1,1.0,1,36,65,3,45,49,637,115,5,...,15,83,74,5,3,18,145,97,1875,544
2,1.0,1,26,58,2,7,64,190,131,8,...,19,21,44,6,2,19,104,98,1082,557
4,1.0,1,29,57,2,6,37,158,76,5,...,6,41,27,4,1,18,102,93,832,371
5,1.0,1,36,65,2,22,78,514,184,9,...,35,53,31,7,4,18,110,103,1584,640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11772,1.0,1,28,58,3,57,96,882,312,9,...,21,41,112,10,4,22,141,111,2341,872
11773,1.0,1,28,58,2,6,116,239,140,8,...,15,31,48,5,2,19,114,101,1114,567
11774,1.0,1,29,59,2,7,58,169,81,9,...,13,22,42,4,3,21,109,99,913,455
11775,1.0,1,31,70,2,13,50,148,103,9,...,19,13,51,5,2,21,135,130,1021,533


In [41]:
df_test_x = df_agg_test[list_significant_features].copy()
df_test_x['const'] = 1.0

df_test_y = df_agg_test[col_y]

preds = model_sig.predict(df_test_x)

preds

3        73.043915
7        68.109828
21       62.932052
25       71.738126
35       63.671905
           ...    
11765    82.807764
11766    80.240737
11768    59.572344
11776    52.205970
11778    50.827758
Length: 2355, dtype: float64

- The Linear model do not work as expected in this case because this is a highly non-linear problem.

# Multilabel classification problem

References:
1. https://machinelearningmastery.com/multi-label-classification-with-deep-learning/

In summary, to configure a neural network model for multi-label classification, the specifics are:

- Number of nodes in the output layer matches the number of labels.
- Sigmoid activation for each node in the output layer.
- Binary cross-entropy loss function.

In [44]:

# mlp for multi-label classification
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers import Dense




In [None]:

# get the dataset
# def get_dataset():
#     X, y = make_multilabel_classification(n_samples=1000, n_features=10, n_classes=3, n_labels=2, random_state=1)
#     return X, y

def get_dataset(df_data, list_features, list_target):
    npa_x = df_data[list_features].to_numpy()
    npa_y = df_data[list_target].to_numpy()
    return npa_x, npa_y

In [52]:
list_features = list_agg + list_checkpoint_agg
list_target = list_q

npa_x, npa_y = get_dataset(df_agg_train, 
                           list_features,
                           list_target)


In [71]:
n_inputs = npa_x.shape[1]
n_outputs = npa_y.shape[1]

print(f"n_inputs: {n_inputs}, n_outputs: {n_outputs}")

n_inputs: 26, n_outputs: 18


In [56]:
npa_x.shape

(9424, 26)

In [60]:
npa_test_x, npa_test_y = get_dataset(df_agg_test, 
                           list_features,
                           list_target)

In [62]:
npa_test_y.shape

(2355, 18)

In [73]:

 
# Multilayer Perceptron
from tensorflow.keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
    
# get the model
def get_model(num_inputs, num_outputs, num_hidden_units=32):
    layer_input = Input(shape=(num_inputs,))
    layer_hidden_01 = Dense(num_hidden_units, 
                            kernel_initializer='he_uniform', 
                            activation='relu'
                           )(layer_input)
    
    layer_hidden_02 = Dense(num_hidden_units//2, 
                            kernel_initializer='he_uniform', 
                            activation='relu'
                           )(layer_hidden_01)
    
    layer_output = Dense(num_outputs, 
                         activation='sigmoid'
                        )(layer_hidden_02)
    
    model = Model(inputs=layer_input, 
                  outputs=layer_output)
    
    
    model.compile(loss='binary_crossentropy', 
                  optimizer='adam')
    
#     model = Sequential()
    
#     model.add(Dense(num_hidden_units, 
#                     input_dim=n_inputs, 
#                     kernel_initializer='he_uniform', 
#                     activation='relu')
#              )
    
#     model.add(Dense(num_hidden_units, 
#                     input_dim=n_inputs, 
#                     kernel_initializer='he_uniform', 
#                     activation='relu')
#              )
    
#     model.add(Dense(n_outputs, 
#                     activation='sigmoid')
#              )
    
#     model.compile(loss='binary_crossentropy', 
#                   optimizer='adam')
    
    return model

In [93]:
def calc_accuracies(test_y, pred_y, num_questions=18):
    # Overall accuracy
    overall_acc = round(accuracy_score(test_y, pred_y)*100, 3)
    print(f"Overall Accuracy: {overall_acc}")
    
    num_records = test_y[:, 0].shape[0]
    dict_class_accuracies = {}
    total_acc = 0.0
    for ith in range(num_questions):
        dict_record = {}
        tmp_test_y = test_y[:, ith]
        tmp_pred_y = pred_y[:, ith]
        tmp_acc = round(accuracy_score(tmp_test_y, tmp_pred_y)*100, 3)
        
        dict_record[f"correct_{ith}_test_y"] = sum(tmp_test_y)
        dict_record[f"incorrect_{ith}_test_y"] = num_records - sum(tmp_test_y)
        
        dict_record[f"correct_{ith}_pred_y"] = sum(tmp_pred_y)
        dict_record[f"incorrect_{ith}_pred_y"] = num_records - sum(tmp_pred_y)
        
        dict_record[f"acc_{ith}_test_y"] = tmp_acc
        
        dict_class_accuracies[ith] = dict_record
        total_acc += tmp_acc
    
    display(dict_class_accuracies)
    
    avg_acc = round(total_acc / num_questions, 3)
    print(f"Average Accuracy: {avg_acc}")
    
    return dict_class_accuracies

In [94]:
npa_test_y

array([[1, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [0, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 0, 0],
       [1, 1, 1, ..., 0, 1, 1],
       [0, 1, 0, ..., 0, 1, 1]])

In [83]:
yhat[:, 0].shape

(2355,)

In [98]:
model = get_model(n_inputs, 
                  n_outputs,
                  num_hidden_units=32)
model.summary()

# model_img_path = os.path.join(dir_out, 'multilayer_perceptron_graph.png')
# plot_model(model, to_file=model_img_path)

# fit model
model.fit(npa_x, 
          npa_y, 
          verbose=1, 
          epochs=2)

# make a prediction on the test set
yhat = model.predict(npa_test_x)

# round probabilities to class labels
yhat = yhat.round()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 26)]              0         
                                                                 
 dense_32 (Dense)            (None, 32)                864       
                                                                 
 dense_33 (Dense)            (None, 16)                528       
                                                                 
 dense_34 (Dense)            (None, 18)                306       
                                                                 
Total params: 1,698
Trainable params: 1,698
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2


In [99]:

dict_acc = calc_accuracies(npa_test_y, yhat)

Overall Accuracy: 0.0


{0: {'correct_0_test_y': 1714,
  'incorrect_0_test_y': 641,
  'correct_0_pred_y': 633.0,
  'incorrect_0_pred_y': 1722.0,
  'acc_0_test_y': 36.858},
 1: {'correct_1_test_y': 2309,
  'incorrect_1_test_y': 46,
  'correct_1_pred_y': 1697.0,
  'incorrect_1_pred_y': 658.0,
  'acc_1_test_y': 71.125},
 2: {'correct_2_test_y': 2186,
  'incorrect_2_test_y': 169,
  'correct_2_pred_y': 2355.0,
  'incorrect_2_pred_y': 0.0,
  'acc_2_test_y': 92.824},
 3: {'correct_3_test_y': 1883,
  'incorrect_3_test_y': 472,
  'correct_3_pred_y': 2350.0,
  'incorrect_3_pred_y': 5.0,
  'acc_3_test_y': 79.915},
 4: {'correct_4_test_y': 1281,
  'incorrect_4_test_y': 1074,
  'correct_4_pred_y': 3.0,
  'incorrect_4_pred_y': 2352.0,
  'acc_4_test_y': 45.478},
 5: {'correct_5_test_y': 1838,
  'incorrect_5_test_y': 517,
  'correct_5_pred_y': 2355.0,
  'incorrect_5_pred_y': 0.0,
  'acc_5_test_y': 78.047},
 6: {'correct_6_test_y': 1735,
  'incorrect_6_test_y': 620,
  'correct_6_pred_y': 0.0,
  'incorrect_6_pred_y': 2355.0,
 

Average Accuracy: 61.885


# Random Forest Classifier

References:
1. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [103]:
from sklearn.ensemble import RandomForestClassifier

In [111]:
def train_rfc(train_x, train_y, max_depth=5, random_state=0):
    rfc = RandomForestClassifier(max_depth=max_depth, 
                                 random_state=random_state)
    rfc.fit(train_x, train_y)
    return rfc


def calc_dict_metrics(tmp_test_y, tmp_pred_y):
    num_records = tmp_test_y.shape[0]
    dict_metrics = {}
    
    tmp_acc = round(accuracy_score(tmp_test_y, tmp_pred_y)*100, 3)
    
    dict_metrics[f"correct_test_y"] = sum(tmp_test_y)
    dict_metrics[f"incorrect_test_y"] = num_records - sum(tmp_test_y)

    dict_metrics[f"correct_pred_y"] = sum(tmp_pred_y)
    dict_metrics[f"incorrect_pred_y"] = num_records - sum(tmp_pred_y)

    dict_metrics[f"acc_test"] = tmp_acc
    return dict_metrics


def eval_clf(clf, test_x, test_y):
    pred_y = clf.predict(test_x)
    dict_metrics = calc_dict_metrics(test_y, pred_y)
    return dict_metrics
    
def get_trainer(classifier_name):
    

def train_models(classifier_name, train_x, train_y, test_x, test_y):
    num_classifiers = train_y.shape[1]
    dict_classifiers = {}
    dict_metrics = {}
    
    # Train a separate binary classifier for each question number
    for ith_clf in range(num_classifiers):
        print(f"Training: {ith_clf+1} of {num_classifiers}")
        dict_tmp = {}
        tmp_train_y = train_y[:, ith_clf]
        dict_tmp[classifier_name] = train_rfc(train_x, tmp_train_y)
        dict_classifiers[ith_clf] = dict_tmp
        
    # Evaluate for each question number
    for ith_clf in range(num_classifiers):
        print(f"Evaluating: {ith_clf+1} of {num_classifiers}")
        dict_tmp = {}
        clf = dict_classifiers[ith_clf][classifier_name]
        tmp_test_y = test_y[:, ith_clf]
        dict_tmp[classifier_name] = eval_clf(clf, test_x, tmp_test_y)
        dict_metrics[ith_clf] = dict_tmp
        
    display(dict_metrics)
    return dict_classifiers, dict_metrics

In [112]:
dict_classifiers, dict_metrics = train_models('rfc', npa_x, npa_y, npa_test_x, npa_test_y)

Training: 0 of 18
Training: 1 of 18
Training: 2 of 18
Training: 3 of 18
Training: 4 of 18
Training: 5 of 18
Training: 6 of 18
Training: 7 of 18
Training: 8 of 18
Training: 9 of 18
Training: 10 of 18
Training: 11 of 18
Training: 12 of 18
Training: 13 of 18
Training: 14 of 18
Training: 15 of 18
Training: 16 of 18
Training: 17 of 18
Evaluating: 0 of 18
Evaluating: 1 of 18
Evaluating: 2 of 18
Evaluating: 3 of 18
Evaluating: 4 of 18
Evaluating: 5 of 18
Evaluating: 6 of 18
Evaluating: 7 of 18
Evaluating: 8 of 18
Evaluating: 9 of 18
Evaluating: 10 of 18
Evaluating: 11 of 18
Evaluating: 12 of 18
Evaluating: 13 of 18
Evaluating: 14 of 18
Evaluating: 15 of 18
Evaluating: 16 of 18
Evaluating: 17 of 18


{0: {'rfc': {'correct_test_y': 1714,
   'incorrect_test_y': 641,
   'correct_pred_y': 2244,
   'incorrect_pred_y': 111,
   'acc_test': 74.183}},
 1: {'rfc': {'correct_test_y': 2309,
   'incorrect_test_y': 46,
   'correct_pred_y': 2355,
   'incorrect_pred_y': 0,
   'acc_test': 98.047}},
 2: {'rfc': {'correct_test_y': 2186,
   'incorrect_test_y': 169,
   'correct_pred_y': 2355,
   'incorrect_pred_y': 0,
   'acc_test': 92.824}},
 3: {'rfc': {'correct_test_y': 1883,
   'incorrect_test_y': 472,
   'correct_pred_y': 2297,
   'incorrect_pred_y': 58,
   'acc_test': 81.146}},
 4: {'rfc': {'correct_test_y': 1281,
   'incorrect_test_y': 1074,
   'correct_pred_y': 1573,
   'incorrect_pred_y': 782,
   'acc_test': 63.397}},
 5: {'rfc': {'correct_test_y': 1838,
   'incorrect_test_y': 517,
   'correct_pred_y': 2270,
   'incorrect_pred_y': 85,
   'acc_test': 78.344}},
 6: {'rfc': {'correct_test_y': 1735,
   'incorrect_test_y': 620,
   'correct_pred_y': 2262,
   'incorrect_pred_y': 93,
   'acc_test': 74

# Imbalanced Classification

References:
1. https://machinelearningmastery.com/framework-for-imbalanced-classification-projects/
2. https://www.analyticsvidhya.com/blog/2020/07/10-techniques-to-deal-with-class-imbalance-in-machine-learning/

# TensorFlow based imbalanced dataset

References:
1. https://www.tensorflow.org/tutorials/structured_data/imbalanced_data