In [1]:
from csr import CSRDataset, CSRGridSearcher, CSRModel

In [2]:
def mean(l):
    """Return the arithmetic mean of the values in ``l``.

    ``l`` must support both ``sum()`` and ``len()`` (e.g. a list).
    Raises ``ZeroDivisionError`` when ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

def std(l):
    """Return the population standard deviation of the values in ``l``.

    This is the square root of the mean squared deviation from the mean
    (divides by ``len(l)``, not ``len(l) - 1``).
    """
    center = mean(l)
    squared_deviations = [(value - center) ** 2 for value in l]
    variance = mean(squared_deviations)
    return variance ** (1 / 2)

def ptp(p):
    """Format the fraction ``p`` as a percentage string.

    The value is multiplied by 100 and rounded to three decimal places,
    e.g. ``0.5`` -> ``'50.0%'``.
    """
    percent = round(100 * p, 3)
    return '{}%'.format(percent)

In [3]:
# Construct the dataset from 'csr.avro' (presumably an Avro file path; being relative,
# it would resolve against the kernel's working directory — confirm against CSRDataset).
dataset = CSRDataset('csr.avro')

In [4]:
n_splits = 5

# Settings held fixed for every parameter set evaluated by the search.
# NOTE(review): if these are forwarded to XGBoost, the documented objective name is
# 'multi:softprob' (no trailing 'a') — confirm how the csr module handles this value.
default_params = {
    'eta': .05,
    'objective': 'multi:softproba',
    'num_class': 3,
    'eval_metric': ['auc'],
    'tree_method': 'gpu_hist',
    'n_estimators': 1000,
    'early_stopping_rounds': 40,
}

# Each parameter maps to a list of candidate values; the search evaluates every
# possible combination of one value per parameter. With a single candidate per
# parameter, exactly one parameter set is evaluated.
grid = {
    'lambda': [20],
    'subsample': [.5],
    'gamma': [1],
    'colsample_bytree': [.9],
    'colsample_bylevel': [.6],
    'min_child_weight': [0],
    'max_depth': [2],
}

grid_searcher = CSRGridSearcher(default_params, grid, dataset)
grid_searcher.grid_search(n_splits)

Searching through 1 parameter sets
Evaluating parameter set 1
Done searching


In [5]:
# Display the parameter set the searcher reports as best (one value per key in `grid`).
grid_searcher.best_params()

{'lambda': 20,
 'subsample': 0.5,
 'gamma': 1,
 'colsample_bytree': 0.9,
 'colsample_bylevel': 0.6,
 'min_child_weight': 0,
 'max_depth': 2}

In [6]:
# Display the searcher's metrics keyed by each tried value of 'max_depth'.
# NOTE(review): each value is a 2-tuple — presumably (mean, std) of the eval metric
# across the splits; confirm against CSRGridSearcher.param_metrics.
grid_searcher.param_metrics('max_depth')

{2: (0.6979063284477632, 0.01385193109253965)}

In [7]:
# Ensemble of 5 models whose results are combined for consistent predictions.
model = CSRModel(dataset, 5, ensemble=True)

# Active path: load the previously saved model instead of retraining.
model.load('website/model')

# Alternative path (disabled): train with the best parameters found by the grid search,
# then save the result to the location loaded above.
# data_X, data_Y = model.train(default_params | grid_searcher.best_params())
# model.save('website/model')

In [8]:
# Training as an ensemble leaves data_X and data_Y as None, so performance is measured
# on the entire dataset instead.
# CAVEAT: the model has already been trained on this data, so every figure printed
# below is optimistic compared with performance on unseen data.
data_X, data_Y = dataset.separate_input_target(dataset.db_index())

# Score the model assigned to each row's true class (data_Y holds the class index).
proba = model.predict_proba(data_X)
correct_class_scores = [proba[i][data_Y.iloc[i]] for i in range(len(data_Y))]
print(f'Average score assigned to correct class: {ptp(mean(correct_class_scores))}')
print(f'Average std of score assigned to correct class: {ptp(std(correct_class_scores))}')

pred = model.predict(data_X)
print(f'Accuracy: {ptp(mean([pred[i] == data_Y.iloc[i] for i in range(len(data_Y))]))}')

# One-vs-rest view: class T_idx is the positive class (labelled "approval" in the
# printed lines below); every other class counts as negative.
T_idx = 2
is_pos = data_Y == T_idx  # computed once and reused for every positive/negative split

# Predictions for rows whose true class is positive.
P = pred[is_pos]
TP = (P == T_idx).sum()
TPR = TP / len(P)
print(f'Sensitivity / TPR: {ptp(TPR)}')

# Predictions for rows whose true class is negative.
N = pred[~is_pos]
TN = (N != T_idx).sum()
TNR = TN / len(N)
print(f'Specificity / TNR: {ptp(TNR)}')

# Bayes' theorem:
#   P(pos | predicted pos) = P(predicted pos | pos) * P(pos) / P(predicted pos)
#   P(neg | predicted neg) = P(predicted neg | neg) * P(neg) / P(predicted neg)
FP = len(N) - TN  # negatives that were predicted positive
P_pos = is_pos.sum() / len(data_Y)  # vectorised count instead of builtin sum() over the Series
P_pos_pred = (TP + FP) / (len(P) + len(N))
print(f'P(approval|predicts approval): {ptp(TPR * P_pos / P_pos_pred)}')
print(f'P(pending/denied|predicts pending/denied): {ptp(TNR * (1 - P_pos) / (1 - P_pos_pred))}')

Average score assigned to correct class: 52.718%
Average std of score assigned to correct class: 16.039%
Accuracy: 68.434%
Sensitivity / TPR: 85.971%
Specificity / TNR: 44.176%
P(approval|predicts approval): 71.005%
P(pending/denied|predicts pending/denied): 66.446%
