In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

In [24]:
datasets = ["Study_A.csv", "Study_B.csv", "Study_C.csv", "Study_D.csv"]
train = pd.concat([pd.read_csv(df) for df in datasets])
test = pd.read_csv("Study_E.csv")
train = train.to_numpy()
test = test.to_numpy()

print(train.shape)
print(test.shape)

(20947, 40)
(1962, 39)


In [25]:
scores = train[:, 7:-1]
testIDs = test[:, 5]
test_scores = test[:, 7:]

toLabel = {"Passed": 0, "Assign to CS": 1, "Flagged": 1}

scores = np.concatenate((train[:,2].reshape((-1, 1)), scores, train[:, -1].reshape((-1, 1))), axis=1)
testPy = np.concatenate((test[:, 2].reshape((-1, 1)), test_scores, test[:, -1].reshape((-1, 1))), axis=1)

print(scores.shape)
print(testPy.shape)


(20947, 34)
(1962, 34)


In [26]:
def featureExtraction(arr):
    final = arr[1]
    final = np.hstack((final, arr[2:-1]))
    return final

def Labels(arr, getLabs=True):
    patient = arr[0, 0]
    labels = [toLabel[x[-1]] for x in arr[1:]] if getLabs else []
    sequences = []
    runningAggregation = featureExtraction(arr[0])

    for i, x in enumerate(arr[1:], start=1):
        if x[0] == patient:
            runningAggregation = np.vstack((runningAggregation, featureExtraction(x)))
        else:
            sequences.append(runningAggregation)
            runningAggregation = featureExtraction(x)
            patient = x[0]

    sequences.append(runningAggregation)

    if getLabs:
        labels.append(toLabel[arr[-1][-1]])

    return sequences, labels


nuSeqs, nuLabels = Labels(scores)
testSeqs, _ = Labels(testPy, False)

print(nuSeqs[0].shape)
print(testSeqs[0].shape)

(7, 32)
(32,)


In [27]:
def process(inp, window_size=1):
    X = []
    for x in inp:
        if x.ndim == 1:
            x = x[np.newaxis, :]
        padded_x = np.pad(x, pad_width=((window_size, window_size), (0, 0)), mode='edge')
        for y in range(window_size, len(x)+window_size):
            seq = padded_x[y-window_size:y+window_size+1].flatten()
            X.append(seq)
    return X

X = process(nuSeqs)
Xtest = process(testSeqs)

y = np.hstack(nuLabels)
X = np.vstack(X)
Xtest = np.vstack(Xtest)

Xflag = X[y == 1]
Xnorm = X[y == 0]

print(Xflag.shape)
print(Xnorm.shape)

(5106, 96)
(15841, 96)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [200, 500, 1000],
    'max_depth': [2, 4, 6, 8]
}

clf = GradientBoostingClassifier()
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

reg = GradientBoostingClassifier(**best_params).fit(X_train, y_train)
y_train_preds = reg.predict(X_train)
y_test_preds = reg.predict(X_test)
train_loss = log_loss(y_train, y_train_preds)
test_loss = log_loss(y_test, y_test_preds)

print('Best Hyperparameters: ', best_params)
print('Train Loss: ', train_loss)
print('Test Loss: ', test_loss)

Best Hyperparameters:  {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 500}
Train Loss:  6.637865629815334
Test Loss:  6.847433913541111


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


reg = GradientBoostingClassifier(learning_rate=0.8, n_estimators=400, max_depth=10).fit(X_train, y_train)
y_train_preds = reg.predict(X_train)
y_test_preds = reg.predict(X_test)
train_loss = log_loss(y_train, y_train_preds)
test_loss = log_loss(y_test, y_test_preds)

print('Best Hyperparameters: ', best_params)
print('Train Loss: ', train_loss)
print('Test Loss: ', test_loss)

Best Hyperparameters:  {'learning_rate': 0.05, 'max_depth': 6, 'max_features': 0.5, 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 200}
Train Loss:  2.220446049250313e-16
Test Loss:  5.8667712676319566


In [31]:
reg = GradientBoostingClassifier(**best_params).fit(X, y)
preds = reg.predict_proba(Xtest)[:, 1]
out = pd.DataFrame({"AssessmentID": testIDs, "LeadStatus": preds})
out.to_csv("classification_final.csv", index=False)