In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, plot_roc_curve


In [282]:
df = pd.read_csv('../20220911_data.csv')
df.drop(columns=['hire_date'], inplace=True)
for var in ('gender', 'ethnicity'):
    temp = pd.get_dummies(df[var], prefix=var, drop_first=True)
    df.drop(columns=[var], inplace=True)
    df = df.join(temp)
    

In [283]:
y = df['terminated_in_first_year']
X = df.drop(columns=['terminated_in_first_year'])


In [284]:
cols = X.columns.str.contains('strengths|weakness|compared_to_others')

# TODO: scale to 0-1
crosschq_vars = X.iloc[:, cols].values
crosschq_vars = MinMaxScaler().fit_transform(crosschq_vars)

pca = PCA(n_components=crosschq_vars.shape[1])
pca.fit(crosschq_vars)

var_explained = pca.explained_variance_ratio_.cumsum()

components = np.argmax(var_explained >= 0.75)

pca75 = PCA(components)
crosschq_vars75 = pca75.fit_transform(crosschq_vars)

X = X.iloc[:, ~cols]
crosschq_vars75 = pd.DataFrame(crosschq_vars75)
crosschq_vars75.columns = [f'pcomponent_{i}' for i in crosschq_vars75.columns]
X = X.join(crosschq_vars75)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)


In [None]:
#####################
### DECISION TREE ###
#####################

In [None]:
# https://scikit-learn.org/stable/modules/tree.html#minimal-cost-complexity-pruning
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(X_train, y_train)

# clf.score(X_train, y_train)

In [None]:
# https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html
path = clf.cost_complexity_pruning_path(X_train, y_train)
path = pd.DataFrame(path)

sns.lineplot(path, x = 'ccp_alphas', y='impurities')

In [None]:
parameters = {
    'criterion': ('entropy', 'gini'),
    'splitter': ('best', 'random'),
    'ccp_alpha': np.arange(0, 0.04, 0.001),
    'class_weight': ('balanced', None),
#     'max_depth': np.arange(5, 30, 1),
#     'min_samples_split': np.arange(2, 10, 1),
#     'min_samples_leaf': np.arange(1, 5, 1),
}

clf = DecisionTreeClassifier(random_state=0)
cv = RandomizedSearchCV(clf, parameters, n_iter=150)
cv.fit(X_train, y_train)


In [None]:
cv_results = pd.DataFrame(cv.cv_results_)
cv_results.loc[cv_results.param_class_weight != 'balanced', 'param_class_weight'] = 'None'

print(cv.best_params_, cv.best_score_)

best_clf = cv.best_estimator_

sns.lineplot(
    cv_results[cv_results.param_class_weight == 'None'],
    x='param_ccp_alpha', 
    y='mean_test_score',
    hue='param_criterion',
)


In [None]:
print(
    'train score:', best_clf.score(X_train, y_train), '||',
    'test score:', best_clf.score(X_test, y_test),
)

confusion_matrix(best_clf.predict(X_train), y_train)

In [None]:
plot_roc_curve(best_clf, X_train, y_train)

In [None]:
######################
### NEURAL NETWORK ###
######################

In [338]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
# https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss

In [339]:
scalar = MinMaxScaler()
scalar.fit(X)


In [340]:
X_train2 = torch.tensor(scalar.transform(X_train))
X_train2 = X_train2.type(torch.FloatTensor)

y_train2 = torch.tensor(y_train.astype(np.float32).values).reshape(-1, 1)
y_train2 = y_train2.type(torch.FloatTensor)

X_test2 = torch.tensor(scalar.transform(X_test))
X_test2 = X_test2.type(torch.FloatTensor)

y_test2 = torch.tensor(y_test.astype(np.float32).values).reshape(-1, 1)
y_test2 = y_test2.type(torch.FloatTensor)



In [341]:

batch_size = 32
# Create data loaders.
train_dataloader = DataLoader(
    TensorDataset(X_train2, y_train2),
    batch_size=batch_size,
)
test_dataloader = DataLoader(
    TensorDataset(X_test2, y_test2),
    batch_size=batch_size,
)

for X, y in train_dataloader:
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")
    break
    

Shape of X: torch.Size([32, 27])
Shape of y: torch.Size([32, 1])


In [342]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_stack = nn.Sequential(
            nn.Linear(X.shape[1], 1),
#             nn.Linear(X.shape[1], 32),
#             nn.Sigmoid(),
#             nn.Linear(32, 32),
#             nn.Sigmoid(),
#             nn.Linear(32, 1),
#             nn.Sigmoid(),
        )

    def forward(self, x):
#         x = self.flatten(x)
        logits = self.linear_stack(x)
        return logits

model = NeuralNetwork().to(device)
print(model)

Using cpu device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_stack): Sequential(
    (0): Linear(in_features=27, out_features=1, bias=True)
  )
)


In [343]:
class_weights = 1 - torch.tensor(df['terminated_in_first_year'].mean())
print(class_weights)

tensor(0.7578, dtype=torch.float64)


In [346]:
# loss_fn = nn.CrossEntropyLoss(weight=class_weights)  # need to weight the classes because they're imbalanced
# loss_fn = nn.NLLLoss()
loss_fn = nn.MSELoss()
# loss_fn = nn.BCELoss()
# loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc


In [345]:

X_train2.shape, y_train2.shape

(torch.Size([1314, 27]), torch.Size([1314, 1]))

In [333]:
# pred = model(X_train2)

# loss = loss_fn(pred, y_train2)

# pred, y_train2

# # Backpropagation
# optimizer.zero_grad()
# loss.backward()
# optimizer.step()

In [347]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        
        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)
        
        # Backpropagation
#         optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 20 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [350]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()            
            correct += ((pred > 0.5) == y).type(torch.float).sum().item()
            
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [351]:
epochs = 100
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)

print("Done!")

Epoch 1
-------------------------------
loss: 3205537030209536.000000  [    0/ 1314]
loss: 30424589700258160705536.000000  [  640/ 1314]
loss: 67591330639009372206180335616.000000  [ 1280/ 1314]
Test Error: 
 Accuracy: 20.1%, Avg loss: 645154827424945236768406372352.000000 

Epoch 2
-------------------------------
loss: 568499557851827420920958418944.000000  [    0/ 1314]
loss: 5395784569660563810861652366208270336.000000  [  640/ 1314]
loss:     inf  [ 1280/ 1314]
Test Error: 
 Accuracy: 20.1%, Avg loss:      inf 

Epoch 3
-------------------------------
loss:     inf  [    0/ 1314]
loss:     inf  [  640/ 1314]
loss:     inf  [ 1280/ 1314]
Test Error: 
 Accuracy: 20.1%, Avg loss:      inf 

Epoch 4
-------------------------------
loss:     inf  [    0/ 1314]
loss:     inf  [  640/ 1314]
loss:     inf  [ 1280/ 1314]
Test Error: 
 Accuracy: 20.1%, Avg loss:      inf 

Epoch 5
-------------------------------
loss:     inf  [    0/ 1314]
loss:     nan  [  640/ 1314]
loss:     nan  [ 1280/

Test Error: 
 Accuracy: 79.9%, Avg loss:      nan 

Epoch 63
-------------------------------
loss:     nan  [    0/ 1314]
loss:     nan  [  640/ 1314]
loss:     nan  [ 1280/ 1314]
Test Error: 
 Accuracy: 79.9%, Avg loss:      nan 

Epoch 64
-------------------------------
loss:     nan  [    0/ 1314]
loss:     nan  [  640/ 1314]
loss:     nan  [ 1280/ 1314]
Test Error: 
 Accuracy: 79.9%, Avg loss:      nan 

Epoch 65
-------------------------------
loss:     nan  [    0/ 1314]
loss:     nan  [  640/ 1314]
loss:     nan  [ 1280/ 1314]
Test Error: 
 Accuracy: 79.9%, Avg loss:      nan 

Epoch 66
-------------------------------
loss:     nan  [    0/ 1314]
loss:     nan  [  640/ 1314]
loss:     nan  [ 1280/ 1314]
Test Error: 
 Accuracy: 79.9%, Avg loss:      nan 

Epoch 67
-------------------------------
loss:     nan  [    0/ 1314]
loss:     nan  [  640/ 1314]
loss:     nan  [ 1280/ 1314]
Test Error: 
 Accuracy: 79.9%, Avg loss:      nan 

Epoch 68
-------------------------------
loss:  

In [352]:
(sum(model(X_train2).round() == y_train2)/len(y_train2)).item(), 1 - y_train2.sum()/len(y_train2)

# loss_fn(model(X_train2), y_train2.type(torch.LongTensor))

(0.0, tensor(0.7473))

In [320]:
### baseline logistic regression, training accuracy
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression(random_state=0).fit(X_train, y_train)
reg.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8115501519756839