In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, plot_roc_curve


In [109]:
df = pd.read_csv('../20220911_data.csv')
df.drop(columns=['hire_date'], inplace=True)
for var in ('gender', 'ethnicity'):
    temp = pd.get_dummies(df[var], prefix=var, drop_first=True)
    df.drop(columns=[var], inplace=True)
    df = df.join(temp)
    

In [110]:
y = df['terminated_in_first_year']
X = df.drop(columns=['terminated_in_first_year'])


In [111]:
cols = X.columns.str.contains('strengths|weakness|compared_to_others')

# TODO: scale to 0-1
crosschq_vars = X.iloc[:, cols].values
crosschq_vars = MinMaxScaler().fit_transform(crosschq_vars)

pca = PCA(n_components=crosschq_vars.shape[1])
pca.fit(crosschq_vars)

var_explained = pca.explained_variance_ratio_.cumsum()

components = np.argmax(var_explained >= 0.75)

pca75 = PCA(components)
crosschq_vars75 = pca75.fit_transform(crosschq_vars)

X = X.iloc[:, ~cols]
crosschq_vars75 = pd.DataFrame(crosschq_vars75)
crosschq_vars75.columns = [f'pcomponent_{i}' for i in crosschq_vars75.columns]
X = X.join(crosschq_vars75)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)


In [112]:
import torch

In [113]:
X_train, X_test = torch.Tensor(X_train.astype(float).values),torch.Tensor(X_test.astype(float).values)
y_train, y_test = torch.Tensor(y_train.astype(float).values),torch.Tensor(y_test.astype(float).values)

y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [157]:
input_dim = X.shape[1]
hidden_dim = 16 # param
output_dim = 1
dropout_rate = 0.1 # param
learning_rate = 0.01 # param
momentum = 0.9 # param


class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.cf1 = torch.nn.Linear(input_dim, hidden_dim)
        self.cf2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.cf3 = torch.nn.Linear(hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, x):
        x = torch.sigmoid(self.cf1(x))
        x = torch.sigmoid(self.cf2(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.cf3(x))
        return x
    
model = LogisticRegression(input_dim, output_dim)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)


In [177]:
model.train()
for epoch in range(10000):  # loop over the dataset multiple times

    running_loss = 0.0
    # get the inputs; data is a list of [inputs, labels]
    inputs = X_train
    labels = y_train

    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # print statistics
    running_loss += loss.item()
    if epoch % 500 == 0:
        print(f'[{epoch:5d}] loss: {(running_loss/500):.3f} ||| test accuracy: {((model(X_test).round() == y_test).sum()/len(y_test)):.3f}')
        running_loss = 0.0

print('Finished Training')

[    0] loss: 0.001 ||| test accuracy: 0.805
[  500] loss: 0.001 ||| test accuracy: 0.827
[ 1000] loss: 0.001 ||| test accuracy: 0.830


KeyboardInterrupt: 

In [178]:
model.eval()

(model(X_train).round() == y_train).sum()/len(y_train)

tensor(0.7953)

In [179]:
(model(X_test).round() == y_test).sum()/len(y_test)

tensor(0.8207)

In [180]:
(model(X_test).round() == y_test).sum()/len(y_test)

tensor(0.8207)

In [181]:
running_loss, loss

(0.0, tensor(0.4675, grad_fn=<BinaryCrossEntropyBackward0>))