In [110]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [111]:
data = pd.read_csv('./data/HR.csv')

In [112]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,part,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [113]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   part                   14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [114]:
data['left'].value_counts()

left
0    11428
1     3571
Name: count, dtype: int64

In [115]:
data['part'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [116]:
#for discrete data, to convert them to numeric value:
#1. directly assign a numerica value
#2. one-hot
data = data.join(pd.get_dummies(data.part).astype(int)).join(pd.get_dummies(data.salary).astype(int)).drop(columns = ['part', 'salary'])

In [117]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,IT,RandD,...,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [118]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   IT                     14999 non-null  int64  
 9   RandD                  14999 non-null  int64  
 10  accounting             14999 non-null  int64  
 11  hr                     14999 non-null  int64  
 12  management             14999 non-null  int64  
 13  marketing              14999 non-null  int64  
 14  product_mng            14999 non-null  int64  
 15  sa

In [119]:
data.left.value_counts()

left
0    11428
1     3571
Name: count, dtype: int64

In [120]:
X = torch.from_numpy(data.drop(columns = ['left']).values).type(torch.FloatTensor)
Y = torch.from_numpy(data.left.values.reshape(-1, 1)).type(torch.FloatTensor)

In [121]:
X.shape

torch.Size([14999, 20])

In [122]:
Y.shape

torch.Size([14999, 1])

## Use child class to create Pytorch model

In [123]:
from torch import nn

In [124]:
class HRModel(nn.Module):
    def __init__(self):
        #call the parent class
        super().__init__()
        ## define needed attributed in the neuron network
        self.lin1 = nn.Linear(20, 64)
        self.lin2 = nn.Linear(64, 64)
        self.lin3 = nn.Linear(64, 1)
        self.activate = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        #define the forward propagation
        x = self.lin1(input)
        x = self.lin2(x)
        x = self.lin3(x)
        x = self.activate(x)
        x = self.sigmoid(x)
        return x

In [125]:
lr = 0.001

In [126]:
#define the optimizing process
def get_model():
    model = HRModel()
    return model, torch.optim.Adam(model.parameters(), lr = lr)

In [127]:
#define the loss function
loss_fn = nn.BCELoss()

In [128]:
model, opt = get_model()

In [129]:
batch_size =  64
steps = len(data)//batch_size
epochs = 100

In [130]:
steps

234

In [131]:
#train
for epoch in range(epochs):
    for batch in range(steps):
        start = batch*batch_size
        end = start+batch_size
        x = X[start:end]
        y = Y[start:end]
        y_pred = model(x)
        loss = loss_fn(y_pred,y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    print(f'Epoch: {epoch}, loss: {loss}.')
        

Epoch: 0, loss: 0.6931471824645996.
Epoch: 1, loss: 0.6931471824645996.
Epoch: 2, loss: 0.6931471824645996.
Epoch: 3, loss: 0.6931471824645996.
Epoch: 4, loss: 0.6931471824645996.
Epoch: 5, loss: 0.6931471824645996.
Epoch: 6, loss: 0.6931471824645996.
Epoch: 7, loss: 0.6931471824645996.
Epoch: 8, loss: 0.6931471824645996.
Epoch: 9, loss: 0.6931471824645996.
Epoch: 10, loss: 0.6931471824645996.
Epoch: 11, loss: 0.6931471824645996.
Epoch: 12, loss: 0.6931471824645996.
Epoch: 13, loss: 0.6931471824645996.
Epoch: 14, loss: 0.6931471824645996.
Epoch: 15, loss: 0.6931471824645996.
Epoch: 16, loss: 0.6931471824645996.
Epoch: 17, loss: 0.6931471824645996.
Epoch: 18, loss: 0.6931471824645996.
Epoch: 19, loss: 0.6931471824645996.
Epoch: 20, loss: 0.6931471824645996.
Epoch: 21, loss: 0.6931471824645996.
Epoch: 22, loss: 0.6931471824645996.
Epoch: 23, loss: 0.6931471824645996.
Epoch: 24, loss: 0.6931471824645996.
Epoch: 25, loss: 0.6931471824645996.
Epoch: 26, loss: 0.6931471824645996.
Epoch: 27, 

In [132]:
#accuracy
((model(X).data.numpy()>0.5) ==Y.numpy()).mean()

np.float64(0.7619174611640777)

## Use Dataset to restructure data

### pytorch has a Dataset class, it can convert any object with `__len__` and `__getitem__`
#### `__len__` means `len()`, e.g. `len(a)`
#### `__getitem__` means slicing with `[]`, e.g. `a[i:]`
#### Dataset will automatically output features and targets together.

In [133]:
from torch.utils.data import TensorDataset

In [134]:
X.shape

torch.Size([14999, 20])

In [135]:
Y.shape

torch.Size([14999, 1])

In [136]:
HRdataset = TensorDataset(X, Y)

In [137]:
len(HRdataset)

14999

In [71]:
model, opt = get_model()

In [138]:
#train with dataset
for epoch in range(epochs):
    for i in range(steps):
        x, y = HRdataset[i*batch_size: (i+1)*batch_size]
        y_pred = model(x)
        loss = loss_fn(y_pred,y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    print(f'Epoch: {epoch}, loss: {loss}.')

Epoch: 0, loss: 0.6931471824645996.
Epoch: 1, loss: 0.6931471824645996.
Epoch: 2, loss: 0.6931471824645996.
Epoch: 3, loss: 0.6931471824645996.
Epoch: 4, loss: 0.6931471824645996.
Epoch: 5, loss: 0.6931471824645996.
Epoch: 6, loss: 0.6931471824645996.
Epoch: 7, loss: 0.6931471824645996.
Epoch: 8, loss: 0.6931471824645996.
Epoch: 9, loss: 0.6931471824645996.
Epoch: 10, loss: 0.6931471824645996.
Epoch: 11, loss: 0.6931471824645996.
Epoch: 12, loss: 0.6931471824645996.
Epoch: 13, loss: 0.6931471824645996.
Epoch: 14, loss: 0.6931471824645996.
Epoch: 15, loss: 0.6931471824645996.
Epoch: 16, loss: 0.6931471824645996.
Epoch: 17, loss: 0.6931471824645996.
Epoch: 18, loss: 0.6931471824645996.
Epoch: 19, loss: 0.6931471824645996.
Epoch: 20, loss: 0.6931471824645996.
Epoch: 21, loss: 0.6931471824645996.
Epoch: 22, loss: 0.6931471824645996.
Epoch: 23, loss: 0.6931471824645996.
Epoch: 24, loss: 0.6931471824645996.
Epoch: 25, loss: 0.6931471824645996.
Epoch: 26, loss: 0.6931471824645996.
Epoch: 27, 

## User DataLoader to restructure data

#### DataLoader will automatically slice the data and ouput the data in batches.

In [139]:
from torch.utils.data import DataLoader

In [141]:
HR_ds = TensorDataset(X, Y)
HR_dl = DataLoader(HR_ds, batch_size = batch_size)

In [142]:
for x, y in HR_dl:
    print(x,y)

tensor([[0.3800, 0.5300, 2.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.8000, 0.8600, 5.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.1100, 0.8800, 7.0000,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.1100, 0.9300, 7.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.1000, 0.9500, 6.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.3600, 0.5600, 2.0000,  ..., 0.0000, 0.0000, 1.0000]]) tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],


In [143]:
model, opt = get_model()

In [144]:
for epoch in range(epochs):
    for x, y in HR_dl: #loop by batch
        y_pred = model(x)
        loss = loss_fn(y_pred,y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    print(f'Epoch: {epoch}, loss: {loss}.')

Epoch: 0, loss: 0.6931472420692444.
Epoch: 1, loss: 0.6931472420692444.
Epoch: 2, loss: 0.6931472420692444.
Epoch: 3, loss: 0.6931472420692444.
Epoch: 4, loss: 0.6931472420692444.
Epoch: 5, loss: 0.6931472420692444.
Epoch: 6, loss: 0.6931472420692444.
Epoch: 7, loss: 0.6931472420692444.
Epoch: 8, loss: 0.6931472420692444.
Epoch: 9, loss: 0.6931472420692444.
Epoch: 10, loss: 0.6931472420692444.
Epoch: 11, loss: 0.6931472420692444.
Epoch: 12, loss: 0.6931472420692444.
Epoch: 13, loss: 0.6931472420692444.
Epoch: 14, loss: 0.6931472420692444.
Epoch: 15, loss: 0.6931472420692444.
Epoch: 16, loss: 0.6931472420692444.
Epoch: 17, loss: 0.6931472420692444.
Epoch: 18, loss: 0.6931472420692444.
Epoch: 19, loss: 0.6931472420692444.
Epoch: 20, loss: 0.6931472420692444.
Epoch: 21, loss: 0.6931472420692444.
Epoch: 22, loss: 0.6931472420692444.
Epoch: 23, loss: 0.6931472420692444.
Epoch: 24, loss: 0.6931472420692444.
Epoch: 25, loss: 0.6931472420692444.
Epoch: 26, loss: 0.6931472420692444.
Epoch: 27, 

## Add testing data

#### Split the data to training data and test data

In [145]:
from sklearn.model_selection import train_test_split

In [149]:
#split the data, by default test data ratio is 0.25
train_x, test_x, train_y, test_y = train_test_split(data.drop(columns = ['left']).values, data.left.values.reshape(-1, 1), random_state=42)

In [150]:
train_x.shape

(11249, 20)

In [151]:
train_y

array([[1],
       [1],
       [0],
       ...,
       [0],
       [1],
       [0]], shape=(11249, 1))

In [152]:
test_x

array([[0.65, 0.96, 5.  , ..., 0.  , 0.  , 1.  ],
       [0.88, 0.8 , 3.  , ..., 0.  , 1.  , 0.  ],
       [0.69, 0.98, 3.  , ..., 0.  , 1.  , 0.  ],
       ...,
       [0.67, 0.59, 3.  , ..., 0.  , 1.  , 0.  ],
       [0.22, 0.57, 5.  , ..., 1.  , 0.  , 0.  ],
       [0.36, 0.73, 2.  , ..., 0.  , 1.  , 0.  ]], shape=(3750, 20))

In [153]:
#convert the data to tensor
train_x = torch.from_numpy(train_x).type(torch.FloatTensor)
test_x = torch.from_numpy(test_x).type(torch.FloatTensor)
train_y = torch.from_numpy(train_y).type(torch.FloatTensor)
test_y = torch.from_numpy(test_y).type(torch.FloatTensor)

In [155]:
#convert the data to dataset and dataloader
train_dl = DataLoader(TensorDataset(train_x, train_y), batch_size = batch_size, shuffle = True)

#for testing, the batch size can be bigger
test_dl = DataLoader(TensorDataset(test_x, test_y), batch_size = batch_size*2, shuffle = True)

In [158]:
def accuracy(out, yb):
    return ((out.data.numpy()>0.5) ==yb.numpy()).mean()

#### pytorch has training mode - `model.train()`, and testing mode, `model.eval()`
#### training mode and testing mode will have difference treatment for some layers, e.g. dropout, bn

In [179]:
model, opt = get_model()
for epoch in range(1001):
    #training
    model.train()
    for xb, yb in train_dl:
        y_pred = model(xb)
        loss = loss_fn(y_pred, yb)
        loss.backward()
        opt.zero_grad()
        opt.step()
    #test the result every 100 epochs    
    if epoch%100 == 0:
        model.eval()
        with torch.no_grad():
            valid_loss = sum([loss_fn(model(x), y) for x, y in test_dl])
            acc_mean = np.mean([accuracy(model(x), y) for x, y in test_dl])
        print(f'Epoch: {epoch}, loss: {valid_loss/len(test_dl)}, accuracy: {acc_mean}')
        

## Encapsulation

In [173]:
#calculate loss per batch
def loss_batch (model, loss_func, xb, yb, opt=None):
    loss = loss_func(model(xb), yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
    return loss.item(), len(xb)
        

In [219]:
def fit (epochs, model, loss_fn, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_fn, xb, yb, opt)

        model.eval()
        with torch.no_grad():
            #* is super useful here, it makes zip() valid in this context and returns 2 tuples. Without it, zip() is wrong here.
            losses, nums = zip(*[loss_batch(model, loss_fn, xb, yb) for xb, yb in valid_dl])

        val_loss = np.sum(np.multiply(losses, nums))/np.sum(nums)
        acc_mean = np.mean([accuracy(model(x), y) for x, y in valid_dl])
        print(epoch, val_loss, acc_mean)

In [220]:
def get_data(train_ds, test_ds, batch_size):
    return (DataLoader(train_ds, batch_size = batch_size, shuffle=True), DataLoader(test_ds, batch_size = batch_size*2, shuffle=True))

In [221]:
len(train_dl)

176

In [222]:
len(test_dl)

30

In [223]:
loss_fn

BCELoss()

In [225]:
#rewrite the whole process with ecapsulated functions
train_dl, test_dl = get_data(TensorDataset(train_x, train_y), TensorDataset(test_x, test_y), batch_size)
model, opt = get_model()
fit(100, model, loss_fn, opt, train_dl, test_dl)

0 0.6931471824645996 0.7590049342105264
1 0.6931471824645996 0.7614720394736842
2 0.6931471824645996 0.7602384868421053
3 0.6931471824645996 0.7602384868421053
4 0.6931471824645996 0.7627055921052631
5 0.6931471824645996 0.7614720394736842
6 0.6931471824645996 0.7577713815789474
7 0.6931471824645996 0.7633223684210526
8 0.6931471824645996 0.7596217105263158
9 0.6931471824645996 0.7633223684210526
10 0.6931471824645996 0.7602384868421053
11 0.6931471824645996 0.7614720394736842
12 0.6931471824645996 0.7583881578947369
13 0.6931471824645996 0.7627055921052631
14 0.6931471824645996 0.7608552631578948
15 0.6931471824645996 0.7614720394736842
16 0.6931471824645996 0.7590049342105264
17 0.6931471824645996 0.7596217105263158
18 0.6931471824645996 0.7645559210526316
19 0.6931471824645996 0.7620888157894736
20 0.6931471824645996 0.7596217105263158
21 0.6931471824645996 0.7596217105263158
22 0.6931471824645996 0.7596217105263158
23 0.6931471824645996 0.7633223684210526
24 0.6931471824645996 0.76