<a href="https://colab.research.google.com/github/kobosung4756/what-I-learned-today/blob/main/turnover_DNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
# 필용한 라이브러리를 넣는 과정
import numpy as np
import pandas as pd
import torch
import torch.optim as optim

In [68]:
# 결과값을 동결하기 위해 seed를 고정해주는 과정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(777)
if device == 'cuda':
  torch.cuda.manual_seed_all(777)

In [69]:
# 학습과 테스트에 이용할 데이터셋을 불러오는 과정
data = pd.read_csv('train.csv')
data2 = pd.read_csv('test.csv')
submit = pd.read_csv('submit_sample.csv')

In [70]:
data.head()

Unnamed: 0,index,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
3,4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
4,5,21651,city_176,0.764,,Has relevent experience,Part time course,Graduate,STEM,11,,,1,24,1.0


In [71]:
# model을 만드는데 필요 없는 id와 index, 학습데이터 Y 값에 해당하는 target은 빼는 과정 (drop을 사용합니다.)
data_pcs = data.drop(['index','enrollee_id','target'],axis=1)
data2_pcs = data2.drop(['index','enrollee_id'],axis=1)

In [72]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   15326 non-null  int64  
 1   enrollee_id             15326 non-null  int64  
 2   city                    15326 non-null  object 
 3   city_development_index  15326 non-null  float64
 4   gender                  11752 non-null  object 
 5   relevent_experience     15326 non-null  object 
 6   enrolled_university     15009 non-null  object 
 7   education_level         14964 non-null  object 
 8   major_discipline        13068 non-null  object 
 9   experience              15276 non-null  object 
 10  company_size            10612 non-null  object 
 11  company_type            10445 non-null  object 
 12  last_new_job            14983 non-null  object 
 13  training_hours          15326 non-null  int64  
 14  target                  15326 non-null

In [73]:
data.columns

Index(['index', 'enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')

In [74]:
# object인 Dtype을 str로 바꾸는 과정
list=['city', 'gender','relevent_experience', 'enrolled_university', 'education_level','major_discipline', 'experience', 'company_size', 'company_type','last_new_job']

for c in list:
  data_pcs[c] = data_pcs[c].astype(str)
  data2_pcs[c] = data_pcs[c].astype(str)

In [75]:
# str의 형태로 바뀐 data 값들을 data preprocessing을 통해 DNN model에 이용할 수 있는 int 형으로 변형해 주는 과정 (scikit-learn의 label encoder를 사용합니다.)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

for c in list:
  for_fit = pd.concat([data_pcs[c],data2_pcs[c]],axis=0)
  le.fit(for_fit)
  data_pcs[c] = le.transform(data_pcs[c])
  data2_pcs[c] = le.transform(data2_pcs[c])

In [76]:
# int형으로 변환된 데이터셋을 확인할 수 있습니다.
data_pcs.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,5,0.92,1,0,3,0,5,21,8,6,0,36
1,76,0.776,1,1,3,0,5,6,4,5,4,47
2,14,0.789,3,1,2,0,1,20,8,5,6,52
3,49,0.767,1,0,3,2,5,21,4,1,3,8
4,56,0.764,3,0,1,0,5,2,8,6,0,24


In [77]:
# 데이터셋을 Dataframe -> array -> tensor의 형태로 변환
train_X = torch.FloatTensor(np.array(data_pcs))
train_Y = torch.FloatTensor(np.array(data.target))
train_Y = torch.reshape(train_Y,[15326,1])

test_X = torch.FloatTensor(np.array(data2_pcs))

In [78]:
# model의 layer를 정의하는 과정
# layer의 크기가 클수로, 갯수가 많을수록 학습률이 좋지만 과적합의 위험이 있음을 유의!
linear1 = torch.nn.Linear(12, 512, bias=True)
linear2 = torch.nn.Linear(512, 512, bias=True)
linear3 = torch.nn.Linear(512, 512, bias=True)
linear4 = torch.nn.Linear(512, 512, bias=True)    
linear5 = torch.nn.Linear(512, 1, bias=True)      # note) classification의 경우 class 크기 값을, binary classification 또는 회귀 문제의 경우 1을 넣는다.

relu = torch.nn.ReLU()
dropout = torch.nn.Dropout(p=0.3)                 # 과적합을 예방하기 위한 dropout layer를 정의 지금의 경우 30% dropout
sigmoid = torch.nn.Sigmoid()                      # boolean 문제 이므로 sigmoid layer를 마지막에 추가하기 위한 정의

In [79]:
# layer의 가중치 초기화 과정(xavier를 사용했습니다. orthogonal등의 다른 방법 또한 존재합니다.)
torch.nn.init.xavier_normal_(linear1.weight)
torch.nn.init.xavier_normal_(linear2.weight)
torch.nn.init.xavier_normal_(linear3.weight)
torch.nn.init.xavier_normal_(linear4.weight)
torch.nn.init.xavier_normal_(linear5.weight)

Parameter containing:
tensor([[ 3.8477e-02,  1.0691e-02, -2.9417e-02,  1.7227e-02,  1.0635e-01,
         -1.9522e-02, -2.4190e-01, -1.3847e-03, -5.0912e-02,  1.2866e-03,
          8.8841e-02, -6.1709e-02,  9.9830e-02, -6.6989e-02, -1.6494e-01,
         -5.2227e-02, -2.1583e-03,  2.7203e-02,  1.2188e-01, -6.3811e-02,
          7.2929e-02, -1.1081e-01, -7.3619e-02,  5.5741e-02, -5.2091e-02,
          3.3675e-02,  7.8344e-02,  4.2376e-02,  8.4417e-02, -5.8366e-03,
          6.9128e-02, -4.6243e-02,  8.6872e-02, -7.5962e-02, -3.2799e-02,
          5.1904e-02,  2.2648e-02,  7.8239e-03,  3.2014e-02,  8.8581e-03,
         -5.2649e-02, -2.1145e-02, -1.2804e-01, -3.3371e-03, -1.2997e-01,
          7.9732e-02,  8.0508e-02,  9.3124e-02, -1.2952e-01, -1.0846e-01,
         -7.8169e-02, -3.8226e-03,  3.4837e-02,  1.5538e-02, -9.9161e-02,
          1.3964e-02,  1.2077e-03,  1.3385e-02,  2.5345e-02,  7.3966e-02,
         -2.9488e-03, -9.9205e-02,  1.0622e-02,  8.4400e-02, -6.8867e-02,
         -4.2465

In [80]:
model = torch.nn.Sequential(linear1,relu,dropout,
                          linear2,relu,dropout,
                          linear3,relu,dropout,
                          linear4,relu,dropout,
                          linear5,sigmoid).to(device)

In [81]:
# cost를 찾기위한 optimizer를 설정(성능이 가장 좋은 Adam을 사용했습니다.)
## note) Binary classification의 경우 BCELoss()를, classification은 CrossEntropyLoss(), 회귀문제는 MSELoss()를 사용한다!
optimizer = optim.Adam(model.parameters(), lr=0.001)   # lr은 learning rate
loss_fn = torch.nn.BCELoss().to(device)

In [82]:
# GPU를 사용할 경우 모든 데이터가 같은 device에 있어야 하므로 cpu -> gpu로 옮기는 과정
train_X=train_X.to(device)
train_Y=train_Y.to(device)

# 모델 학습을 진행하는 과정
for epoch in range(501):
    optimizer.zero_grad()
    output = model(train_X)
    loss = loss_fn(output, train_Y)
    loss.backward()
    optimizer.step()

    # epoch "10" 마다 cost와 model의 정확성을 확인하기 위해 출력
    if epoch%10==0:
        c = output>=torch.FloatTensor([0.5]).to(device)
        c = c.float()
        accuracy = sum(c==train_Y) / len(train_Y)
        print("Epoch: {:4d} Cost: {:.7f} Accuracy: {:.7f}".format(epoch, loss, accuracy.item()))

Epoch:    0 Cost: 3.3835049 Accuracy: 0.3994519
Epoch:   10 Cost: 0.6917990 Accuracy: 0.6703641
Epoch:   20 Cost: 0.6213619 Accuracy: 0.7145374
Epoch:   30 Cost: 0.6021419 Accuracy: 0.7498369
Epoch:   40 Cost: 0.5818267 Accuracy: 0.7505546
Epoch:   50 Cost: 0.5752196 Accuracy: 0.7504241
Epoch:   60 Cost: 0.5701115 Accuracy: 0.7505546
Epoch:   70 Cost: 0.5598721 Accuracy: 0.7505546
Epoch:   80 Cost: 0.5540182 Accuracy: 0.7504894
Epoch:   90 Cost: 0.5467923 Accuracy: 0.7502936
Epoch:  100 Cost: 0.5448428 Accuracy: 0.7503589
Epoch:  110 Cost: 0.5374724 Accuracy: 0.7506199
Epoch:  120 Cost: 0.5351830 Accuracy: 0.7503589
Epoch:  130 Cost: 0.5354242 Accuracy: 0.7513376
Epoch:  140 Cost: 0.5311994 Accuracy: 0.7506199
Epoch:  150 Cost: 0.5298507 Accuracy: 0.7499674
Epoch:  160 Cost: 0.5266197 Accuracy: 0.7514681
Epoch:  170 Cost: 0.5238957 Accuracy: 0.7514681
Epoch:  180 Cost: 0.5235947 Accuracy: 0.7522511
Epoch:  190 Cost: 0.5201975 Accuracy: 0.7540128
Epoch:  200 Cost: 0.5194960 Accuracy: 0.

In [83]:
# 테스트 데이터를 모델에 넣어 결과값을 계산
test_X = test_X.to(device)
rslt = model(test_X)

# Sigmoid를 통해 0~1값으로 나타난 결과값을 0과 1로 변환하는 과정
prdt = rslt>=torch.FloatTensor([0.5]).to(device)
prdt = prdt.float()

In [84]:
# 테스트 결과값을 csv파일로 내보내는 과정
prdt=prdt.cpu().detach().numpy()                    # gpu에 있는 tensor형태 -> cpu에 array형태로 변경
submit['target']=prdt
submit.to_csv('turnover_result.csv',index=False)

In [85]:
# 결과값 확인
rslt_csv=pd.read_csv('turnover_result.csv')
rslt_csv

Unnamed: 0,index,target
0,2,1.0
1,9,0.0
2,10,1.0
3,11,0.0
4,15,1.0
...,...,...
3827,19129,0.0
3828,19132,0.0
3829,19135,0.0
3830,19149,0.0
