In [282]:
# 필용한 라이브러리를 넣는 과정
import pandas as pd
import numpy as np
import torch
import torch.optim as optim

In [283]:
# GPU 사용 정의 + 결과값을 동결하기 위해 seed를 고정해주는 과정
device = 'cuda' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(777)

if device == 'cuda':
  torch.cuda.manual_seed_all(777)

In [284]:
# 학습과 테스트에 이용할 데이터셋을 불러오는 과정
data = pd.read_csv('train.csv')
data2 = pd.read_csv('test.csv')
submit = pd.read_csv('submit_sample.csv')

In [285]:
data

Unnamed: 0.1,index,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,1869,7010-BRBUU,Male,0,Yes,Yes,72,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),24.10,1734.65,No
1,1,4528,9688-YGXVR,Female,0,No,No,44,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,No,Month-to-month,Yes,Credit card (automatic),88.15,3973.2,No
2,2,6344,9286-DOJGF,Female,1,Yes,No,38,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Bank transfer (automatic),74.95,2869.85,Yes
3,3,6739,6994-KERXL,Male,0,No,No,4,Yes,No,DSL,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,55.90,238.5,No
4,4,432,2181-UAESM,Male,0,No,No,2,Yes,No,DSL,Yes,No,Yes,No,No,No,Month-to-month,No,Electronic check,53.45,119.5,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4783,5981,3772,0684-AOSIH,Male,0,Yes,No,1,Yes,No,Fiber optic,Yes,No,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,95.00,95,Yes
4784,5982,5191,5982-PSMKW,Female,0,Yes,Yes,23,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),91.10,2198.3,No
4785,5983,5226,8044-BGWPI,Male,0,Yes,Yes,12,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Electronic check,21.15,306.05,No
4786,5984,5390,7450-NWRTR,Male,1,No,No,12,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.45,1200.15,Yes


In [286]:
# model을 만드는데 필요 없는 index와 ID, 그리고 알 수 없는 Unnammed: 0, 학습데이터 Y 값에 해당하는 Churn은 빼는 과정 (drop을 사용합니다.)
data_pcs = data.drop(['index','Unnamed: 0','customerID','Churn'],axis=1)
data2_pcs = data2.drop(['index','Unnamed: 0','customerID'],axis=1)

In [287]:
data_pcs.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [288]:
data_pcs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4788 entries, 0 to 4787
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            4788 non-null   object 
 1   SeniorCitizen     4788 non-null   int64  
 2   Partner           4788 non-null   object 
 3   Dependents        4788 non-null   object 
 4   tenure            4788 non-null   int64  
 5   PhoneService      4788 non-null   object 
 6   MultipleLines     4788 non-null   object 
 7   InternetService   4788 non-null   object 
 8   OnlineSecurity    4788 non-null   object 
 9   OnlineBackup      4788 non-null   object 
 10  DeviceProtection  4788 non-null   object 
 11  TechSupport       4788 non-null   object 
 12  StreamingTV       4788 non-null   object 
 13  StreamingMovies   4788 non-null   object 
 14  Contract          4788 non-null   object 
 15  PaperlessBilling  4788 non-null   object 
 16  PaymentMethod     4788 non-null   object 


In [289]:
# 데이터셋의 object Dtype을 str로 바꾸는 과정
list = ['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'TotalCharges']

for c in list:
  data_pcs[c] = data_pcs[c].astype(str)
  data2_pcs[c] = data2_pcs[c].astype(str)

In [290]:
# str의 형태로 바뀐 data 값들을 data preprocessing을 통해 DNN model에 이용할 수 있는 int 형으로 변형해 주는 과정 (scikit-learn의 label encoder를 사용합니다.)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

for c in list:
  for_fit = pd.concat([data_pcs[c],data2_pcs[c]],axis=0)
  le.fit(for_fit)
  data_pcs[c] = le.transform(data_pcs[c])
  data2_pcs[c] = le.transform(data2_pcs[c])

In [291]:
# int형으로 변환된 데이터셋을 확인할 수 있습니다.
data_pcs

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,1,0,1,1,72,1,2,2,1,1,1,1,1,1,2,0,1,24.10,1066
1,0,0,0,0,44,1,0,1,0,2,2,0,2,0,0,1,1,88.15,2901
2,0,1,1,0,38,1,2,1,0,0,0,0,0,0,0,1,0,74.95,2126
3,1,0,0,0,4,1,0,0,0,0,0,0,0,2,0,1,2,55.90,1733
4,1,0,0,0,2,1,0,0,2,0,2,0,0,0,0,0,2,53.45,308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4783,1,0,1,0,1,1,0,1,2,0,0,0,2,2,0,1,2,95.00,5543
4784,0,0,1,1,23,1,2,0,2,2,2,2,2,2,2,1,1,91.10,1561
4785,1,0,1,1,12,1,0,2,1,1,1,1,1,1,0,1,2,21.15,2290
4786,1,1,0,0,12,1,2,1,0,0,2,0,2,2,0,1,2,99.45,322


In [292]:
# 데이터를 StandarScaler를 사용해 스케일링하는 과정
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

data_sc = sc.fit_transform(data_pcs)
data2_sc = sc.fit_transform(data2_pcs)

In [293]:
# 데이터 학습에 사용될 Y값 Churn을 사용에 알맞게 설정해주는 과정
# (No와 Yes(str형태)로 구성된 것을 label encoding으로 int형으로 바꾼다.)
train_Y = data.Churn

le.fit(train_Y)
train_Y = le.transform(train_Y)

In [294]:
# 데이터셋을 Dataframe -> tensor의 형태로 변환(정확히는 Dataframe -> array(np.array 사용을 확인!) -> tensor)
train_X = torch.FloatTensor(np.array(data_pcs))
train_Y = torch.FloatTensor(np.array(train_Y))
train_Y = torch.reshape(train_Y,[4788,1])

test_X = torch.FloatTensor(np.array(data2_pcs))

In [295]:
# model의 layer를 정의하는 과정
# layer의 크기가 클수로, 갯수가 많을수록 학습률이 좋지만 과적합의 위험이 있음을 유의!
linear1 = torch.nn.Linear(19,128,bias=True)
linear2 = torch.nn.Linear(128,128,bias=True)
linear3 = torch.nn.Linear(128,128,bias=True)
linear4 = torch.nn.Linear(128,1,bias=True)    

relu = torch.nn.ReLU()                        # 값이 0보다 작으면 0을, 0보다 크면 그대로 값을 출력시켜주는 ReLU 함수 설정
dropout = torch.nn.Dropout(p=0.3)             # 과적합을 예방하기 위한 dropout layer를 정의 지금의 경우 30% dropout
sigmoid = torch.nn.Sigmoid()                  # boolean 문제 이므로 sigmoid layer를 마지막에 추가하기 위한 정의

In [296]:
# note) classification의 경우 class 크기 값을, binary classification 또는 회귀 문제의 경우 1을 넣는다.
# -> 아니다. 2로 넣어도 가능은 하다(CrossEntropyLoss를 사용하면 됨.) 다만 이번의 경우 BCELoss를 사용했으므로 1을 넣은 것!

In [297]:
# layer의 가중치 초기화 과정(xavier를 사용했습니다. orthogonal등의 다른 방법 또한 존재합니다.)
torch.nn.init.xavier_normal_(linear1.weight)
torch.nn.init.xavier_normal_(linear2.weight)
torch.nn.init.xavier_normal_(linear3.weight)
torch.nn.init.xavier_normal_(linear4.weight)

Parameter containing:
tensor([[ 0.0677, -0.0576, -0.1977,  0.1106,  0.2761,  0.1832,  0.0024, -0.0640,
          0.0764,  0.0768, -0.0032, -0.0664, -0.2206,  0.0177, -0.1029, -0.1500,
         -0.1600,  0.0451,  0.1032,  0.1133, -0.1122, -0.0022, -0.0438,  0.1310,
         -0.0796,  0.1102, -0.0759,  0.1093, -0.0033, -0.0248, -0.0121,  0.0348,
         -0.2799,  0.1464, -0.0267, -0.1246,  0.1686,  0.0418, -0.0075, -0.0533,
         -0.1630, -0.0745,  0.0571, -0.1260,  0.2924, -0.0730,  0.0390, -0.0826,
          0.0634, -0.0426,  0.0773, -0.0865, -0.0118, -0.0819, -0.0866,  0.2011,
         -0.0820, -0.0627,  0.2377,  0.1298, -0.1929, -0.0378,  0.0339, -0.0596,
         -0.0315, -0.2007,  0.0871, -0.1169,  0.3504, -0.0810, -0.0861, -0.1223,
         -0.0197,  0.2249,  0.0633,  0.0058,  0.0278, -0.0337, -0.1476,  0.1142,
          0.0756, -0.0367,  0.0727, -0.1756, -0.0949,  0.0942,  0.0965, -0.2226,
          0.0183,  0.0971,  0.1292,  0.0635, -0.0355, -0.0263,  0.1629, -0.0671,
      

In [298]:
# 모델의 레이어를 쌓는 과정
model = torch.nn.Sequential(linear1,relu,dropout,
                            linear2,relu,dropout,
                            linear3,relu,dropout,
                            linear4,sigmoid).to(device)

In [299]:
# cost를 찾기위한 optimizer를 설정(성능이 가장 좋은 Adam을 사용했습니다.)
optimizer = optim.Adam(model.parameters(), lr=0.001)   # lr은 learning rate
loss_fn = torch.nn.BCELoss().to(device)

In [300]:
# GPU를 사용할 경우 모든 데이터가 같은 device에 있어야 하므로 cpu -> gpu로 옮기는 과정
train_X=train_X.to(device)
train_Y=train_Y.to(device)

# 모델 학습을 진행하는 과정
for epoch in range(101):
    optimizer.zero_grad()
    output = model(train_X)
    loss = loss_fn(output, train_Y)
    loss.backward()
    optimizer.step()
    
    # epoch "10" 마다 cost와 model의 정확성을 확인하기 위해 출력
    if epoch%10==0:
        c = output>=torch.FloatTensor([0.5]).to(device)
        c = c.float()
        accuracy = sum(c==train_Y) / len(train_Y)
        print("Epoch: {:4d} Cost: {:.7f} Accuracy: {:.7f}".format(epoch, loss, accuracy.item()))

Epoch:    0 Cost: 41.0741348 Accuracy: 0.5029240
Epoch:   10 Cost: 25.2728767 Accuracy: 0.7343358
Epoch:   20 Cost: 25.1620731 Accuracy: 0.7324561
Epoch:   30 Cost: 24.7119179 Accuracy: 0.7276525
Epoch:   40 Cost: 24.4248276 Accuracy: 0.7107351
Epoch:   50 Cost: 23.0604515 Accuracy: 0.7220134
Epoch:   60 Cost: 22.7020359 Accuracy: 0.7172097
Epoch:   70 Cost: 20.8614960 Accuracy: 0.7098997
Epoch:   80 Cost: 19.3223400 Accuracy: 0.6948621
Epoch:   90 Cost: 16.3047142 Accuracy: 0.6441103
Epoch:  100 Cost: 10.9618444 Accuracy: 0.6102757


In [301]:
# 테스트 데이터를 모델에 넣어 결과값을 계산
test_X = test_X.to(device)
rslt = model(test_X)

# Sigmoid를 통해 0~1값으로 나타난 결과값을 0과 1로 변환하는 과정
prdt = rslt>=torch.FloatTensor([0.5]).to(device)
prdt = prdt.int()

In [302]:
# 테스트 결과값을 csv파일로 내보내는 과정
output = prdt.cpu().detach().numpy()
submit['Churn']=output
submit.to_csv('drop_estimation.csv',index=False)

In [303]:
# 결과 확인
result_chk=pd.read_csv('drop_estimation.csv')
result_chk

Unnamed: 0,index,Churn
0,5,0
1,10,0
2,13,0
3,18,0
4,20,0
...,...,...
1193,5962,1
1194,5968,0
1195,5975,1
1196,5977,0
