### 금융상품 갱신 여부 예측하는 ANN

Churn_Modelling.csv 파일에는 고객 정보와, 해당 고객이 금융상품을 갱신하지 않고 이탈했는지 여부를 나타내는 `Exited` 컬럼(1 = 이탈, 0 = 유지)이 들어 있다.

이 데이터를 가지고 갱신여부를 예측하는 딥러닝을 구성하시오.

# Neural Networks and Deep Learning

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


In [2]:
# Mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Set the working directory.
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/data')

In [5]:
# Load Churn_Modelling.csv into a DataFrame.
df = pd.read_csv('Churn_Modelling.csv')

In [6]:
# Check how many rows and columns were loaded.
df.shape

(10000, 14)

In [7]:
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [8]:
df.isna().sum()

Unnamed: 0,0
RowNumber,0
CustomerId,0
Surname,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0


In [None]:
df.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [9]:
# Target vector: the 'Exited' column.
y = df.loc[:, 'Exited']

In [10]:
# Feature matrix: every column from 'CreditScore' through 'EstimatedSalary'
# (a label slice with .loc includes both end columns).
X = df.loc[:, slice('CreditScore', 'EstimatedSalary')]

In [11]:
y.head(3)

Unnamed: 0,Exited
0,1
1,0
2,1


In [12]:
X.head(3)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57


In [13]:
# Inspect the distinct values of the string-typed Geography column.
sorted(df['Geography'].drop_duplicates())

['France', 'Germany', 'Spain']

In [14]:
# Inspect the distinct values of the string-typed Gender column.
sorted(df['Gender'].drop_duplicates())

['Female', 'Male']

In [5]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler

In [17]:
df['IsActiveMember'].unique()

array([1, 0])

In [18]:
df['HasCrCard'].unique()

array([1, 0])

In [4]:
from sklearn.compose import ColumnTransformer

In [20]:
# Column-wise preprocessing, addressed by column name rather than position so
# the mapping does not silently shift if the column order of X changes:
#   - 'label'  : ordinal-encode Gender
#   - 'onehot' : one-hot encode Geography
#   - 'scaler' : min-max scale the numeric columns
# Columns not listed (HasCrCard, IsActiveMember) pass through unchanged.
ct = ColumnTransformer(
    [
        ('label', OrdinalEncoder(), ['Gender']),
        ('onehot', OneHotEncoder(), ['Geography']),
        ('scaler', MinMaxScaler(),
         ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']),
    ],
    remainder='passthrough',
)

In [21]:
ct

In [22]:
# Fit the encoders/scaler on X and transform it in one step.
# NOTE(review): the fit uses every row of X; if X_scaled is split into
# train/test afterwards, test rows have already influenced the scaler ranges.
X_scaled = ct.fit_transform(X)

In [23]:
X_scaled[0, ]

array([0.        , 1.        , 0.        , 0.        , 0.538     ,
       0.32432432, 0.2       , 0.        , 0.        , 0.50673489,
       1.        , 1.        ])

In [24]:
y.head(3)

Unnamed: 0,Exited
0,1
1,0
2,1


In [6]:
from sklearn.model_selection import train_test_split

In [26]:
# Split the raw rows first, then fit the preprocessing on the training part
# only, so encoder categories and MinMaxScaler ranges are never learned from
# the held-out test rows. The test rows are transformed with the train-fitted
# transformer.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = ct.fit_transform(X_train_raw)
X_test = ct.transform(X_test_raw)

### 딥러닝으로 모델링

In [7]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

In [28]:
# 입력 데이터
X_scaled.shape

(10000, 12)

In [32]:
X_train[ 0 , : ]

array([1.        , 1.        , 0.        , 0.        , 0.672     ,
       0.18918919, 0.6       , 0.        , 0.33333333, 0.89549392,
       1.        , 1.        ])

In [None]:
X_scaled[ 0 ,  ]

array([0.        , 1.        , 0.        , 0.        , 0.538     ,
       0.32432432, 0.2       , 0.        , 0.        , 0.50673489,
       1.        , 1.        ])

In [34]:
# 1. Start from an empty Sequential container.
model = Sequential()

# 2. Input layer: one slot per preprocessed feature column.
#    The width is read from X_train instead of being hard-coded, so the model
#    keeps matching the data if the preprocessing adds or drops columns.
model.add(keras.Input(shape=(X_train.shape[1],)))

# There is no single correct hidden-layer layout: try alternatives and keep
# whichever evaluates best.
# 3. First hidden layer (units = number of neurons/nodes in the layer).
model.add(Dense(units=6, activation='relu'))

# 4. Second hidden layer.
model.add(Dense(units=8, activation='relu'))

# 5. Output layer: a single sigmoid unit for the binary target.
model.add(Dense(units=1, activation='sigmoid'))

In [35]:
# Print a summary of the model built above.
model.summary()

In [37]:
# A Keras model must be compiled before training: this attaches the loss,
# the optimizer and the metric that is reported.
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[keras.metrics.BinaryAccuracy()],
    optimizer=keras.optimizers.Adam(),
)

In [39]:
X_train.shape

(8000, 12)

In [38]:
# Train the model.
model.fit(x=X_train, y=y_train, batch_size=10, epochs=50)

Epoch 1/50
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - binary_accuracy: 0.7921 - loss: 0.5263
Epoch 2/50
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - binary_accuracy: 0.7988 - loss: 0.4603
Epoch 3/50
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - binary_accuracy: 0.7953 - loss: 0.4543
Epoch 4/50
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - binary_accuracy: 0.7998 - loss: 0.4490
Epoch 5/50
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - binary_accuracy: 0.8175 - loss: 0.4274
Epoch 6/50
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - binary_accuracy: 0.8163 - loss: 0.4320
Epoch 7/50
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - binary_accuracy: 0.8076 - loss: 0.4358
Epoch 8/50
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - binary_accuracy: 0.8112

<keras.src.callbacks.history.History at 0x7d84938b6000>

In [40]:
y_pred = model.predict(X_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


In [41]:
y_pred

array([[0.058718  ],
       [0.02674182],
       [0.10810696],
       ...,
       [0.7072825 ],
       [0.08726539],
       [0.2811149 ]], dtype=float32)

In [42]:
y_pred.shape

(2000, 1)

In [43]:
X_test.shape

(2000, 12)

In [47]:
# Turn the predicted probabilities into 0/1 class labels with a 0.5 cutoff.
y_pred = np.greater(y_pred, 0.5).astype(int)

In [48]:
y_pred

array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]])

In [49]:
y_test

Unnamed: 0,Exited
6252,0
4684,0
1731,0
4742,0
4521,0
...,...
6412,1
8285,0
7853,1
1095,1


In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [52]:
# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [53]:
cm

array([[1550,   57],
       [ 211,  182]])

In [55]:
# Accuracy by hand: correct predictions (the diagonal of the confusion matrix)
# divided by all predictions. Computed from cm itself, so the value stays
# right after the model is retrained and the matrix changes.
np.trace(cm) / cm.sum()

np.float64(0.866)

In [56]:
accuracy_score(y_test, y_pred)

0.866

In [58]:
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.46      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000



In [59]:
### 텐서플로우 딥러닝에서는 평가하는 함수가 따로 있다.

In [60]:
model.evaluate(X_test, y_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - binary_accuracy: 0.8696 - loss: 0.3373


[0.338375449180603, 0.8659999966621399]

## 다음 신규 데이터를 통해 분류해 보자

- Geography: France
- Credit Score: 600
- Gender: Male
- Age: 40
- Tenure: 3
- Balance: 60000
- Number of Products: 2
- Has Credit Card: Yes
- Is Active Member: Yes
- Estimated Salary: 50000

In [62]:
X.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58


In [76]:
# One new customer, with keys in the same order as the columns of X.
data = [
    {
        'CreditScore': 600,
        'Geography': 'France',
        'Gender': 'Male',
        'Age': 40,
        'Tenure': 3,
        'Balance': 60000,
        'NumOfProducts': 2,
        'HasCrCard': 1,
        'IsActiveMember': 1,
        'EstimatedSalary': 50000,
    },
]

In [78]:
new_data = pd.DataFrame(data)

In [80]:
new_data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,600,France,Male,40,3,60000,2,1,1,50000


In [82]:
new_data = ct.transform(new_data)

In [83]:
new_data

array([[1.        , 1.        , 0.        , 0.        , 0.5       ,
        0.2972973 , 0.3       , 0.23914092, 0.33333333, 0.24996597,
        1.        , 1.        ]])

In [85]:
y_pred = model.predict(new_data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step


In [89]:
( y_pred > 0.5 ).astype(int)[ 0 , 0 ]

np.int64(0)

In [90]:
# Report the verdict for the single new customer: a probability above 0.5
# means predicted churn, anything else means predicted to stay.
print('이탈할 고객입니다.' if y_pred[0, 0] > 0.5 else '이탈하지 않을 고객입니다.')

이탈하지 않을 고객입니다.


### 용어 정리

epoch

- 한 번의 epoch는 신경망에서 전체 데이터 셋에 대해 forward pass/backward pass 과정을 거친 것을 말함. 즉, 전체 데이터 셋에 대해 한 번 학습을 완료한 상태


batch_size

메모리의 한계와 속도 저하 때문에, 대부분의 경우 한 번의 epoch에서 모든 데이터를 한꺼번에 넣을 수는 없습니다. 그래서 데이터를 여러 묶음으로 나누어 넣는데, 한 epoch 동안 나누어 넣는 횟수를 iteration, 각 iteration마다 넣는 데이터의 개수를 batch size라고 합니다. 예를 들어 위 학습에서는 학습 데이터 8,000건을 batch_size=10으로 나누었으므로 한 epoch당 iteration은 800번입니다.

출처: https://www.slideshare.net/w0ong/ss-82372826

## cardio_train.csv 파일이 있다. 구분자(separator)는 세미콜론(;)이다. 이 데이터를 이용해서 딥러닝 모델링을 하세요.

id (int)
각 레코드의 식별자.

age (int, 일(day) 단위)
출생 후 경과 일수. 보통 년 단위(age_years = age / 365.25)로 변환해 사용.

gender (int: 1/2)
성별: 1 = 여, 2 = 남 (이 데이터 컨벤션).

height (int, cm)
키(센티미터). 정상 범위는 대략 120–220cm 정도로 필터링 권장.

weight (float, kg)
몸무게(킬로그램). 일반적으로 30–200kg 내 값만 사용 권장.

ap_hi (int, mmHg)
수축기 혈압. 정상/경계 범위 고려 시 80–240 mmHg 내 필터링 권장.

ap_lo (int, mmHg)
이완기(최저) 혈압. 보통 40–140 mmHg 내 필터링 권장.

cholesterol (int: 1/2/3)
총콜레스테롤 카테고리: 1 = 정상, 2 = 약간 높음(above normal), 3 = 많이 높음(well above normal)

gluc (int: 1/2/3)
공복 혈당 카테고리: 1 = 정상, 2 = 약간 높음, 3 = 많이 높음

smoke (int: 0/1)
흡연 여부: 0 = 비흡연, 1 = 흡연

alco (int: 0/1)
음주 여부: 0 = 비음주/거의 안함, 1 = 음주

active (int: 0/1)
신체 활동(운동) 여부: 0 = 비활동적, 1 = 활동적

cardio (int: 0/1, 타깃)
심혈관 질환 유무: 0 = 없음, 1 = 있음

-----

다음 컬럼을 만드세요.

파생변수: $\mathrm{BMI} = \dfrac{\text{weight (kg)}}{\left(\text{height (m)}\right)^{2}}$ — height 컬럼은 cm 단위이므로 100으로 나누어 m로 바꾼 뒤 계산한다.


In [10]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/딥러닝/data')

In [11]:
df  = pd.read_csv('cardio_train.csv', sep=';')

In [12]:
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [13]:
df.isna().sum()

Unnamed: 0,0
id,0
age,0
gender,0
height,0
weight,0
ap_hi,0
ap_lo,0
cholesterol,0
gluc,0
smoke,0


In [14]:
df.head(2)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1


In [15]:
# Convert age from days to years.
df['age'] = df['age'].div(365.25)

In [16]:
df.shape

(70000, 13)

In [17]:
df.head(3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50.35729,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55.381246,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51.627652,1,165,64.0,130,70,3,1,0,0,0,1


In [18]:
sorted( df['cholesterol'].unique() )

[np.int64(1), np.int64(2), np.int64(3)]

In [19]:
df['gluc'].unique()

array([1, 2, 3])

In [20]:
df['smoke'].unique()

array([0, 1])

In [21]:
df['alco'].unique()

array([0, 1])

In [22]:
df['active'].unique()

array([1, 0])

In [23]:
df['cardio'].unique()

array([0, 1])

In [24]:
df.head(2)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50.35729,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55.381246,1,156,85.0,140,90,3,1,0,0,1,1


In [25]:
# Derived feature required by the exercise: BMI = weight(kg) / height(m)^2.
# height is stored in centimetres, so it is converted to metres first.
# The column is appended to df only; X keeps the original 'age'..'active'
# slice, so the number of features fed to the model is unchanged.
df['BMI'] = df['weight'] / (df['height'] / 100) ** 2

# Target vector and feature matrix.
y = df['cardio']
X = df.loc[:, 'age':'active']

In [26]:
X.head(2)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,50.35729,2,168,62.0,110,80,1,1,0,0,1
1,55.381246,1,156,85.0,140,90,3,1,0,0,1


In [27]:
# Column-wise preprocessing for the cardio data:
#   - 'label'  : ordinal-encode gender
#   - 'onehot' : one-hot encode the 3-level cholesterol and gluc categories
#   - 'scaler' : min-max scale the continuous measurements
# Columns not listed (smoke, alco, active) pass through unchanged.
ct = ColumnTransformer(
    [
        ('label', OrdinalEncoder(), ['gender']),
        ('onehot', OneHotEncoder(), ['cholesterol', 'gluc']),
        ('scaler', MinMaxScaler(), ['age', 'height', 'weight', 'ap_hi', 'ap_lo']),
    ],
    remainder='passthrough',
)

In [28]:
# Fit the encoders/scaler on X and transform it in one step.
# NOTE(review): the fit uses every row of X; if X_scaled is split into
# train/test afterwards, test rows have already influenced the scaler ranges.
X_scaled = ct.fit_transform(X)

In [29]:
# Split the raw rows first, then fit the preprocessing on the training part
# only, so encoder categories and MinMaxScaler ranges are never learned from
# the held-out test rows. The test rows are transformed with the train-fitted
# transformer.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = ct.fit_transform(X_train_raw)
X_test = ct.transform(X_test_raw)

In [30]:
X_train.shape

(56000, 15)

In [31]:
y

Unnamed: 0,cardio
0,0
1,1
2,1
3,1
4,0
...,...
69995,0
69996,1
69997,1
69998,1


In [32]:
# 딥러닝 모델링

In [33]:
# Binary classifier: three ReLU hidden layers (64 -> 32 -> 20 units) followed
# by a single sigmoid output unit.
model = Sequential()
# Input width is read from X_train instead of being hard-coded, so the model
# keeps matching the data if the preprocessing adds or drops columns.
model.add(keras.Input(shape=(X_train.shape[1],)))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=20, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [34]:
# 컴파일 한다.

In [35]:
# Attach the loss, the optimizer and the reported metric before training.
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[keras.metrics.BinaryAccuracy()],
    optimizer=keras.optimizers.RMSprop(),
)

In [36]:
X_train.shape

(56000, 15)

In [37]:
# Train the model.
model.fit(x=X_train, y=y_train, batch_size=32, epochs=100)

Epoch 1/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - binary_accuracy: 0.6162 - loss: 0.6510
Epoch 2/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - binary_accuracy: 0.6394 - loss: 0.6340
Epoch 3/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - binary_accuracy: 0.6406 - loss: 0.6317
Epoch 4/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - binary_accuracy: 0.6413 - loss: 0.6291
Epoch 5/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - binary_accuracy: 0.6442 - loss: 0.6266
Epoch 6/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - binary_accuracy: 0.6431 - loss: 0.6292
Epoch 7/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - binary_accuracy: 0.6463 - loss: 0.6258
Epoch 8/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step -

<keras.src.callbacks.history.History at 0x78f618d90a40>

In [126]:
model.evaluate(X_test, y_test)

[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - binary_accuracy: 0.6988 - loss: 0.5857


[0.5921521186828613, 0.6924285888671875]