<font color = "#CC3D3D">
# End-to-End Machine Learning Project #
<p>
- #### *Based on CRISP-DM & scikit-learn*    
<br>
<img align="left" src="http://www.kdnuggets.com/wp-content/uploads/crisp-dm-4-problems-fig1.png" alt="CRISP-DM">

## Step 1: Business Understanding ##

1. Business Objectives
 - 새로운 개인연금상품(PEP: Personal Equity Plan)을 개발하여 기존 고객들을 대상으로 가능한 많은 계좌를 유치
2. Analytics Goals
 - PEP 가입 예측모형 개발
 - 고객 프로파일 개발
 - 다이렉트 메일 광고 효율성 제고
 - 타겟 메일링에 의한 응답률 제고 

## Step 2: Data Understanding ##
1. 데이터 획득 절차
 - 기존고객 DB로부터 시험메일 발송을 위한 표본고객목록을 추출
 - 새로운 금융상품(PEP)의 제안 메일을 발송
 - 고객의 반응을 기록
2. 분석 데이터
 - 학습용 데이터 600건
 - 신규고객 데이터 200건

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Collect Initial Data ###

##### for modeling

In [2]:
df = pd.read_csv("data_pepTestCustomers.csv")
df

Unnamed: 0,id,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pep
0,ID12101,48.0,0,0,17546.00,0,1,0,0,0,0,1
1,ID12102,40.0,1,3,30085.10,1,3,1,0,1,1,0
2,ID12103,,0,0,16575.40,1,0,1,1,1,0,0
3,ID12104,23.0,0,3,20375.40,1,3,0,0,1,0,0
4,ID12105,57.0,0,1,50576.30,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
595,ID12696,61.0,0,0,47025.00,0,2,1,1,1,1,0
596,ID12697,30.0,0,0,9672.25,1,0,1,1,1,0,0
597,ID12698,31.0,0,3,15976.30,1,0,1,1,0,0,1
598,ID12699,29.0,1,0,14711.80,1,0,0,1,0,1,0


##### for deployment

In [3]:
new = pd.read_csv("data_pepNewCustomers.csv")
new.head()

Unnamed: 0,id,age,sex,region,income,married,children,car,save_act,current_act,mortgage
0,ID12701,23,1,0,18766.9,1,0,1,1,0,1
1,ID12702,30,1,1,9915.67,0,1,0,1,0,1
2,ID12703,45,0,1,21881.6,0,0,1,1,1,0
3,ID12704,50,1,3,46794.4,1,2,0,1,0,1
4,ID12705,41,0,0,20721.1,1,0,1,1,1,0


### Describe Data ###

In [4]:
# 데이터 구조 살펴보기
# ex) age의 경우 600개중 540개만 있어서 60개가 결측치로 예상
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           600 non-null    object 
 1   age          540 non-null    float64
 2   sex          600 non-null    int64  
 3   region       600 non-null    int64  
 4   income       600 non-null    float64
 5   married      600 non-null    int64  
 6   children     600 non-null    int64  
 7   car          600 non-null    int64  
 8   save_act     600 non-null    int64  
 9   current_act  600 non-null    int64  
 10  mortgage     600 non-null    int64  
 11  pep          600 non-null    int64  
dtypes: float64(2), int64(9), object(1)
memory usage: 56.4+ KB


##### Find Missing Values

In [5]:
# 결측값이 존재하는 속성이 무엇인지, 몇개나 있는지 파악할 수 있는 두번째 방법
# null 갯수 체크
df.isnull().sum()

id              0
age            60
sex             0
region          0
income          0
married         0
children        0
car             0
save_act        0
current_act     0
mortgage        0
pep             0
dtype: int64

## Step 3: Data Preparation ##

### Clean Data ###
##### Replace Missing Values #####

In [6]:
# 평균값으로 결측값 대체하기
# fillna 는 널값을 채우라. 여기서는 평균으로 채우고 있음.
# 분석의 상황에 따라 null 값을 그대로 사용함.
df.age.fillna(round(df.age.mean(),0), inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           600 non-null    object 
 1   age          600 non-null    float64
 2   sex          600 non-null    int64  
 3   region       600 non-null    int64  
 4   income       600 non-null    float64
 5   married      600 non-null    int64  
 6   children     600 non-null    int64  
 7   car          600 non-null    int64  
 8   save_act     600 non-null    int64  
 9   current_act  600 non-null    int64  
 10  mortgage     600 non-null    int64  
 11  pep          600 non-null    int64  
dtypes: float64(2), int64(9), object(1)
memory usage: 56.4+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.age.fillna(round(df.age.mean(),0), inplace=True)


In [7]:
# for Hold-out validation
from sklearn.model_selection import train_test_split  

In [8]:
# scikit-learn에서는 속성과 클래스를 분리하여야 한다.

dfX = df.drop(['id','pep'], axis=1)  # exclude 'id' attribute & class variable
dfy = df['pep']                    # class variable
X_train, X_test, y_train, y_test = train_test_split(dfX, dfy, test_size=0.25, random_state=0)

In [9]:
# 지정한 비율(75:25)로 데이터가 잘 나누어졌는지 확인
display(X_train.shape, X_test.shape)
X_train.head()

(450, 10)

(150, 10)

Unnamed: 0,age,sex,region,income,married,children,car,save_act,current_act,mortgage
46,50.0,0,0,13283.9,0,1,1,1,1,0
263,60.0,0,0,46358.4,1,0,1,1,1,1
458,18.0,1,2,13700.2,0,1,0,1,1,0
230,59.0,0,0,30189.4,1,0,1,0,1,1
107,23.0,1,0,13039.9,1,0,0,0,1,0


## Step 4: Modeling ##

In [10]:
# 1. Import the estimator
from sklearn.tree import DecisionTreeClassifier

In [11]:
# 2. Instantiate the estimator
tree = DecisionTreeClassifier(max_depth=6, random_state=0)

In [12]:
# 3. Fit the data to the estimator
tree.fit(X_train, y_train)

In [13]:
# 4. Generate a prediction
pred_tree = tree.predict(X_test); pred_tree

array([0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)

In [14]:
# 5. Evaluate the estimator
display(tree.score(X_train, y_train))
display(tree.score(X_test, y_test))

0.9355555555555556

0.8733333333333333

## Step 5: Evaluation ##

<font color = "red">
- *Which model is the best ?*
- *Is the model useful ?*
<font>

In [15]:
tree.score(X_test, y_test)

0.8733333333333333

## Step 6: Deployment ##

In [16]:
ndf = new

### A Case: Apply the best model to select target customers ###

In [17]:
# 개발 모형에 고객 데이터를 적용하여 개인연금 구매여부를 예측: id 제외
new['pred'] = tree.predict(ndf.loc[:,'age':'mortgage'])

In [18]:
# 개인연금 구매확률을 예측: predict_proba() 사용
print(tree.predict_proba(new.loc[:,'age':'mortgage']))
new['pred_prob'] = tree.predict_proba(new.loc[:,'age':'mortgage'])[:,1]
new.head()

[[0.88       0.12      ]
 [1.         0.        ]
 [0.         1.        ]
 [0.         1.        ]
 [0.81395349 0.18604651]
 [0.         1.        ]
 [0.88       0.12      ]
 [0.         1.        ]
 [0.90909091 0.09090909]
 [0.07142857 0.92857143]
 [0.88       0.12      ]
 [0.         1.        ]
 [0.81395349 0.18604651]
 [0.96       0.04      ]
 [1.         0.        ]
 [0.         1.        ]
 [0.88       0.12      ]
 [0.         1.        ]
 [0.         1.        ]
 [1.         0.        ]
 [0.         1.        ]
 [0.86956522 0.13043478]
 [0.         1.        ]
 [0.88       0.12      ]
 [1.         0.        ]
 [0.         1.        ]
 [0.96       0.04      ]
 [0.88       0.12      ]
 [0.         1.        ]
 [0.         1.        ]
 [0.         1.        ]
 [0.86956522 0.13043478]
 [0.15       0.85      ]
 [1.         0.        ]
 [0.81395349 0.18604651]
 [0.         1.        ]
 [0.81395349 0.18604651]
 [0.88       0.12      ]
 [1.         0.        ]
 [1.         0.        ]


Unnamed: 0,id,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pred,pred_prob
0,ID12701,23,1,0,18766.9,1,0,1,1,0,1,0,0.12
1,ID12702,30,1,1,9915.67,0,1,0,1,0,1,0,0.0
2,ID12703,45,0,1,21881.6,0,0,1,1,1,0,1,1.0
3,ID12704,50,1,3,46794.4,1,2,0,1,0,1,1,1.0
4,ID12705,41,0,0,20721.1,1,0,1,1,1,0,0,0.186047


In [19]:
# 특정 조건을 만족하는 고객 리스트를 추출하고 저장
# query 조건 만족 행을 뽑을 때 사용
target = new.query('pred == 1 & pred_prob > 0.7')  # PEP에 가입할 확율이 70%가 넘는 고객만 추출
target.sort_values(by="pred_prob", ascending=False).to_csv("pep_target.csv", index=False)
pd.read_csv("pep_target.csv").tail()

Unnamed: 0,id,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pred,pred_prob
76,ID12778,64,1,3,29107.5,0,0,1,0,1,0,1,0.818182
77,ID12873,43,1,3,28741.6,0,0,0,0,0,0,1,0.818182
78,ID12786,18,0,3,15800.6,0,0,1,0,1,0,1,0.818182
79,ID12821,60,0,3,27558.6,0,0,1,1,0,0,1,0.818182
80,ID12818,54,0,3,44982.1,0,0,1,1,0,0,1,0.818182


In [113]:
X_train = dfX
y_train = dfy
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

In [121]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train, y_train); scores

array([0.80833333, 0.84166667, 0.85833333, 0.84166667, 0.81666667])

In [130]:
scores = cross_val_score(model, X_train, y_train, cv=5); scores

array([0.79166667, 0.84166667, 0.89166667, 0.8       , 0.81666667])

In [136]:
from sklearn.model_selection import LeaveOneOut
scores = cross_val_score(model, X_train, y_train, cv=LeaveOneOut())
scores.mean()

0.8383333333333334

In [134]:
from sklearn.model_selection import ShuffleSplit

sscv = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
scores = cross_val_score(model, X_train, y_train, cv=sscv)
scores.mean()

0.8156666666666667