In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import matplotlib as mpl
import scipy.stats as stats 

# 시각화 옵션 
from IPython.display import set_matplotlib_formats

set_matplotlib_formats('retina')
mpl.rc('font',family='Malgun Gothic')

In [2]:
df1 = pd.read_csv('01_Contract_Data.csv')

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51301 entries, 0 to 51300
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Index          51301 non-null  int64  
 1   Member_ID      51301 non-null  int64  
 2   Sales_Type     51301 non-null  object 
 3   Contract_Type  51301 non-null  object 
 4   Channel        51301 non-null  object 
 5   Datetime       51301 non-null  object 
 6   Term           51301 non-null  int64  
 7   Payment_Type   51301 non-null  object 
 8   Product_Type   51301 non-null  object 
 9   Amount_Month   51301 non-null  int64  
 10  Customer_Type  51299 non-null  object 
 11  Age            44329 non-null  float64
 12  Address1       51299 non-null  object 
 13  Address2       51299 non-null  object 
 14  State          51301 non-null  object 
 15  Overdue_count  51301 non-null  int64  
 16  Overdue_Type   51301 non-null  object 
 17  Gender         51301 non-null  object 
 18  Credit

In [4]:
df1.head()

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
0,1,66758234,렌탈,일반계약,영업방판,2019-05-06,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,9.0,새마을금고
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,2.0,현대카드
2,3,66756657,렌탈,일반계약,홈쇼핑/방송,2019-02-28,60,CMS,DES-1,96900,개인,48.0,경기도,경기도,계약확정,0,없음,여자,8.0,우리은행
3,4,66423450,멤버십,멤버십3유형,재계약,2019-05-13,12,CMS,DES-1,66900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,5.0,농협회원조합
4,5,66423204,멤버십,멤버십3유형,재계약,2019-05-10,12,CMS,DES-1,66900,개인,60.0,경기도,경기도,기간만료,12,있음,남자,8.0,농협회원조합


In [5]:
df1['Datetime(dt)'] = pd.to_datetime(df1['Datetime'])

In [6]:
df1['Year'] = df1['Datetime(dt)'].dt.year 
df1['Month'] = df1['Datetime(dt)'].dt.month 
df1['day_of_week'] = df1['Datetime(dt)'].dt.day_name() 
df1['Day'] = df1['Datetime(dt)'].dt.day

In [7]:
df1['Bank(clean)'] = df1['Bank'].replace(np.nan, '미확인')

In [8]:
df1['Address1(clean)'] = df1['Address1'].replace(np.nan, '미확인')

In [9]:
df1['Address2(clean)']= df1['Address2'].replace(np.nan, '미확인')

In [10]:
df2 = df1.drop(columns=['Index','Member_ID','Datetime','Datetime(dt)',
                          'Bank','Address1','Address2'])

In [11]:
df2['State'].unique()

array(['계약확정', '기간만료', '해약확정', '해약진행중'], dtype=object)

In [12]:
Y = df2['State'].replace({'계약확정':0,'기간만료':0,'해약확정':1,'해약진행중':1})
Y.value_counts()

0    50665
1      636
Name: State, dtype: int64

In [13]:
X = pd.get_dummies(df2.drop(columns=['State']))

In [51]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [47]:
sampler = RandomUnderSampler()
sampler.fit(X,Y)

RandomUnderSampler()

In [48]:
X_under, Y_under = sampler.fit_resample(X,Y)

In [49]:
X_train, X_test, Y_train, Y_test = train_test_split(X_under, Y_under,test_size=0.3,
                                                    random_state=1234)

In [52]:
pipe_list = [('impute',KNNImputer()),
            ('model',DecisionTreeClassifier())]
pipe_model = Pipeline(pipe_list)

hyper_parameter = {'model__max_depth':[9],
                  'model__criterion':['gini','entropy'],
                  'model__min_samples_split':[5],
                  'model__min_samples_leaf':[8],
                  'model__class_weight':['balanced',None]}

grid_model = GridSearchCV(pipe_model, param_grid=hyper_parameter, 
             cv=3, n_jobs=-1, scoring='f1')
grid_model.fit(X_train,Y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('impute', KNNImputer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'model__class_weight': ['balanced', None],
                         'model__criterion': ['gini', 'entropy'],
                         'model__max_depth': [9],
                         'model__min_samples_leaf': [8],
                         'model__min_samples_split': [5]},
             scoring='f1')

In [53]:
best_model= grid_model.best_estimator_
best_model

Pipeline(steps=[('impute', KNNImputer()),
                ('model',
                 DecisionTreeClassifier(class_weight='balanced', max_depth=9,
                                        min_samples_leaf=8,
                                        min_samples_split=5))])

In [54]:
Y_train_pred = best_model.predict(X_train)
Y_test_pred = best_model.predict(X_test)

In [55]:
print(classification_report(Y_train, Y_train_pred))

              precision    recall  f1-score   support

           0       0.76      0.73      0.75       448
           1       0.74      0.76      0.75       442

    accuracy                           0.75       890
   macro avg       0.75      0.75      0.75       890
weighted avg       0.75      0.75      0.75       890



In [56]:
print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       0.63      0.65      0.64       188
           1       0.65      0.63      0.64       194

    accuracy                           0.64       382
   macro avg       0.64      0.64      0.64       382
weighted avg       0.64      0.64      0.64       382



# Random Forest Model 

In [91]:
pipe_list = [('impute',KNNImputer()),
            ('model',RandomForestClassifier())]
pipe_model = Pipeline(pipe_list)

hyper_parameter = {'model__max_depth':[9],
                  'model__criterion':['gini','entropy'],
                  'model__min_samples_split':[6],
                  'model__min_samples_leaf':[5],
                  'model__n_estimators':[50,100,150,250],
                  'model__class_weight':['balanced',None]}

grid_model = GridSearchCV(pipe_model, param_grid=hyper_parameter, 
             cv=3, n_jobs=-1, scoring='f1')
grid_model.fit(X_train,Y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('impute', KNNImputer()),
                                       ('model', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'model__class_weight': ['balanced', None],
                         'model__criterion': ['gini', 'entropy'],
                         'model__max_depth': range(5, 10),
                         'model__min_samples_leaf': range(5, 10),
                         'model__min_samples_split': range(5, 10),
                         'model__n_estimators': [50, 100, 150, 250]},
             scoring='f1')

In [92]:
best_model= grid_model.best_estimator_
best_model

Pipeline(steps=[('impute', KNNImputer()),
                ('model',
                 RandomForestClassifier(max_depth=9, min_samples_leaf=5,
                                        min_samples_split=6))])

In [93]:
Y_train_pred = best_model.predict(X_train)
Y_test_pred = best_model.predict(X_test)

In [94]:
print(classification_report(Y_train, Y_train_pred))

              precision    recall  f1-score   support

           0       0.67      0.95      0.79       448
           1       0.91      0.54      0.68       442

    accuracy                           0.74       890
   macro avg       0.79      0.74      0.73       890
weighted avg       0.79      0.74      0.73       890



In [95]:
print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       0.61      0.94      0.74       188
           1       0.88      0.41      0.56       194

    accuracy                           0.67       382
   macro avg       0.74      0.68      0.65       382
weighted avg       0.75      0.67      0.65       382



# Support Vector Machine Model 

In [None]:
pipe_list = [('impute',KNNImputer()),
            ('model',SVC())]
pipe_model = Pipeline(pipe_list)

hyper_parameter = {'model__C':[1,10,50,100],
                  'model__kernel':['linear'],
                  'model__class_weight':[None, 'balanced']}

grid_model = GridSearchCV(pipe_model, param_grid=hyper_parameter, 
             cv=3, n_jobs=-1, scoring='f1')
grid_model.fit(X_train,Y_train)

In [None]:
best_model= grid_model.best_estimator_
best_model

In [None]:
Y_train_pred = best_model.predict(X_train)
Y_test_pred = best_model.predict(X_test)

In [None]:
print(classification_report(Y_train, Y_train_pred))

In [None]:
print(classification_report(Y_test, Y_test_pred))