In [2]:
# 이중분류는 머신러닝에서 가장 일반적인 분류 문제 중 하나입니다. 이중분류는 두 개의 가능한 결과 중 하나를 선택하는 분류 방법입니다. 
# 예를 들어, 의학 분야에서는 종양이 악성인지 양성인지를 분류하는 문제를 이중분류로 다룹니다. 
# 또한, 금융 분야에서는 고객이 대출을 상환할 수 있는지 여부를 결정하는 문제도 이중분류로 다룰 수 있습니다.

# 머신러닝에서 이중분류를 수행하는 가장 일반적인 알고리즘 중 하나는 로지스틱 회귀입니다. 
# 로지스틱 회귀는 선형 회귀 모델을 기반으로 하지만, 결과를 확률 값으로 출력하여 이진 분류 문제를 해결합니다. 
# 또한, 결정 트리, 서포트 벡터 머신, 나이브 베이즈 분류기 등의 알고리즘도 이중분류 문제를 해결하는 데 사용될 수 있습니다.

# 이중분류 모델을 개발할 때는 데이터를 먼저 수집하고 전처리해야 합니다. 
# 이 데이터는 보통 특성(Features)과 결과(Label)로 구성됩니다. 
# 이러한 데이터를 사용하여 모델을 학습시키고, 이후 새로운 데이터에 대한 예측을 수행할 수 있습니다.

# 이중분류 모델의 성능을 평가하는 방법으로는 정확도(Accuracy), 정밀도(Precision), 재현율(Recall), F1 스코어(F1 Score) 등이 있습니다. 
# 이러한 지표를 사용하여 모델의 성능을 평가하고, 이를 토대로 모델을 개선하거나 다른 알고리즘을 적용할 수 있습니다.
# 파생변수 만들어서 정확성 올리기

# 신용카드 사용자 연체 예측 AI 경진대회

In [3]:
import pandas as pd   
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.linear_model import LogisticRegression
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')


---
# 0. 데이터 전처리
---

In [4]:
train.abs

<bound method NDFrame.abs of        index gender car reality  child_num  income_total  \
0          0      F   N       N          0      202500.0   
1          1      F   N       Y          1      247500.0   
2          2      M   Y       Y          0      450000.0   
3          3      F   N       Y          0      202500.0   
4          4      F   Y       Y          0      157500.0   
...      ...    ...  ..     ...        ...           ...   
26452  26452      F   N       N          2      225000.0   
26453  26453      F   N       Y          1      180000.0   
26454  26454      F   Y       N          0      292500.0   
26455  26455      M   N       Y          0      171000.0   
26456  26456      F   N       N          0       81000.0   

                income_type                       edu_type  \
0      Commercial associate               Higher education   
1      Commercial associate  Secondary / secondary special   
2                   Working               Higher education   
3 

In [5]:
# train데이터 확인
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [6]:
train

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26453,26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,,2.0,-47.0,2.0
26454,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26455,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


In [7]:
train.isnull().sum()

index               0
gender              0
car                 0
reality             0
child_num           0
income_total        0
income_type         0
edu_type            0
family_type         0
house_type          0
DAYS_BIRTH          0
DAYS_EMPLOYED       0
FLAG_MOBIL          0
work_phone          0
phone               0
email               0
occyp_type       8171
family_size         0
begin_month         0
credit              0
dtype: int64

In [8]:
# 필요한 데이터만 추출
train_df = train.drop('occyp_type',axis=1)
test_df = test.drop('occyp_type',axis=1)
train_df.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,2.0,-26.0,2.0


In [9]:
# gender,car,reality 0,1 숫자형으로 변경.
train_df['gender'] = train_df['gender'].replace({0: 'Male', 1: 'Female'})
train_df['car'] = train_df['car'].replace({0: 'N', 1: 'Y'})
train_df['reality'] = train_df['car'].replace({0: 'N', 1: 'Y'})

In [10]:
# gender,car,reality 0,1 숫자형으로 변경.

train_df.loc[train_df['gender'] == 'M', 'gender'] = 0
train_df.loc[train_df['gender'] == 'F', 'gender'] = 1

train_df.loc[train_df['car'] == 'N', 'car'] = 0
train_df.loc[train_df['car'] == 'Y', 'car'] = 1

train_df.loc[train_df['reality'] == 'N', 'reality'] = 0
train_df.loc[train_df['reality'] == 'Y', 'reality'] = 1

# int형으로 변경

train_df['gender']=train_df['gender'].astype('int')
train_df['car']=train_df['car'].astype('int')
train_df['reality']=train_df['reality'].astype('int')

In [11]:

test_df.loc[test_df['gender'] == 'M', 'gender'] = 0
test_df.loc[test_df['gender'] == 'F', 'gender'] = 1

test_df.loc[test_df['car'] == 'N', 'car'] = 0
test_df.loc[test_df['car'] == 'Y', 'car'] = 1

test_df.loc[test_df['reality'] == 'N', 'reality'] = 0
test_df.loc[test_df['reality'] == 'Y', 'reality'] = 1

test_df['gender']=test_df['gender'].astype('int')
test_df['car']=test_df['car'].astype('int')
test_df['reality']=test_df['reality'].astype('int')


In [12]:
# DAYS_BIRTH,DAYS_EMPLOYED 양수로 변경

sum(train_df['DAYS_BIRTH'] >= 0) 
train_df['DAYS_BIRTH'] = -1 * train_df['DAYS_BIRTH']

sum(train_df['DAYS_EMPLOYED'] >= 0) # 값 0 모든 사람 실업자 판단
train_df['DAYS_EMPLOYED'] = -1 * train_df['DAYS_EMPLOYED']

In [13]:
sum(test_df['DAYS_BIRTH'] >= 0) 
test_df['DAYS_BIRTH'] = -1 * test_df['DAYS_BIRTH']

sum(test_df['DAYS_EMPLOYED'] >= 0) # 값 0 모든 사람 실업자 판단
test_df['DAYS_EMPLOYED'] = -1 * test_df['DAYS_EMPLOYED']

In [14]:
# DAYS -> year로 변경
train_df['DAYS_BIRTH'] = round(train_df['DAYS_BIRTH']/365,1)
train_df['DAYS_EMPLOYED'] = round(train_df['DAYS_EMPLOYED']/365,1)


In [15]:
test_df['DAYS_BIRTH'] = round(test_df['DAYS_BIRTH']/365,1)
test_df['DAYS_EMPLOYED'] = round(test_df['DAYS_EMPLOYED']/365,1)


In [16]:
# income_total 만단위로 변경
train_df['income_total'] = train_df['income_total']/10000


In [17]:
test_df['income_total'] = test_df['income_total']/10000
test_df.shape

(10000, 18)

In [18]:
print(train_df['income_type'].unique())
print(train_df['edu_type'].unique())
print(train_df['family_type'].unique())
print(train_df['house_type'].unique())

['Commercial associate' 'Working' 'State servant' 'Pensioner' 'Student']
['Higher education' 'Secondary / secondary special' 'Incomplete higher'
 'Lower secondary' 'Academic degree']
['Married' 'Civil marriage' 'Separated' 'Single / not married' 'Widow']
['Municipal apartment' 'House / apartment' 'With parents'
 'Co-op apartment' 'Rented apartment' 'Office apartment']


In [19]:
# 범주형 변수를 인코딩하기 위해 scikit-learn의 전처리 모듈에서 LabelEncoder를 사용

from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
train_df['income_type']=label_encoder.fit_transform(train_df['income_type'])
test_df['income_type']=label_encoder.transform(test_df['income_type'])
########################################################################
train_df['edu_type']=label_encoder.fit_transform(train_df['edu_type'])
test_df['edu_type']=label_encoder.transform(test_df['edu_type'])
########################################################################
train_df['family_type']=label_encoder.fit_transform(train_df['family_type'])
test_df['family_type']=label_encoder.transform(test_df['family_type'])
########################################################################
train_df['house_type']=label_encoder.fit_transform(train_df['house_type'])
test_df['house_type']=label_encoder.transform(test_df['house_type'])


In [20]:
# train_df.shape
test_df=test_df.drop('index',axis=1)
train_df=train_df.drop('index',axis=1)

In [21]:
train_df

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,credit
0,1,0,0,0,20.25,0,1,1,2,38.1,12.9,1,0,0,0,2.0,-6.0,1.0
1,1,0,0,1,24.75,0,4,0,1,31.2,4.2,1,0,0,1,3.0,-5.0,1.0
2,0,1,1,0,45.00,4,1,1,1,52.3,12.1,1,0,1,0,2.0,-22.0,2.0
3,1,0,0,0,20.25,0,4,1,1,41.3,5.7,1,0,1,0,2.0,-37.0,0.0
4,1,1,1,0,15.75,2,1,1,1,41.2,5.8,1,0,0,0,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,1,0,0,2,22.50,2,4,1,1,33.1,5.4,1,0,0,0,4.0,-2.0,1.0
26453,1,0,0,1,18.00,4,1,2,1,41.9,6.8,1,0,0,0,2.0,-47.0,2.0
26454,1,1,1,0,29.25,4,4,0,5,27.6,5.5,1,0,0,0,2.0,-25.0,2.0
26455,0,0,0,0,17.10,4,2,3,1,27.8,0.3,1,0,0,0,1.0,-59.0,2.0


---
# 데이터 모델링
---

In [22]:
# [데이터 모델링]
train_x=train_df.drop('credit', axis=1)  # 독립변수
train_y=train_df[['credit']] # 종속변수
test_x=test_df

print(train_x.shape,train_y.shape,test_x.shape)
train_x

(26457, 17) (26457, 1) (10000, 17)


Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month
0,1,0,0,0,20.25,0,1,1,2,38.1,12.9,1,0,0,0,2.0,-6.0
1,1,0,0,1,24.75,0,4,0,1,31.2,4.2,1,0,0,1,3.0,-5.0
2,0,1,1,0,45.00,4,1,1,1,52.3,12.1,1,0,1,0,2.0,-22.0
3,1,0,0,0,20.25,0,4,1,1,41.3,5.7,1,0,1,0,2.0,-37.0
4,1,1,1,0,15.75,2,1,1,1,41.2,5.8,1,0,0,0,2.0,-26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,1,0,0,2,22.50,2,4,1,1,33.1,5.4,1,0,0,0,4.0,-2.0
26453,1,0,0,1,18.00,4,1,2,1,41.9,6.8,1,0,0,0,2.0,-47.0
26454,1,1,1,0,29.25,4,4,0,5,27.6,5.5,1,0,0,0,2.0,-25.0
26455,0,0,0,0,17.10,4,2,3,1,27.8,0.3,1,0,0,0,1.0,-59.0


# 랜덤 포레스트

In [23]:
Rf = RandomForestClassifier()

Rf.fit(train_x,train_y)
predict = Rf.predict_proba(test_x)
submission.loc[:,1:] = predict
submission

  Rf.fit(train_x,train_y)
  submission.loc[:,1:] = predict


Unnamed: 0,index,0,1,2
0,26457,0.070000,0.120000,0.810000
1,26458,0.320000,0.140000,0.540000
2,26459,0.040000,0.080000,0.880000
3,26460,0.050714,0.110000,0.839286
4,26461,0.050000,0.350000,0.600000
...,...,...,...,...
9995,36452,0.053333,0.344333,0.602333
9996,36453,0.235000,0.575000,0.190000
9997,36454,0.070000,0.116667,0.813333
9998,36455,0.320000,0.296000,0.384000


KFold 교차 검증(K-Fold Cross Validation)은 데이터를 K개의 부분집합으로 나눈 후, K개의 모델을 만들어 각각의 모델을 학습시키고 나머지 부분집합으로 검증하는 과정을 K번 반복하는 교차 검증 방법입니다. 이 방법은 모델이 일반화(generalization) 성능을 측정하는 데 유용합니다.

K-Fold 교차 검증의 작동 방식은 다음과 같습니다.

1. 전체 데이터셋을 K개의 부분집합으로 나눕니다.
2. K개의 부분집합 중 하나를 검증 데이터셋으로 선택하고, 나머지 K-1개의 부분집합을 학습 데이터셋으로 사용합니다.
3. 모델을 학습 데이터셋으로 학습시킵니다.
4. 검증 데이터셋으로 모델을 검증합니다.
5. 위 과정을 K번 반복합니다.
6. K번의 검증 결과를 종합하여 모델의 일반화 성능을 측정합니다.

In [25]:


# k-fold 객체 생성
kf = KFold(n_splits=5)


prediction=np.zeros((10000, 3)) 
# 모델생성 - 랜덤포레스트
Rf = RandomForestClassifier()                                                   
# k-Fold 교차 검증 수행
for train_idx, test_idx in kf.split(train_x):
    X_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
    X_test, y_test = train_x.iloc[test_idx], train_y.iloc[test_idx]
    Rf.fit(X_train,y_train) # 교차검증 후 모델에 적용

    prediction += Rf.predict_proba(test_x)/5

prediction

  Rf.fit(X_train,y_train) # 교차검증 후 모델에 적용
  Rf.fit(X_train,y_train) # 교차검증 후 모델에 적용
  Rf.fit(X_train,y_train) # 교차검증 후 모델에 적용
  Rf.fit(X_train,y_train) # 교차검증 후 모델에 적용
  Rf.fit(X_train,y_train) # 교차검증 후 모델에 적용


array([[0.044     , 0.13846667, 0.81753333],
       [0.268     , 0.17      , 0.562     ],
       [0.04490476, 0.066     , 0.88909524],
       ...,
       [0.049     , 0.12533333, 0.82566667],
       [0.288     , 0.24466667, 0.46733333],
       [0.144     , 0.24766667, 0.60833333]])

In [None]:
Rf = RandomForestClassifier()

Rf.fit(train_x,train_y)
Rf.score(train_x,train_y)

  Rf.fit(train_x,train_y)


0.9727104358014892

In [None]:
Rf = RandomForestClassifier()

Rf.fit(train_x,train_y)
Rf.score(train_x,train_y)

  Rf.fit(train_x,train_y)


0.9727104358014892

In [None]:
# 정규화 수행
ss = StandardScaler()
ss.fit(train_x)
scaled_x = ss.transform(train_x)
test_scaled = ss.transform(test_x)

lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(scaled_x,train_y)
lr.score(scaled_x,train_y)

  y = column_or_1d(y, warn=True)


0.641682730468307

In [None]:
Rf.fit(scaled_x,train_y)
Rf.score(scaled_x,train_y)

  Rf.fit(scaled_x,train_y)


0.9727104358014892

In [None]:
submission.loc[:,1:] = prediction
submission

  submission.loc[:,1:] = prediction


Unnamed: 0,index,0,1,2
0,26457,0.058333,0.144200,0.797467
1,26458,0.297000,0.152400,0.550600
2,26459,0.037190,0.077667,0.885143
3,26460,0.103133,0.119733,0.777133
4,26461,0.063167,0.340000,0.596833
...,...,...,...,...
9995,36452,0.064067,0.261400,0.674533
9996,36453,0.215667,0.476333,0.308000
9997,36454,0.063500,0.127881,0.808619
9998,36455,0.271000,0.261733,0.467267


In [None]:
submission.to_csv('result_submission2.csv', index=False)

In [None]:
submission.to_csv('result_submission.csv', index=False)