In [24]:
# from google.colab import drive
# drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd
import numpy as np

# Folder holding the CSV files; the Drive path is kept for reference.
# DATA_PATH = "/content/drive/MyDrive/data/"
DATA_PATH = "data/"

train_csv = f"{DATA_PATH}loan_default_train.csv"
test_csv = f"{DATA_PATH}loan_default_test.csv"
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Display (rows, columns) for both splits.
train.shape, test.shape

((19548, 10), (13033, 9))

In [2]:
# Preview the first rows of the training data to see the columns and sample values.
train.head()

Unnamed: 0,ID,나이,연간소득,주택소유상태,근로기간,대출목적,대출금액,이자율,신용거래기간,target
0,train_0,39,170000,임대,9.0,부채통합,17000,12.99,14,0
1,train_1,22,60000,모기지론,6.0,교육,2800,10.99,4,0
2,train_2,23,48152,모기지론,7.0,의료,9500,7.29,2,0
3,train_3,25,62496,모기지론,1.0,주택개선,8000,7.51,2,0
4,train_4,22,41500,임대,6.0,개인사업,2500,15.62,2,0


In [3]:
# Summarise dtypes and non-null counts per column; columns with fewer non-null
# entries than rows contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19548 entries, 0 to 19547
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      19548 non-null  object 
 1   나이      19548 non-null  int64  
 2   연간소득    19548 non-null  int64  
 3   주택소유상태  19548 non-null  object 
 4   근로기간    18996 non-null  float64
 5   대출목적    19548 non-null  object 
 6   대출금액    19548 non-null  int64  
 7   이자율     17665 non-null  float64
 8   신용거래기간  19548 non-null  int64  
 9   target  19548 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 1.5+ MB


# 특성으로 사용할 변수 선정

In [28]:
# Keep only the predictor columns: the first column is dropped from both frames
# (ID in train) and the last column of train (target) is dropped as well.
feature_cols = train.columns[1:-1]
train_ft = train[feature_cols].copy()
test_ft = test[test.columns[1:]].copy()
train_ft.shape, test_ft.shape

((19548, 8), (13033, 8))

In [29]:
# Statistics to compute per age group: numeric columns get the five summary
# statistics, categorical columns get the number of distinct values.
numeric_stats = ["mean", "median", "std", "min", "max"]
category_stats = ["nunique"]

# Insertion order is kept on purpose: it determines the column order of the
# aggregated table.
stats_by_column = {
    "연간소득": numeric_stats,
    "주택소유상태": category_stats,
    "근로기간": numeric_stats,
    "대출목적": category_stats,
    "대출금액": numeric_stats,
    "이자율": numeric_stats,
    "신용거래기간": numeric_stats,
}

# Each entry is a list of (output column name, aggregation function) pairs,
# with the output name built as "<column>_<stat>".
agg_dict = {
    column: [(f"{column}_{stat}", stat) for stat in stats]
    for column, stats in stats_by_column.items()
}

# Per-age summary table built from the training features only.
tmp = train_ft.groupby("나이").agg(agg_dict)
# agg() produces two-level column labels (source column, output name);
# keep only the output name.
tmp.columns = tmp.columns.get_level_values(1)

# Attach the per-age statistics to both frames; with a left join, test rows
# whose age does not occur in train get NaN in the new columns.
age_stats = tmp.reset_index()
train_ft = train_ft.merge(age_stats, how="left", on="나이")
test_ft = test_ft.merge(age_stats, how="left", on="나이")

In [31]:
# Shapes after the age-group merge: both frames should have the same number of columns.
train_ft.shape, test_ft.shape

((19548, 35), (13033, 35))

# 결측치 처리

In [32]:
# Count missing values per column in the training features.
train_ft.isnull().sum()

Unnamed: 0,0
나이,0
연간소득,0
주택소유상태,0
근로기간,552
대출목적,0
대출금액,0
이자율,1883
신용거래기간,0
연간소득_mean,0
연간소득_median,0


In [33]:
# Count missing values per column in the test features.
test_ft.isnull().sum()

Unnamed: 0,0
나이,0
연간소득,0
주택소유상태,0
근로기간,343
대출목적,0
대출금액,0
이자율,1233
신용거래기간,0
연간소득_mean,10
연간소득_median,10


In [35]:
# Column means of the training features, used as imputation values
# (a: 근로기간, b: 이자율).
a = train_ft["근로기간"].mean()
b = train_ft["이자율"].mean()

In [36]:
# Fill the gaps in 근로기간 and 이자율 with the training means (a, b).
# The same values are applied to train and test; columns not listed in the
# mapping are left untouched.
impute_values = {"근로기간": a, "이자율": b}
train_ft = train_ft.fillna(value=impute_values)
test_ft = test_ft.fillna(value=impute_values)

In [38]:
# Anything still missing after the mean imputation (the merged per-age
# statistics) is set to 0.
# NOTE(review): for test rows whose age never occurs in train this also zeroes
# columns such as 연간소득_mean — confirm that 0 is an acceptable stand-in.
train_ft = train_ft.fillna(0)
test_ft = test_ft.fillna(0)

In [39]:
# Total number of remaining missing values in each frame; both should be 0.
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum() # always verify!

(0, 0)

# 피처 인코딩
- 범주형 데이터가 있을 경우!!

In [40]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [41]:
# Categorical columns to expand into indicator columns.
cols = ["주택소유상태", "대출목적"]

# Categories are learned from the training features only; with
# handle_unknown="ignore" a category unseen during fit does not raise at
# transform time.
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(train_ft.loc[:, cols])

In [42]:
# Append one indicator column per learned category to the training features.
onehot_names = enc.get_feature_names_out()
train_ft[onehot_names] = enc.transform(train_ft[cols]).toarray()

In [43]:
# Append the same indicator columns to the test features, using the encoder
# fitted on train.
onehot_names = enc.get_feature_names_out()
test_ft[onehot_names] = enc.transform(test_ft[cols]).toarray()

In [44]:
# Shapes after appending the one-hot columns (the original categorical columns
# are still present at this point).
train_ft.shape, test_ft.shape

((19548, 45), (13033, 45))

In [45]:
# Remove the raw categorical columns now that their one-hot columns exist.
train_ft = train_ft.drop(cols, axis="columns")
test_ft = test_ft.drop(cols, axis="columns")

# 피처 스케일링

In [46]:
# Learn each column's min/max from the training features only.
scaler = MinMaxScaler(feature_range=(0, 1))  # (0, 1) is the default range
scaler.fit(train_ft)

In [47]:
# Scale both frames with the ranges learned from train, then write the returned
# arrays back under the existing column names.
train_scaled = scaler.transform(train_ft)
test_scaled = scaler.transform(test_ft)
train_ft[train_ft.columns] = train_scaled
test_ft[test_ft.columns] = test_scaled

# 정답 데이터를 별도의 변수에 담기

In [48]:
# Labels come from the original training frame; train_ft no longer contains them.
target = train.loc[:, "target"]

In [49]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
# 5 shuffled, class-stratified folds; the fixed seed keeps the splits repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [50]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [51]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest settings: 200 trees, max_features=None, fixed seed, n_jobs=-1.
params = dict(
    random_state=42,
    n_estimators=200,
    n_jobs=-1,
    max_features=None,
)

model = RandomForestClassifier(**params)

# F1 score on each fold of `cv`; the cell displays the mean across folds.
scores = cross_val_score(
    estimator=model, X=train_ft, y=target, scoring="f1", cv=cv, n_jobs=-1
)
scores.mean()

0.7707683966115175

In [52]:
# Train on the full training set, then predict labels for the test features.
pred = model.fit(train_ft, target).predict(test_ft)

In [53]:
# Save the predictions as a single "target" column. The default integer row
# index is written as the first (unnamed) CSV column.
# NOTE(review): no ID column is included — confirm the expected submission format.
submission = pd.DataFrame({"target": pred})
submission.to_csv("권지혁.csv")