In [1]:
# Repository that hosts the tutorial material. Kept for reference only:
# none of the cells visible below read this variable.
git_url = "https://github.com/lovedlim/bigdata_analyst_cert_v2/"

In [6]:
# Raw-file endpoints of the chapter's two CSVs; both are fetched with
# pandas in the data-loading cell.
train_url = "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch5/train.csv"
test_url = "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch5/test.csv"

# 다중분류 실습

## 데이터 불러오기

In [73]:
import pandas as pd

# Download both CSVs (train first, then test); each read yields its own
# independent DataFrame.
train, test = (pd.read_csv(url) for url in (train_url, test_url))

## 탐색적 데이터 분석(EDA)

In [52]:
# Report (rows, columns) of each frame, one per line.
print(
    f"훈련 데이터: {train.shape}",
    f"테스트 데이터: {test.shape}",
    sep="\n",
)

훈련 데이터: (10000, 21)
테스트 데이터: (10000, 20)


In [53]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Delay_from_due_date,Num_of_Delayed_Payment,Num_Credit_Inquiries,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly,Monthly_Balance,Credit_Score,Credit_Mix,...,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Monthly_Inhand_Salary,Changed_Credit_Limit,Outstanding_Debt,Total_EMI_per_month
0,56.0,16.0,11.0,35.598217,120.0,Yes,229.093478,252.385965,1,Bad,...,15.0,36597.56,8.0,10.0,29.0,5.0,3143.796667,22.49,2963.18,122.900223
1,49.0,23.0,12.0,25.553106,120.0,Yes,104.613906,219.105944,1,Bad,...,28.0,32057.3,9.0,8.0,16.0,7.0,2606.441667,1.4,1327.26,164.859426
2,34.0,20.0,6.0,40.039954,174.0,Yes,338.626965,251.265589,1,Bad,...,46.0,75868.8,6.0,10.0,32.0,7.0,6074.4,3.6,1432.71,297.547446
3,21.0,13.0,8.0,25.711678,143.0,NM,116.816864,259.92796,2,Standard,...,46.0,17092.69,7.0,3.0,19.0,7.0,1695.390833,16.4,1417.06,62.79426
4,19.0,13.0,6.0,39.140463,138.0,Yes,87.262887,626.21233,1,Bad,...,45.0,81471.96,6.0,6.0,25.0,5.0,6763.33,27.09,2679.69,202.857783


In [54]:
# Per-column non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Delay_from_due_date       10000 non-null  float64
 1   Num_of_Delayed_Payment    10000 non-null  float64
 2   Num_Credit_Inquiries      10000 non-null  float64
 3   Credit_Utilization_Ratio  10000 non-null  float64
 4   Credit_History_Age        10000 non-null  float64
 5   Payment_of_Min_Amount     10000 non-null  object 
 6   Amount_invested_monthly   10000 non-null  float64
 7   Monthly_Balance           10000 non-null  float64
 8   Credit_Score              10000 non-null  int64  
 9   Credit_Mix                10000 non-null  object 
 10  Payment_Behaviour         10000 non-null  object 
 11  Age                       10000 non-null  float64
 12  Annual_Income             10000 non-null  float64
 13  Num_Bank_Accounts         10000 non-null  float64
 14  Num_Cre

### object 컬럼의 unique 개수 확인

In [55]:
# count / unique / top / freq summary restricted to the object-dtype columns of train.
train.describe(include='O')

Unnamed: 0,Payment_of_Min_Amount,Credit_Mix,Payment_Behaviour
count,10000,10000,10000
unique,3,3,6
top,Yes,Standard,Low_spent_Small_value_payments
freq,5269,4591,3416


In [56]:
# Same object-column summary for test, to compare the number of unique values with train.
test.describe(include='O')

Unnamed: 0,Payment_of_Min_Amount,Credit_Mix,Payment_Behaviour
count,10000,10000,10000
unique,3,3,6
top,Yes,Standard,Low_spent_Small_value_payments
freq,5167,4590,3498


### 결측치 확인

In [57]:
# Total number of missing cells in each frame: the first .sum() counts per
# column, the second adds the column counts together.
print(
    "결측치 확인",
    f"훈련 데이터: {train.isnull().sum().sum()}",
    f"테스트 데이터: {test.isnull().sum().sum()}",
    sep="\n",
)

결측치 확인
훈련 데이터: 0
테스트 데이터: 0


In [58]:
# Number of training rows per Credit_Score class (the label that is popped
# off as `target` in the preprocessing step).
train['Credit_Score'].value_counts()

Unnamed: 0_level_0,count
Credit_Score,Unnamed: 1_level_1
2,5237
1,2978
3,1785


## 데이터 전처리

In [74]:
# Split the label off the feature frame. `pop` removes 'Credit_Score' from
# `train` in place, so running this cell a second time without reloading the
# data raises KeyError because the column is already gone.
target = train.pop('Credit_Score')
cols = train.columns[train.dtypes == object] # columns whose dtype is object
cols

Index(['Payment_of_Min_Amount', 'Credit_Mix', 'Payment_Behaviour'], dtype='object')

### 원-핫 인코딩

In [75]:
# Stack train on top of test and encode them together, so both halves end up
# with the same dummy columns; then cut the result back apart by position.
data = pd.concat((train, test), axis=0)
data_oh = pd.get_dummies(data)

# The first len(train) rows are the training rows; the rest are test rows.
# .iloc is positional, so the repeated index labels from the concat do not matter.
train_oh, test_oh = (
    data_oh.iloc[:len(train)].copy(),
    data_oh.iloc[len(train):].copy(),
)
print(train_oh.shape, test_oh.shape)

(10000, 29) (10000, 29)


### 데이터 분할 (훈련, 검증)

In [76]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_oh,
    target,
    test_size=0.2,
    random_state=42,
)

data_list = [X_train, X_val, y_train, y_val]

# The loop variable is deliberately not named `data`: a for-loop target stays
# bound after the loop ends, so that name would silently rebind the
# concatenated train+test frame `data` to the last element (y_val).
for split in data_list:
    print(split.shape)

(8000, 29)
(2000, 29)
(8000,)
(2000,)


## 머신러닝 학습 및 평가

### 1. 랜덤 포레스트

In [77]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with a fixed seed. `fit` returns the estimator
# itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted class labels for the validation split.
rf_pred = rf.predict(X_val)

# Display only the first ten predictions.
rf_pred[:10]

array([1, 3, 1, 3, 2, 2, 3, 2, 2, 3])

In [78]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# Score the random-forest predictions on the validation split.
# Precision and recall are macro-averaged over the classes; ROC AUC is
# computed from predicted class probabilities with the one-vs-one scheme.
accuracy = accuracy_score(y_val, rf_pred)
precision = precision_score(y_val, rf_pred, average='macro')
recall = recall_score(y_val, rf_pred, average='macro')
roc_auc = roc_auc_score(
    y_val,
    rf.predict_proba(X_val),
    multi_class='ovo',
)

metrics_list = [accuracy, precision, recall, roc_auc]

# One rounded value per line, in the order listed above.
print(*(round(value, 2) for value in metrics_list), sep="\n")

0.73
0.71
0.71
0.88


### 2. LightGBM

In [67]:
import lightgbm as lgb

# Reload the raw CSVs so this section starts from untouched frames
# (the earlier preprocessing popped the label column out of `train`).
train, test = (pd.read_csv(url) for url in (train_url, test_url))

# Split the label off the feature frame (mutates `train` in place).
target = train.pop('Credit_Score')
# Object-dtype columns; computed here but not used by the cast below,
# which names the columns explicitly.
cols = train.columns[train.dtypes == object]

# Preprocessing (type conversion): cast the three text columns to pandas
# `category` dtype in both frames.
# NOTE(review): presumably so LightGBM can consume them without one-hot
# encoding — confirm against the LightGBM docs.
train, test = (
    frame.astype({
        'Payment_of_Min_Amount': 'category',
        'Credit_Mix': 'category',
        'Payment_Behaviour': 'category',
    })
    for frame in (train, test)
)

# Train / validation split: 20% held out, fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=42,
)

# Train LightGBM (verbose=-1: suppress log output); `fit` returns the
# estimator, so it is chained onto the constructor.
lgbmc = lgb.LGBMClassifier(random_state=42, verbose=-1).fit(X_train, y_train)

# Predicted class labels for the validation split.
pred = lgbmc.predict(X_val)
pred

array([1, 3, 1, ..., 3, 2, 2])

In [68]:
# 평가하기
accuracy = accuracy_score(y_val, pred)
precision = precision_score(y_val, pred, average='macro')
recall = recall_score(y_val, pred, average='macro')

metrics_list = [accuracy, precision, recall, roc_auc]

for metric in metrics_list:
  print(round(metric, 2))

0.73
0.7
0.7
0.88


## 테스트 데이터 예측

In [69]:
# Predict class labels for the test frame with the LightGBM model.
# Note: this rebinds `pred`, replacing the validation-split predictions.
pred = lgbmc.predict(test)
pred

array([2, 1, 1, ..., 1, 1, 2])

In [80]:
# Predict class labels for the one-hot encoded test frame with the random forest.
# Note: this rebinds `rf_pred`, replacing the validation-split predictions.
rf_pred = rf.predict(test_oh)
rf_pred

array([3, 1, 1, ..., 1, 1, 2])

In [70]:
# Wrap the LightGBM test predictions in a single-column frame named 'pred'.
result = pd.DataFrame(pred, columns=['pred'])
result

Unnamed: 0,pred
0,2
1,1
2,1
3,2
4,1
...,...
9995,2
9996,2
9997,1
9998,1


In [82]:
# Wrap the random-forest test predictions in a single-column frame named 'rf_pred'.
rf_result = pd.DataFrame(rf_pred, columns=['rf_pred'])
rf_result

Unnamed: 0,rf_pred
0,3
1,1
2,1
3,2
4,1
...,...
9995,2
9996,2
9997,1
9998,1
