In [1]:
# Relative directory that holds the train/test CSV files.
DATA_PATH = "../data/"
# Seed shared by the StratifiedKFold splitter and the reset_seeds() call before training.
SEED = 42

In [2]:
import pandas as pd
import numpy as np

# Load the train and test CSVs and display their shapes.
train = pd.read_csv(f"{DATA_PATH}loan_grade_train.csv")
test = pd.read_csv(f"{DATA_PATH}loan_grade_test.csv")

train.shape, test.shape

((19548, 10), (13033, 9))

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,나이,연간소득,주택소유상태,근로기간,대출목적,대출금액,이자율,신용거래기간,target
0,train_0,26,33996,임대,10.0,투자,7000,8.0,2,0
1,train_1,34,84000,임대,3.0,개인사업,9000,5.79,10,0
2,train_2,22,24702,임대,,교육,2000,13.23,4,2
3,train_3,24,69996,임대,3.0,부채통합,2500,12.53,3,1
4,train_4,26,58125,모기지론,10.0,투자,6000,10.25,2,1


# 전처리

In [4]:
# Numeric columns to impute. The means are computed on the training frame only
# so that the same fill values can be applied to both train and test.
cols = ["근로기간", "이자율"]
fill_values = train[cols].mean()
fill_values

근로기간     4.810613
이자율     10.998816
dtype: float64

In [20]:
# Fill missing values in both frames with the training-set means.
# NOTE: relies on `cols` still holding the numeric column list; `cols` is
# rebound to the categorical columns in a later cell.
train[cols] = train[cols].fillna(fill_values)
test[cols] = test[cols].fillna(fill_values)

In [21]:
# Total number of remaining missing values in each frame.
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [23]:
# Keep only the feature columns: skip the first column in both frames and,
# for train, also the last column (`target` in the head() preview).
# The first column is presumably the ID - confirm against the CSV.
train_ft = train.iloc[:, 1:-1].copy()
test_ft = test.iloc[:, 1:].copy()

train_ft.shape, test_ft.shape

((19548, 8), (13033, 8))

In [40]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [39]:
# Names of the object-dtype columns, which are one-hot encoded next.
# NOTE: this rebinds `cols`, which previously held the imputed numeric columns.
cols = train_ft.select_dtypes("object").columns
cols

Index(['주택소유상태', '대출목적'], dtype='object')

In [43]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# features only and reused for the test features so both frames get the same
# encoded columns; categories unseen during fit are encoded as all zeros.
enc = OneHotEncoder(handle_unknown="ignore")
# `.toarray()` is the documented way to densify a SciPy sparse result; the
# `.A` shorthand is deprecated.
train_ohe = enc.fit_transform(train_ft[cols]).toarray()
test_ohe = enc.transform(test_ft[cols]).toarray()
ohe_cols = enc.get_feature_names_out()

train_ft[ohe_cols] = train_ohe
test_ft[ohe_cols] = test_ohe

# Drop the raw categorical columns by rebinding instead of mutating in place.
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)

train_ft.shape, test_ft.shape

((19548, 16), (13033, 16))

In [45]:
# Min-max scale the features using ranges learned from the training set only.
# NOTE: train_ft/test_ft are rebound to the scaler's array output here, so
# this cell should not be re-run without re-running the cells above it.
scaler = MinMaxScaler()
train_ft = scaler.fit_transform(train_ft)
test_ft = scaler.transform(test_ft)

In [57]:
target = train["target"].to_numpy() # multi-class targets are kept as a 1-D array
target.shape

(19548,)

In [58]:
# Check the dtypes of the feature arrays and the target array.
train_ft.dtype, test_ft.dtype, target.dtype

(dtype('float64'), dtype('float64'), dtype('int64'))

# 데이터셋 클래스

In [1]:
import random, os
import torch

def reset_seeds(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every library.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of the
    # running process was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Disable the cuDNN auto-tuner, which may otherwise pick different
    # algorithms from run to run.
    torch.backends.cudnn.benchmark = False

In [66]:
class LoanGradeDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one sample as a dict of tensors.

    Each item is ``{"x": features}`` and, when labels were supplied,
    additionally ``"y": label``.

    Args:
        x: Feature container indexed by row; must expose ``.shape``
            (a 2-D NumPy array in this notebook).
        y: Optional label container indexed by row. ``None`` for unlabeled data.
    """

    def __init__(self, x, y=None):
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of rows in ``x``."""
        n_rows = self.x.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        # torch.Tensor(...) yields the default float tensor type for the features.
        sample = {"x": torch.Tensor(self.x[idx])}
        if self.y is None:
            return sample

        # torch.tensor(...) infers the dtype from the data, so integer class
        # labels stay int64 as required for multi-class classification.
        sample["y"] = torch.tensor(self.y[idx])
        return sample

In [67]:
# Smoke test: draw one batch of two samples through the Dataset/DataLoader.
dataset = LoanGradeDataset(train_ft, target)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)
batch = next(iter(dataloader))
batch

{'x': tensor([[0.0484, 0.0147, 0.0813, 0.1884, 0.1546, 0.0000, 0.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000],
         [0.1129, 0.0393, 0.0244, 0.2464, 0.0222, 0.2857, 0.0000, 0.0000, 0.0000,
          1.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]),
 'y': tensor([0, 0])}

# 모델 클래스

In [76]:
# Number of distinct target classes; the model's output layer needs one node per class.
np.unique(target).shape

(7,)

In [78]:
class Net(torch.nn.Module):
    """Small fully connected classifier: in_features -> 8 -> 4 -> n_classes.

    Args:
        in_features: Number of input features per sample.
        n_classes: Number of output logits (one per class). Defaults to 7,
            the number of distinct target values in this notebook.
    """

    def __init__(self, in_features, n_classes=7):
        super().__init__()

        # Attribute names are part of the state_dict keys used by saved
        # checkpoints, so they must not be renamed.
        self.hidden1 = torch.nn.Linear(in_features, 8)
        self.relu1 = torch.nn.ReLU()
        self.hidden2 = torch.nn.Linear(8, 4)
        self.relu2 = torch.nn.ReLU()
        # For multi-class problems the output layer has one node per class.
        self.output_layer = torch.nn.Linear(4, n_classes)

    def forward(self, x):
        """Return raw class logits; no softmax is applied here."""
        x = self.hidden1(x)
        x = self.relu1(x)
        x = self.hidden2(x)
        x = self.relu2(x)

        return self.output_layer(x)

In [80]:
# Smoke test: forward the sample batch through an untrained model.
model = Net(train_ft.shape[1])
pred = model(batch["x"])
pred

tensor([[ 0.3055,  0.0009,  0.2480, -0.1969, -0.6280, -0.5311,  0.3495],
        [ 0.3071,  0.0026,  0.2617, -0.1800, -0.6460, -0.5442,  0.3594]],
       grad_fn=<AddmmBackward0>)

# 학습 loop 함수

In [92]:
def train_loop(dataloader, model, loss_function, optimizer, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches; each batch is a dict with tensor
            entries ``"x"`` (inputs) and ``"y"`` (targets).
        model: Module to optimise; it is switched to training mode.
        loss_function: Callable ``(predictions, targets) -> scalar loss tensor``.
        optimizer: Optimizer holding the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        logits = model(batch["x"].to(device))
        labels = batch["y"].to(device)
        loss = loss_function(logits, labels)

        # Clear stale gradients, back-propagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

# 테스트 loop 함수
- 검증 loss 계산 기능 추가

In [90]:
@torch.no_grad()
def test_loop(dataloader, model, loss_function, device):
    epoch_loss = 0
    model.eval()
    pred_list = []
    act = torch.nn.Softmax(dim=1)

    for batch in dataloader:
        pred = model(batch["x"].to(device))
        if batch.get("y") is not None:
            loss = loss_function(pred, batch["y"].to(device))
            epoch_loss += loss.item()

        pred = act(pred) # 각 클래스를 확률값으로 변경
        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)

    return epoch_loss, pred

# 하이퍼파라미터

In [101]:
# Training configuration.
in_features = train_ft.shape[1]  # number of input features after encoding
batch_size = 64
loss_function = torch.nn.CrossEntropyLoss()  # takes raw logits and integer class targets
device = "cuda" if torch.cuda.is_available() else "cpu"
epochs = 100  # upper bound per fold; the training loop may stop earlier
n_splits = 5  # number of cross-validation folds

In [104]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

# Stratified, shuffled K-fold splitter seeded with SEED.
cv = StratifiedKFold(n_splits, shuffle=True, random_state=SEED)

# 학습하기

In [113]:
reset_seeds(SEED)
score_list = []
# Train every fold: the inference step loads one checkpoint per fold
# (model0.pt ... model{n_splits - 1}.pt). Set to True only for a quick
# single-fold sanity check.
is_holdout = False
# Stop a fold after this many consecutive epochs without a better Macro F1.
max_patience = 5

for i, (tri, val) in enumerate(cv.split(train_ft, target)):
    # Training data
    x_train = train_ft[tri]
    y_train = target[tri]
    train_dataset = LoanGradeDataset(x_train, y_train)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Validation data
    x_valid = train_ft[val]
    y_valid = target[val]
    valid_dataset = LoanGradeDataset(x_valid, y_valid)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    # Create a fresh model and optimizer for every fold
    model = Net(in_features).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    best_score = 0
    patience = 0

    for _ in tqdm(range(epochs)):
        train_loss = train_loop(train_dataloader, model, loss_function, optimizer, device)
        valid_loss, pred = test_loop(valid_dataloader, model, loss_function, device)
        pred = np.argmax(pred, axis=1)
        score = f1_score(y_valid, pred, average="macro")
        patience += 1

        # Checkpoint only when the validation Macro F1 improves.
        if score > best_score:
            best_score = score
            patience = 0
            torch.save(model.state_dict(), f"model{i}.pt")

        if patience == max_patience:
            break

    print(f"{i} 번째 폴드 Macro F1: {best_score}")
    score_list.append(best_score)

    if is_holdout:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

0 번째 폴드 Macro F1: 0.5066673707818286


  0%|          | 0/100 [00:00<?, ?it/s]

1 번째 폴드 Macro F1: 0.44251386953326893


  0%|          | 0/100 [00:00<?, ?it/s]

2 번째 폴드 Macro F1: 0.4501695780676352


  0%|          | 0/100 [00:00<?, ?it/s]

3 번째 폴드 Macro F1: 0.46134980144175913


  0%|          | 0/100 [00:00<?, ?it/s]

4 번째 폴드 Macro F1: 0.4880739272469696


In [114]:
# Mean of the best Macro F1 scores recorded for the trained folds.
np.mean(score_list)

0.4697549094142922

# 추론하기

In [115]:
# Unlabeled test set; shuffling is off so predictions stay in row order.
test_dataset = LoanGradeDataset(test_ft)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [125]:
# Ensemble the fold checkpoints: average the class probabilities over all
# folds, then take the most probable class per row.
# Expects model0.pt ... model{n_splits - 1}.pt to exist, i.e. every fold trained.
pred_list = []
for i in range(n_splits):
    model = Net(in_features).to(device)
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    model_params = torch.load(f"model{i}.pt", map_location=device, weights_only=True)
    model.load_state_dict(model_params)
    _, pred = test_loop(test_dataloader, model, loss_function, device)
    pred_list.append(pred)

pred = np.mean(pred_list, axis=0)
pred = np.argmax(pred, axis=1)
pred

array([1, 1, 2, ..., 1, 0, 0])