In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from google.colab import drive
drive.mount('/content/drive')

DATA_PATH = "/content/drive/MyDrive/data/"

SEED = 42  # random seed shared by the whole notebook

# Medical-cost prediction data
train = pd.read_csv(f"{DATA_PATH}insurance_train.csv")
test = pd.read_csv(f"{DATA_PATH}insurance_test.csv")

# Encode the binary categorical columns as 0/1.
# NOTE: Series.map yields NaN for any value missing from the dict.
sex_dict = {"male": 1, "female": 0}
smoker_dict = {"yes":1, "no": 0}
train["sex"] = train["sex"].map(sex_dict)
train["smoker"] = train["smoker"].map(smoker_dict)
test["sex"] = test["sex"].map(sex_dict)
test["smoker"] = test["smoker"].map(smoker_dict)

# Select the feature columns. The target is dropped by name so the result does
# not depend on where the column happens to sit in the CSV.
train_ft = train.drop(columns="target")
test_ft = test.copy()

# One-hot encode the multi-class categorical columns (fitted on train only).
cols = ['region']
enc = OneHotEncoder(handle_unknown = 'ignore')
enc.fit(train_ft[cols])


def _append_one_hot(frame):
    """Return `frame` with `cols` replaced by their one-hot encoded columns.

    The encoded frame reuses `frame.index` so that `pd.concat` lines rows up
    correctly even if the index is not a default RangeIndex.
    """
    encoded = pd.DataFrame(
        enc.transform(frame[cols]).toarray(),
        columns=enc.get_feature_names_out(),
        index=frame.index,
    )
    return pd.concat([frame, encoded], axis=1).drop(columns=cols)


train_ft = _append_one_hot(train_ft)
test_ft = _append_one_hot(test_ft)

# Min-Max scaling: statistics come from the training features only and are
# then applied to both splits. The results are NumPy arrays.
scaler = MinMaxScaler()
scaler.fit(train_ft)
train_ft = scaler.transform(train_ft)
test_ft = scaler.transform(test_ft)

# Target values as an (n_samples, 1) column vector
target = train["target"].to_numpy().reshape(-1,1)

train_ft.shape, test_ft.shape, target.shape

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


((936, 9), (402, 9), (936, 1))

# 재현성 함수(Reproducibility)

In [None]:
import random, os
import torch
def reset_seeds(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # NOTE: changing PYTHONHASHSEED at runtime does not re-seed hashing in the
    # already-running interpreter; it only reaches processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not just the current one (silently ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run; turn it off so
    # the deterministic flag above is actually effective.
    torch.backends.cudnn.benchmark = False

# 데이터셋 클래스
- 회귀 문제에서는 입력 데이터와 정답 데이터를 모두 float32 텐서로 변환한다

In [None]:
class InsuranceDataset(torch.utils.data.Dataset):
    """Dataset yielding float32 feature (and optional target) tensors.

    Args:
        x: Indexable collection of feature rows (e.g. a 2-D NumPy array).
        y: Optional indexable collection of targets aligned with `x`. Either a
            column vector of shape (n, 1) or a flat sequence of length n.
            When omitted, items carry only the "x" key.
    """

    def __init__(self, x, y=None):
        self.x = x
        self.y = y

    def __len__(self):
        """Number of samples, taken from the feature collection."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return a dict with key "x" and, when targets exist, key "y"."""
        # torch.tensor(..., dtype=...) is used instead of the legacy
        # torch.Tensor(...) constructor, which treats an integer scalar as a
        # size and rejects float scalars.
        item = {"x": torch.tensor(self.x[idx], dtype=torch.float32)}

        if self.y is not None:
            # reshape(-1) turns a scalar target (flat `y`) into shape (1,), so
            # batches come out as (batch, 1) whether `y` was 1-D or a column.
            item["y"] = torch.tensor(self.y[idx], dtype=torch.float32).reshape(-1)

        return item

In [None]:
# Wrap the scaled training features and targets, then inspect the last sample.
dt = InsuranceDataset(train_ft, target)
dt[-1]

{'x': tensor([0.3043, 1.0000, 0.3785, 0.2000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000]),
 'y': tensor([4074.4536])}

In [None]:
# Batch the dataset two samples at a time (in order) and pull the first batch.
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
batch = next(iter(dl))
batch

{'x': tensor([[0.4130, 1.0000, 0.5443, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000],
         [0.0000, 1.0000, 0.3608, 0.2000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000]]),
 'y': tensor([[19214.7051],
         [ 1719.4363]])}

# 모델 클래스

In [None]:
class Net(torch.nn.Module):
    """Fully connected regressor: n_features -> 12 -> 8 -> 4 -> 1.

    Each hidden stage is Linear -> BatchNorm1d -> LeakyReLU; the final stage
    is a plain Linear layer producing one value per sample.
    """

    def __init__(self, n_features):
        super().__init__()
        widths = [n_features, 12, 8, 4]

        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.append(torch.nn.Linear(fan_in, fan_out))
            stages.append(torch.nn.BatchNorm1d(fan_out))
            stages.append(torch.nn.LeakyReLU())
        # Output head: single regression value.
        stages.append(torch.nn.Linear(widths[-1], 1))

        self.seq = torch.nn.Sequential(*stages)

    def forward(self, x):
        """Run `x` through the sequential stack and return the prediction."""
        out = self.seq(x)
        return out

In [None]:
# Build the network sized to the feature count and run the sample batch through it.
model = Net(train_ft.shape[1])
model(batch["x"])

tensor([[-0.7572],
        [-0.3645]], grad_fn=<AddmmBackward0>)

# 학습 loop 함수화
```
데이터 로더 객체
모델 객체
손실함수 객체
옵티마이저 객체
장치이동 문자열을 담은 변수
```

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    """Train `model` for one pass over `dataloader`.

    Args:
        dataloader: Sized iterable of batches; each batch is a dict with
            tensor entries "x" (inputs) and "y" (targets).
        model: Module to optimise. It is moved to `device` in place.
        loss_fn: Callable taking (prediction, target) and returning a scalar
            loss tensor.
        optimizer: Optimizer constructed over `model`'s parameters.
        device: Device string or object the computation runs on.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    # The batches are sent to `device` below, so the model has to live there
    # too; without this a CUDA runtime fails with a device mismatch. The move
    # is in place and a no-op when the model is already on `device`.
    model.to(device)
    model.train()  # switch to training mode
    epoch_loss = 0

    for batch in dataloader:  # one mini-batch at a time
        pred = model(batch["x"].to(device))  # forward pass
        loss = loss_fn(pred, batch["y"].to(device))  # batch loss

        optimizer.zero_grad()  # clear gradients accumulated by the previous step
        loss.backward()  # back-propagate
        optimizer.step()  # update the weights

        epoch_loss += loss.item()  # accumulate batch losses for the epoch mean

    epoch_loss /= len(dataloader)  # mean over batches

    return epoch_loss

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Adam with its default hyperparameters over all model parameters.
optimizer = torch.optim.Adam(model.parameters())
# Mean squared error loss.
loss_fn = torch.nn.MSELoss()

In [None]:
# Run a single training epoch; the displayed value is the mean batch loss.
train_loop(dl, model, loss_fn, optimizer, device)

324660137.7633547


# 테스트 loop 함수화
- 검증 loss 계산하는 기능 추가

```
데이터 로더 객체
모델 객체
손실함수 객체
장치이동 문자열을 담은 변수
```

In [None]:
@torch.no_grad() # with 문과 같은 역할
def test_loop(dataloader, model, loss_fn, device):
    model.eval() # 평가 모드
    epoch_loss = 0
    pred_list = []

    for batch in dataloader:
        pred = model(batch["x"].to(device))
        if batch.get("y") is not None: # y 키에 텐서가 있을 경우만 loss 계산
            loss = loss_fn(pred, batch["y"].to(device))
            epoch_loss += loss.item()

        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    return epoch_loss, pred

In [None]:
# The test features come without targets, so the dataset is built from x only.
# NOTE: this rebinds `dt` and `dl`, which referred to the training data until now.
dt = InsuranceDataset(test_ft)
dl = torch.utils.data.DataLoader(dt, batch_size=32, shuffle=False)

In [None]:
# Only the predictions matter here; the loss slot is discarded because the
# test batches carry no targets.
_, pred = test_loop(dl, model, loss_fn, device)
pred.shape

(402, 1)

# holdout 방식으로 학습 및 검증

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out part of the training data for validation (default split size),
# seeded with SEED so the partition is repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(train_ft, target, random_state=SEED)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((702, 9), (234, 9), (702, 1), (234, 1))

# 하이퍼파라미터

In [None]:
batch_size = 32  # samples per mini-batch
loss_fn = torch.nn.L1Loss()  # mean absolute error
epoch = 20  # number of passes over the training split
n_features = x_train.shape[1]  # input width, taken from the training matrix

In [None]:
# Seed every RNG first so weight initialisation and batch shuffling are
# repeatable from run to run.
reset_seeds(SEED)

train_dt = InsuranceDataset(x_train, y_train)
train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

valid_dt = InsuranceDataset(x_valid, y_valid)
valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

# Place the model on the target device before building the optimizer; the
# loops send every batch to `device`, so the parameters must live there too.
model = Net(n_features).to(device)
optimizer = torch.optim.Adam(model.parameters())

# NOTE(review): in the recorded run both losses stay around 13,1xx-13,4xx for
# all 20 epochs, i.e. the network barely learns. The targets are unscaled while
# Adam runs with its default learning rate; consider scaling the target or
# tuning the learning rate / epoch count.
for _ in range(epoch):
    train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
    valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)
    print(train_loss, valid_loss)

13130.258655894886 13428.993225097656
13136.017888849432 13428.663452148438
13141.817693536932 13428.500122070312
13127.648082386364 13428.386840820312
13124.155628551136 13428.235473632812
13128.85000887784 13428.080444335938
13124.741477272728 13427.936279296875
13128.841663707386 13427.805419921875
13119.435413707386 13427.65576171875
13138.929066051136 13427.498779296875
13128.934348366478 13427.322143554688
13130.57284268466 13427.128051757812
13135.081276633522 13426.95458984375
13124.980823863636 13426.765258789062
13122.27490234375 13426.581726074219
13123.333185369318 13426.361328125
13123.519309303978 13426.035461425781
13121.38485440341 13425.869079589844
13127.974964488636 13425.651672363281
13120.546164772728 13425.431457519531
