In [2]:
import random, os
import torch
import pandas as pd
import numpy as np

def reset_seeds(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), then configures cuDNN so that it neither
    uses nondeterministic kernels nor auto-tunes kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this line.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU (manual_seed only seeds the
    # current one) and is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False

In [5]:
# Notebook-wide configuration: random seed, data directory and compute device.
SEED = 42
DATA_PATH = "../data/"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
# Read the train / test CSV files from DATA_PATH and show their shapes.
train_csv = f"{DATA_PATH}campaign_train.csv"
test_csv = f"{DATA_PATH}campaign_test.csv"
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

train.shape, test.shape

((1344, 27), (896, 26))

In [102]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,출생연도,고객_교육수준,고객_결혼여부,고객_소득,고객_자녀수,고객_청소년수,고객_가입날짜,고객_최신구매일_경과기간,고객_와인_구매금액,...,고객_카탈로그_통한_구매횟수,고객_매장방문_구매횟수,고객_지난달_회사사이트_방문횟수,캠페인1_수락여부,캠페인2_수락여부,캠페인3_수락여부,캠페인4_수락여부,캠페인5_수락여부,불만제기,target
0,train_0,1969,박사,기혼,30396.0,1,0,2014-04-30,22,15,...,1,2,7,0,0,1,0,0,0,1
1,train_1,1967,박사,기혼,36947.0,1,1,2012-08-07,49,88,...,0,4,9,0,0,0,0,0,0,0
2,train_2,1949,석사,사별,47570.0,1,1,2013-05-29,3,67,...,2,2,7,0,0,0,0,0,0,1
3,train_3,1976,석사,기혼,81929.0,1,0,2012-09-29,60,1486,...,4,10,6,0,0,1,0,1,0,1
4,train_4,1982,고졸,기혼,57937.0,0,1,2014-02-16,56,261,...,4,9,3,0,0,0,0,0,0,0


In [103]:
# Feature frames: drop the leading ID column from both frames and the
# trailing target column from the training frame.
# .copy() makes them independent DataFrames; the following cells assign
# columns and drop columns on them, which is ambiguous (SettingWithCopy)
# when done on a slice of `train` / `test`.
train_ft = train.iloc[:, 1:-1].copy()
test_ft = test.iloc[:, 1:].copy()

train_ft.shape, test_ft.shape

((1344, 25), (896, 25))

# 결측치 처리

In [104]:
# Count missing values per column once, then keep the names of the columns
# that contain at least one missing value.
null_counts = train_ft.isnull().sum()
mask = null_counts.gt(0)
cols = null_counts.loc[mask].index
cols

Index(['고객_교육수준', '고객_결혼여부', '고객_소득'], dtype='object')

In [105]:
# Imputation values computed from the training features: the most frequent
# category for the two categorical columns and the mean for income.
e_level = train_ft["고객_교육수준"].mode()[0]
m_status = train_ft["고객_결혼여부"].mode()[0]
s_mean = train_ft["고객_소득"].mean()

In [106]:
# Fill the missing values of both frames with the statistics computed from
# the training features (training frame first, then test frame).
fill_values = {
    "고객_교육수준": e_level,
    "고객_결혼여부": m_status,
    "고객_소득": s_mean,
}
for frame in (train_ft, test_ft):
    for col_name, fill in fill_values.items():
        frame[col_name] = frame[col_name].fillna(fill)

In [107]:
# Sanity check: total count of remaining missing values in each frame.
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

In [108]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [109]:
# Remove the sign-up date column from both feature frames.
train_ft = train_ft.drop(columns="고객_가입날짜")
test_ft = test_ft.drop(columns="고객_가입날짜")

In [110]:
# Names of the object-dtype columns that still need encoding.
cols = train_ft.select_dtypes(include="object").columns
cols

Index(['고객_교육수준', '고객_결혼여부'], dtype='object')

In [111]:
# One-hot encode the categorical columns. The encoder is fitted on the
# training features only; with handle_unknown="ignore" a category that was
# not seen during fitting is encoded as all zeros instead of raising.
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(train_ft[cols])

# Read the generated column names only after the encoder has been fitted.
ohe_cols = enc.get_feature_names_out()

# .toarray() is the documented way to densify the sparse result; the `.A`
# shorthand is not available on every SciPy sparse type / version.
train_ft[ohe_cols] = enc.transform(train_ft[cols]).toarray()
test_ft[ohe_cols] = enc.transform(test_ft[cols]).toarray()

In [112]:
# The original categorical columns are now redundant; remove them and
# confirm that both frames end up with the same number of columns.
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)

train_ft.shape, test_ft.shape

((1344, 32), (896, 32))

In [113]:
# Min-max scale every feature using ranges learned from the training
# features only, then apply the same scaling to the test features.
# Both names are rebound to the scaler's output from here on.
scaler = MinMaxScaler()
scaler.fit(train_ft)
train_ft = scaler.transform(train_ft)
test_ft = scaler.transform(test_ft)

- 정답 데이터

In [114]:
# Labels as a column vector of shape (n_samples, 1).
target = train["target"].to_numpy()[:, np.newaxis]
target.shape

(1344, 1)

# 데이터셋 클래스 만들기

In [115]:
class CampaignDataset(torch.utils.data.Dataset):
    """Map-style dataset serving feature rows and optional labels as float32 tensors.

    Args:
        x: Indexable feature container whose first dimension is the sample
            axis (``x.shape[0]`` is used as the dataset length).
        y: Optional labels aligned with ``x``. When omitted (inference),
            items contain only the ``"x"`` entry.
    """

    def __init__(self, x, y=None):
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return self.x.shape[0]

    @staticmethod
    def _to_float_tensor(value):
        """Convert one indexed sample to a float32 tensor, preserving its shape."""
        if isinstance(value, torch.Tensor):
            return value.to(torch.float32)
        # The legacy torch.Tensor(...) constructor interprets a bare integer
        # as a size, so a scalar label (1-D ``y``) would not be converted to
        # its value. Converting via an ndarray keeps scalars as 0-d tensors.
        return torch.tensor(np.asarray(value), dtype=torch.float32)

    def __getitem__(self, idx):
        """Return ``{"x": features}`` plus ``"y"`` when labels were supplied."""
        item = {"x": self._to_float_tensor(self.x[idx])}
        if self.y is not None:
            item["y"] = self._to_float_tensor(self.y[idx])

        return item

In [116]:
# Smoke test: wrap the training arrays in the dataset and pull one batch of
# two samples to inspect the structure of what the DataLoader yields.
dataset = CampaignDataset(x=train_ft, y=target)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)
batch_iter = iter(dataloader)
batch = next(batch_iter)
batch

{'x': tensor([[0.7188, 0.0431, 0.5000, 0.0000, 0.2222, 0.0100, 0.0000, 0.0029, 0.0000,
          0.0038, 0.0331, 0.1333, 0.0370, 0.0357, 0.1538, 0.3500, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6979, 0.0530, 0.5000, 0.5000, 0.4949, 0.0589, 0.0151, 0.0116, 0.0155,
          0.0038, 0.0801, 0.2667, 0.1111, 0.0000, 0.3077, 0.4500, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 0.0000, 0.0000]]),
 'y': tensor([[1.],
         [0.]])}

# 모델 클래스

In [117]:
class Net(torch.nn.Module):
    """Fully connected network producing one un-activated output per sample.

    Layer widths are ``in_features -> hidden_size -> hidden_size // 2 ->
    hidden_size // 4 -> 1`` with a ReLU after every hidden layer and dropout
    after the first one. No sigmoid is applied to the output.

    Args:
        in_features: Number of input features per sample.
        hidden_size: Width of the first hidden layer. Must be at least 4 so
            that ``hidden_size // 4`` is a non-empty layer.
        dropout: Dropout probability after the first hidden layer.

    Raises:
        ValueError: If ``hidden_size`` is smaller than 4.
    """

    def __init__(self, in_features, hidden_size, dropout=0.5):
        super().__init__()
        if hidden_size < 4:
            # hidden_size // 4 would be 0 and create a zero-width layer.
            raise ValueError(f"hidden_size must be >= 4, got {hidden_size}")

        # Layer order is kept unchanged so state_dict keys stay the same.
        self.seq = torch.nn.Sequential(
            torch.nn.Linear(in_features, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_size, hidden_size // 2),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size // 2, hidden_size // 4),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size // 4, 1)  # output layer
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw outputs."""
        return self.seq(x)

In [118]:
# Smoke test: build a model sized for the feature matrix and run the sample
# batch through it to check the output shape.
model = Net(in_features=train_ft.shape[1], hidden_size=32)
model(batch["x"])

tensor([[-0.0741],
        [-0.0732]], grad_fn=<AddmmBackward0>)

# 손실함수, 옵티마이저

In [119]:
# Adam with default settings over the model's parameters, paired with a
# binary cross-entropy loss that takes raw (un-activated) model outputs.
optimizer = torch.optim.Adam(params=model.parameters())
loss_function = torch.nn.BCEWithLogitsLoss()

# 학습 loop 함수 만들기

In [122]:
def train_loop(dataloader, model, loss_function, optimizer, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches; each batch is indexed with ``"x"``
            and ``"y"`` and both entries are moved to ``device``.
        model: Module to optimise; switched to training mode.
        loss_function: Callable ``(prediction, target) -> loss tensor``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        inputs = batch["x"].to(device)
        labels = batch["y"].to(device)
        batch_loss = loss_function(model(inputs), labels)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)

In [121]:
# Smoke test: run one training pass and display the returned mean batch loss.
train_loop(dataloader, model, loss_function, optimizer, device)

0.4467489694999087

# 검증 또는 테스트 loop 만들기

In [123]:
@torch.no_grad()
def test_loop(dataloader, model, loss_function, device):
    epoch_loss = 0
    model.eval()

    act = torch.nn.Sigmoid()
    pred_list = []
    for batch in dataloader:
        pred = model(batch["x"].to(device))
        if batch.get("y") is not None:
            loss = loss_function(pred, batch["y"].to(device))
            epoch_loss += loss.item()

        pred = act(pred)
        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    pred = np.concatenate(pred_list)
    epoch_loss /= len(dataloader)

    return epoch_loss, pred

# 조합해서 K-fold 학습 수행

In [149]:
# Hyperparameters for the K-fold training run.
n_splits = 5  # K, the number of folds
epochs = 100
batch_size = 32
hidden_size = 16
in_features = train_ft.shape[1]
loss_function = torch.nn.BCEWithLogitsLoss()

In [150]:
from sklearn.model_selection import KFold
# Shuffled K-fold splitter with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [151]:
from sklearn.metrics import roc_auc_score

In [164]:
# K-fold training: for every fold a fresh model is trained with early
# stopping on validation ROC AUC, and the best weights are written to
# "model{i}.pt". The best AUC of each fold is collected in score_list.
# When is_holdout is True, only the first fold is run.
is_holdout = False
reset_seeds(SEED)
score_list = []

for i, (tri, val) in enumerate(cv.split(train_ft)):
    # Training data
    x_train = train_ft[tri]
    y_train = target[tri]
    train_dataset = CampaignDataset(x_train, y_train)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Validation data
    x_valid = train_ft[val]
    y_valid = target[val]
    valid_dataset = CampaignDataset(x_valid, y_valid)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    # Create the model and optimizer
    model = Net(in_features, hidden_size).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    patience = 0 # counter used for the early-stopping condition
    best_score = 0 # current best score / for mse, mae etc. use np.inf
    for _ in range(epochs):
        train_loss = train_loop(train_dataloader, model, loss_function, optimizer, device)
        valid_loss, pred = test_loop(valid_dataloader, model, loss_function, device)
        score = roc_auc_score(y_valid, pred)
        # Counts epochs since the last improvement; reset below on a new best.
        patience += 1

        if score > best_score:
            best_score = score
            patience = 0
            # Checkpoint the weights of the best epoch seen so far for this fold.
            torch.save(model.state_dict(), f"model{i}.pt")

        # Stop after 5 consecutive epochs without a new best score.
        if patience == 5:
            break

    score_list.append(best_score)
    print(f"AUC 최고점수: {best_score}")

    if is_holdout:
        break

AUC 최고점수: 0.8624601205246366
AUC 최고점수: 0.8289260658391798
AUC 최고점수: 0.8605150214592275
AUC 최고점수: 0.7854700854700855
AUC 최고점수: 0.9224411951684679


In [162]:
# Average of the best validation AUC across folds.
np.asarray(score_list).mean()

0.8519624976923195

# 추론하기

In [165]:
# Unlabelled test set; order is preserved so predictions line up with rows.
test_dataset = CampaignDataset(x=test_ft)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [168]:
# Predict the test set with the best checkpoint of every fold.
pred_list = []

for i in range(n_splits):
    model = Net(in_features, hidden_size).to(device)  # fresh model for fold i
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved from a GPU run still loads on a CPU-only machine.
    state_dict = torch.load(f"model{i}.pt", map_location=device, weights_only=True)
    model.load_state_dict(state_dict)  # apply the trained weights

    # The test batches carry no labels, so the returned loss is unused.
    _, pred = test_loop(test_dataloader, model, loss_function, device)
    pred_list.append(pred)

In [175]:
# Ensemble: average the per-fold probabilities element-wise.
pred = np.stack(pred_list).mean(axis=0)
pred.shape, test_ft.shape

((896, 1), (896, 32))

In [179]:
# Threshold the averaged probabilities at 0.5 to obtain 0/1 labels.
# NOTE(review): this displays the whole array; consider showing a slice.
(pred > 0.5).astype(int)

array([[0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    