In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

## 전처리

In [2]:
def preprocessing():
    dataset = pd.read_csv("data/car_evaluation.csv")
    categorical_columns = ["price", "maint", "doors", "persons", "lug_capacity", "safety"]
    for category in categorical_columns:
        dataset[category] = dataset[category].astype("category")

    price = dataset["price"].cat.codes.values
    maint = dataset["maint"].cat.codes.values
    doors = dataset["doors"].cat.codes.values
    persons = dataset["persons"].cat.codes.values
    lug_capacity = dataset["lug_capacity"].cat.codes.values
    safety = dataset["safety"].cat.codes.values
    
    categorical_data =  np.stack([price, maint, doors, persons, lug_capacity, safety],1)
    categorical_data = torch.tensor(categorical_data, dtype=torch.int64)

    categorical_colum_sizes = [len(dataset[column].cat.categories) for column in categorical_columns]
    categorical_embedding_sizes = [(col_size, min(50, (col_size)+1) // 2) for col_size in categorical_colum_sizes]
    
    outputs = dataset["output"].astype("category").cat.codes.values
    outputs = torch.tensor(outputs, dtype=torch.int64).flatten()
    return  categorical_data, outputs,categorical_embedding_sizes

In [3]:
categorical_data, outputs,categorical_embedding_sizes =preprocessing()

## 데이터 훈련,테스트 나누기

In [4]:
X_train, X_test, y_train, y_test = train_test_split( categorical_data, outputs,test_size=0.2,  random_state=42)

## dataset, dataloader

In [5]:
class CarEvaluationDataset(Dataset):
    def __init__(self, categorical_data, outputs):
        self.categorical_data=categorical_data
        self.outputs=outputs
    def __len__(self):
        return len(self.categorical_data)
    def __getitem__(self, idx):
        X=self.categorical_data[idx]
        y=self.outputs[idx]
        return X,y

In [6]:
train_dataset= CarEvaluationDataset(X_train, y_train)
test_dataset=CarEvaluationDataset(X_test, y_test)
train_loader=DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader=DataLoader(test_dataset, batch_size=64, shuffle=False)

## 모델, optimizer, loss_function

In [11]:
class Model(nn.Module):
    def __init__(self, embedding_size, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)
        all_layers = []
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols
        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i
        all_layers.append(nn.Linear(layers[-1], output_size))
        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_categorical):
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)
        x = self.layers(x)
        return x          

In [12]:
model = Model(categorical_embedding_sizes, 4, [200, 100, 50], p=0.4)
model

Model(
  (all_embeddings): ModuleList(
    (0-2): 3 x Embedding(4, 2)
    (3-5): 3 x Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)

In [13]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

## 학습

In [14]:
epochs = 500
aggregated_losses = []
for epoch in range(epochs):
   model.train()
   for x_batch, y_batch in train_loader:
       y_pred=model(x_batch)
       single_loss= loss_function(y_pred, y_batch)
       aggregated_losses.append(single_loss.item())

       optimizer.zero_grad()
       single_loss.backward()
       optimizer.step()

       if (epoch+1)%25==0:
           print(single_loss.item())
    
       

0.641559898853302
0.6264883875846863
0.5969135761260986
0.5080094337463379
0.5432601571083069
0.654242992401123
0.42240089178085327
0.6040340065956116
0.46807193756103516
0.6929337382316589
0.5118901133537292
0.6174202561378479
0.7030670642852783
0.5443251132965088
0.5464232563972473
0.5439098477363586
0.6161820888519287
0.6234215497970581
0.41722631454467773
0.6559208631515503
0.538278341293335
0.5269096493721008
0.5391817688941956
0.638178825378418
0.5419896245002747
0.5316651463508606
0.4670487940311432
0.5842446088790894
0.46199944615364075
0.5388999581336975
0.6017221212387085
0.5442852973937988
0.3886118531227112
0.6581835150718689
0.43312835693359375
0.49796199798583984
0.5686774849891663
0.5693812966346741
0.6117271184921265
0.4214401841163635
0.4332050383090973
0.4922582805156708
0.4540562629699707
0.6923378109931946
0.5036026239395142
0.4557062089443207
0.3417893648147583
0.41791531443595886
0.4774973690509796
0.5376791954040527
0.4725476801395416
0.5757436156272888
0.5554529