In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
dataset = pd.read_csv('/Users/t08/works/deep_advance/data/car_evaluation.csv')
dataset.head()

Unnamed: 0,price,maint,doors,persons,lug_capacity,safety,output
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 8
fig_size[1] = 6
plt.rcParams["figure.figsize"] = fig_size
dataset.output.value_counts().plot(kind='pie', autopct ='%0.05f%%', 
colors =['lightblue','lightgreen', 'orange', 'pink'], explode = (0.05, 0.05, 0.05, 0.05))

<Axes: ylabel='count'>

In [4]:
categorical_columns = ['price', 'maint', 'doors', 'persons', 'lug_capacity', 'safety']
for category in categorical_columns:
    dataset[category] = dataset[category].astype('category')
price = dataset['price'].cat.codes.values
maint = dataset['maint'].cat.codes.values
doors = dataset["doors"].cat.codes.values
persons = dataset['persons'].cat.codes.values
lug_capacity = dataset['lug_capacity'].cat.codes.values
safety = dataset['safety'].cat.codes.values

categorical_data = np.stack([price, maint, doors, persons, lug_capacity, safety], 1)
categorical_data[:10]

array([[3, 3, 0, 0, 2, 1],
       [3, 3, 0, 0, 2, 2],
       [3, 3, 0, 0, 2, 0],
       [3, 3, 0, 0, 1, 1],
       [3, 3, 0, 0, 1, 2],
       [3, 3, 0, 0, 1, 0],
       [3, 3, 0, 0, 0, 1],
       [3, 3, 0, 0, 0, 2],
       [3, 3, 0, 0, 0, 0],
       [3, 3, 0, 1, 2, 1]], dtype=int8)

In [5]:
categorical_data = torch.tensor(categorical_data, dtype= torch.int64)
categorical_data[:10]

tensor([[3, 3, 0, 0, 2, 1],
        [3, 3, 0, 0, 2, 2],
        [3, 3, 0, 0, 2, 0],
        [3, 3, 0, 0, 1, 1],
        [3, 3, 0, 0, 1, 2],
        [3, 3, 0, 0, 1, 0],
        [3, 3, 0, 0, 0, 1],
        [3, 3, 0, 0, 0, 2],
        [3, 3, 0, 0, 0, 0],
        [3, 3, 0, 1, 2, 1]])

In [6]:
outputs = pd.get_dummies(dataset.output)
outputs = outputs.values
outputs= torch.tensor(outputs).flatten()

print(categorical_data.shape)
print(outputs.shape)

torch.Size([1728, 6])
torch.Size([6912])


In [7]:
categorical_columns_sizes = [len(dataset[column].cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size,min(50, (col_size + 1)//2)) for col_size in categorical_columns_sizes]
print(categorical_embedding_sizes)

[(4, 2), (4, 2), (4, 2), (3, 2), (3, 2), (3, 2)]


In [8]:
total_records = 1728
test_records = int(total_records * .2)

categorical_train_data = categorical_data[: total_records - test_records]
categorical_test_data = categorical_data[total_records - test_records:total_records]
train_outputs = outputs[:total_records - test_records]
test_outputs = outputs[total_records - test_records:total_records]

In [9]:
print(len(categorical_train_data))
print(len(train_outputs))
print(len(categorical_test_data))
print(len(test_outputs))

1383
1383
345
345


In [10]:
class Model(nn.Module):
    def __init__(self, embedding_size, output_size, layers, p=0.4):
        """
        embedding_size: list of (num_categories, embed_dim)
        output_size: 최종 출력 차원 (클래스 수 등)
        layers: [히든1, 히든2, ...]
        """
        super().__init__()
        self.all_embeddings = nn.ModuleList(
            [nn.Embedding(num_cat, emb_dim) for num_cat, emb_dim in embedding_size]
        )
        self.embedding_dropout = nn.Dropout(p)

        # 임베딩들을 concat할 것이므로 입력 차원은 emb_dim들의 합
        input_size = sum(emb_dim for _, emb_dim in embedding_size)

        all_layers = []
        for hidden_dim in layers:
            all_layers.append(nn.Linear(input_size, hidden_dim))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(hidden_dim))
            all_layers.append(nn.Dropout(p))
            # 다음 Linear의 입력 차원은 방금 만든 레이어의 출력차원
            input_size = hidden_dim

        all_layers.append(nn.Linear(layers[-1] if layers else input_size, output_size))
        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_categorical):
        # x_categorical: [batch, num_categorical_cols] (dtype: long)
        # 각 컬럼을 해당 임베딩에 통과 → concat
        embedded = []
        for i, e in enumerate(self.all_embeddings):
            embedded.append(e(x_categorical[:, i]))
        x = torch.cat(embedded, dim=1)            # [batch, sum(emb_dims)]
        x = self.embedding_dropout(x)
        x = self.layers(x)                        # [batch, output_size]
        return x



In [11]:
model = Model(categorical_embedding_sizes, 4 ,[200,100,50], p =0.4)
print(model)

Model(
  (all_embeddings): ModuleList(
    (0-2): 3 x Embedding(4, 2)
    (3-5): 3 x Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)


In [12]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [13]:
if torch.cuda.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [15]:
epochs = 500
aggregated_losses = []
train_outputs = train_outputs.to(device=device, dtype=torch.int64)
for i in range(epochs):
    i += 1
    y_pred = model(categorical_train_data).to(device)
    single_loss = loss_function(y_pred, train_outputs)
    aggregated_losses.append(single_loss)
    
    if i%20 == 1:
        print(f'epoch : {i:3} loss: {single_loss.item():10.8f}')
    
    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()
print(f'epoch : {i:3} loss: {single_loss.item():10.10f}')

epoch :   1 loss: 0.57385707
epoch :  21 loss: 0.55847621
epoch :  41 loss: 0.56392395
epoch :  61 loss: 0.56484187
epoch :  81 loss: 0.56492251
epoch : 101 loss: 0.55930495
epoch : 121 loss: 0.55831927
epoch : 141 loss: 0.56017172
epoch : 161 loss: 0.55761820
epoch : 181 loss: 0.54638219
epoch : 201 loss: 0.55394483
epoch : 221 loss: 0.55148041
epoch : 241 loss: 0.53899449
epoch : 261 loss: 0.54555261
epoch : 281 loss: 0.54546249
epoch : 301 loss: 0.55208701
epoch : 321 loss: 0.54053849
epoch : 341 loss: 0.53296024
epoch : 361 loss: 0.54515284
epoch : 381 loss: 0.53014880
epoch : 401 loss: 0.53657115
epoch : 421 loss: 0.52979082
epoch : 441 loss: 0.52551442
epoch : 461 loss: 0.51401079
epoch : 481 loss: 0.51657969
epoch : 500 loss: 0.5032753944
