In [None]:
import pandas as pd
import numpy as np
import torch

data = pd.read_csv("C:/Users/82109/Documents/car_evaluation.csv")
data.head()

Unnamed: 0,price,maint,doors,persons,lug_capacity,safety,output
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [None]:
# 데이터 전처리 - 1. 범주 특성을 갖는 데이터를 범주형 타입으로 변환
# 2. 범주형 타입을 텐서로 변환
# 범주형 데이터 -> 넘파이 배열 -> 텐서

categorical_columns = ['price', 'maint', 'doors', 'persons', 'lug_capacity', 'safety']
for category in categorical_columns:
    data[category] = data[category].astype('category')

price = data['price'].cat.codes.values    # 범주형 데이터를 숫자로 변환하기 위해 사용
maint = data['maint'].cat.codes.values
doors = data['doors'].cat.codes.values
persons = data['persons'].cat.codes.values
lug_capacity = data['lug_capacity'].cat.codes.values
safety= data['safety'].cat.codes.values

categorical_data = np.stack([price, maint, doors, persons, lug_capacity, safety], 1)
categorical_data[:10]

array([[3, 3, 0, 0, 2, 1],
       [3, 3, 0, 0, 2, 2],
       [3, 3, 0, 0, 2, 0],
       [3, 3, 0, 0, 1, 1],
       [3, 3, 0, 0, 1, 2],
       [3, 3, 0, 0, 1, 0],
       [3, 3, 0, 0, 0, 1],
       [3, 3, 0, 0, 0, 2],
       [3, 3, 0, 0, 0, 0],
       [3, 3, 0, 1, 2, 1]], dtype=int8)

In [None]:
categorical_data = torch.tensor(categorical_data, dtype=torch.int64)
categorical_data[:10]

tensor([[3, 3, 0, 0, 2, 1],
        [3, 3, 0, 0, 2, 2],
        [3, 3, 0, 0, 2, 0],
        [3, 3, 0, 0, 1, 1],
        [3, 3, 0, 0, 1, 2],
        [3, 3, 0, 0, 1, 0],
        [3, 3, 0, 0, 0, 1],
        [3, 3, 0, 0, 0, 2],
        [3, 3, 0, 0, 0, 0],
        [3, 3, 0, 1, 2, 1]])

In [None]:
outputs = pd.get_dummies(data.output)
outputs = outputs.values
outputs = torch.tensor(outputs).flatten()

print(categorical_data.shape)
print(outputs.shape)

torch.Size([1728, 6])
torch.Size([6912])


In [None]:
# 범주형 컬럼을 N차원으로 변환
categorical_column_sizes = [len(data[column].astype('category').cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in
                               categorical_column_sizes]
print(categorical_embedding_sizes)

# (모든 범주형 컬럼의 고유 값 수, 차원의 크기) 형태의 배열로 출력됨

[(4, 2), (4, 2), (4, 2), (3, 2), (3, 2), (3, 2)]


In [None]:
# 데이터셋 분리
total_records = 1728
test = int(total_records*0.2)

train_data = categorical_data[:total_records - test]
test_data = categorical_data[total_records - test:total_records]
train_outputs = outputs[:total_records - test]
test_outputs = outputs[total_records - test:total_records]

print(len(train_data))
print(len(train_outputs))
print(len(test_data))
print(len(test_outputs))

1383
1383
345
345


In [None]:
# 모델의 네트워크 생성

import torch.nn as nn

class Model(nn.Module):
    def __init__(self, embedding_size, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)

        all_layers = []
        num_catogorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_catogorical_cols       # 입력층의 크기

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i

        all_layers.append(nn.Linear(layers[-1], output_size))
        self.layers = nn.Sequential(*all_layers)      # 신경망의 모든 계층이 순차적으로 실행되도록 모든 계층에 대한 목록을 nn.Sequential 클래스로 전달

    def forward(self, x_categorical):
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
             embeddings.append(e(x_categorical[:, i]))
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)
        x = self.layers(x)
        return x

In [None]:
# 모델 훈련

# (범주형 칼럼의 임베딩 크기, 출력 크기, 은닉층의 뉴런, 드롭아웃) - 출력층에 4개의 뉴런이 포함되도록 지정
model = Model(categorical_embedding_sizes, 4, [200, 100, 50], p=0.4)
print(model)

Model(
  (all_embeddings): ModuleList(
    (0): Embedding(4, 2)
    (1): Embedding(4, 2)
    (2): Embedding(4, 2)
    (3): Embedding(3, 2)
    (4): Embedding(3, 2)
    (5): Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)


In [None]:
# 손실 함수와 옵티마이저 지정
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
epochs = 500
aggregated_losses = []
train_outputs = train_outputs.to(device=device, dtype=torch.int64)

for i in range(epochs):
    i += 1
    y_pred = model(train_data)
    single_loss = loss_function(y_pred, train_outputs)
    aggregated_losses.append(single_loss)

    if i%25 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

epoch:   1 loss: 1.54265475
epoch:  26 loss: 1.31333125
epoch:  51 loss: 1.22091925
epoch:  76 loss: 1.13867676
epoch: 101 loss: 1.02842546
epoch: 126 loss: 0.89785063
epoch: 151 loss: 0.80984950
epoch: 176 loss: 0.74143010
epoch: 201 loss: 0.68771511
epoch: 226 loss: 0.65044564
epoch: 251 loss: 0.63270903
epoch: 276 loss: 0.61881870
epoch: 301 loss: 0.60940146
epoch: 326 loss: 0.60052449
epoch: 351 loss: 0.59459430
epoch: 376 loss: 0.58830214
epoch: 401 loss: 0.58599389
epoch: 426 loss: 0.57882488
epoch: 451 loss: 0.58171344
epoch: 476 loss: 0.57825309
epoch: 500 loss: 0.5734215975


In [None]:
# test dataset으로 모델 예측

test_outputs = test_outputs.to(device=device, dtype=torch.int64)

with torch.no_grad():
    y_val = model(test_data)
    loss = loss_function(y_val, test_outputs)
print(f'Loss: {loss:.8f}')

Loss: 0.58619040


In [None]:
print(y_val[:5])

tensor([[ 2.2071,  1.1788, -3.0565, -2.8558],
        [ 2.5270,  1.4830, -3.1125, -3.3577],
        [ 4.2722,  2.8437, -5.3717, -5.1644],
        [ 3.2329,  1.9945, -3.7247, -3.6317],
        [ 2.4483,  1.5712, -2.7059, -2.8627]])


In [None]:
# 가장 큰 값을 갖는 인덱스 출력

y_val = np.argmax(y_val, axis=1)
print(y_val[:5])      # 인덱스가 0인 값이 인덱스가 1인 값보다 크므로 처리된 출력은 0

tensor([0, 0, 0, 0, 0])


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(test_outputs,y_val))
print(classification_report(test_outputs,y_val))
print(accuracy_score(test_outputs, y_val))

[[257   1   1]
 [ 86   0   0]
 [  0   0   0]]
              precision    recall  f1-score   support

           0       0.75      0.99      0.85       259
           1       0.00      0.00      0.00        86
           3       0.00      0.00      0.00         0

    accuracy                           0.74       345
   macro avg       0.25      0.33      0.28       345
weighted avg       0.56      0.74      0.64       345

0.744927536231884
