In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from IPython.display import Image

특성 열 사용하기
  - 자동차 연비 예측하기

In [2]:
# Show the chapter figure (13_07.png) inline, rendered 700 pixels wide.
Image(
    url='https://raw.githubusercontent.com/rickiepark/ml-with-pytorch/main/ch13/figures/13_07.png',
    width=700,
)

In [3]:
# Labels assigned to the columns when the Auto MPG file is parsed
# (passed to the CSV reader through its `names` argument).
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]
# Location of the raw Auto MPG data file.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

In [4]:
# Parse the space-separated data file into a DataFrame:
#   * names              -> use our own column labels
#   * na_values="?"      -> "?" entries become NaN
#   * comment="\t"       -> everything after a tab on a line is ignored
#   * skipinitialspace   -> extra spaces after a separator are skipped
df = pd.read_csv(
    url,
    names=column_names,
    sep=" ",
    skipinitialspace=True,
    comment="\t",
    na_values="?",
)
df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [5]:
# Count the missing (NaN) entries in each column.
df.isna().sum(axis=0)

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [6]:
# Discard every row that contains a missing value, then renumber the
# remaining rows from 0 so the index has no gaps.
df = df.dropna().reset_index(drop=True)
df.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
387,27.0,4,140.0,86.0,2790.0,15.6,82,1
388,44.0,4,97.0,52.0,2130.0,24.6,82,2
389,32.0,4,135.0,84.0,2295.0,11.6,82,1
390,28.0,4,120.0,79.0,2625.0,18.6,82,1
391,31.0,4,119.0,82.0,2720.0,19.4,82,1


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
df_train, df_test = train_test_split(df, random_state=1, test_size=0.2)

# Summary statistics computed on the training portion only; shown
# transposed so that each original column becomes one row.
train_stats = df_train.describe()
train_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
Model Year,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


In [8]:
# Continuous features that are standardized to zero mean / unit variance.
numeric_column_names = ["Cylinders", 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

# Work on copies so the un-normalized splits stay available.
dt_train_norm, df_test_norm = df_train.copy(), df_test.copy()

# Standardize each column as (x - mean) / std, using statistics from the
# TRAINING split for both splits so no test information leaks in.
for col_name in numeric_column_names:
    mean = train_stats.loc['mean', col_name]
    std = train_stats.loc['std', col_name]
    # Replace the whole column instead of writing through `.loc[:, col]`:
    # Cylinders is stored as integers, and setting float values into an
    # integer column in place is a deprecated incompatible-dtype assignment
    # in recent pandas. Whole-column assignment simply swaps in the new
    # float column.
    dt_train_norm[col_name] = (dt_train_norm[col_name] - mean) / std
    df_test_norm[col_name] = (df_test_norm[col_name] - mean) / std

dt_train_norm.head()


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
334,27.2,-0.824303,-0.530922,-0.499214,-0.555264,-0.001641,81,1
258,18.6,0.351127,0.345625,0.186457,0.776338,1.099115,78,1
139,29.0,-0.824303,-0.89128,-0.525586,-0.874613,0.291894,74,2
310,37.2,-0.824303,-1.008153,-1.000281,-1.110294,0.255202,80,3
349,33.0,-0.824303,-0.823104,-0.762934,-0.908786,-0.552019,81,2


In [9]:
# Bucketize the 'Model Year' feature into coarse intervals.
# Boundaries between buckets; with right=True a value v falls in bucket i
# when boundaries[i-1] <= v < boundaries[i], i.e.
#   year < 73 -> 0, 73 <= year < 76 -> 1, 76 <= year < 79 -> 2, year >= 79 -> 3
boundaries = torch.tensor([73, 76, 79])

v = torch.tensor(dt_train_norm['Model Year'].values)
dt_train_norm['Model Year Bucketed'] = torch.bucketize(v, boundaries, right=True)

v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(v, boundaries, right=True)

# Register the new column as a model input. Guarded so that re-running this
# cell does not append the name again, which would duplicate the feature
# column in the input tensors built from this list.
if 'Model Year Bucketed' not in numeric_column_names:
    numeric_column_names.append('Model Year Bucketed')

In [10]:
from torch.nn.functional import one_hot

# Number of distinct Origin categories, taken from the TRAINING split so the
# encoding does not depend on which categories happen to occur in the test
# split.
total_origin = len(set(dt_train_norm['Origin']))

# Origin codes are mapped into 0..total_origin-1 with a modulo before
# encoding. num_classes is passed explicitly so the train and test encodings
# always have the same width, even if one split lacks the largest index.
origin_encoded = one_hot(
    torch.from_numpy(dt_train_norm['Origin'].values) % total_origin,
    num_classes=total_origin,
)
x_train_numeric = torch.tensor(dt_train_norm[numeric_column_names].values)
# Final training inputs: numeric features followed by the one-hot columns.
x_train = torch.cat([x_train_numeric, origin_encoded], 1).float()

origin_encoded = one_hot(
    torch.from_numpy(df_test_norm['Origin'].values) % total_origin,
    num_classes=total_origin,
)
x_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)
# Test inputs are assembled exactly like the training inputs.
x_test = torch.cat([x_test_numeric, origin_encoded], 1).float()

In [11]:
# Regression targets: the MPG column of each split as float32 tensors.
y_train = torch.tensor(dt_train_norm['MPG'].values).to(torch.float32)
y_test = torch.tensor(df_test_norm['MPG'].values).to(torch.float32)

In [13]:
from torch.utils.data import DataLoader, TensorDataset

# Mini-batch size used for the regression model.
batch_size = 8
# Pair every feature row with its target value.
train_ds = TensorDataset(x_train, y_train)
# Seed immediately before building the shuffling loader so the batch order
# is repeatable.
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [15]:
# Fully connected regressor: one Linear + ReLU pair per entry of
# hidden_units, followed by a Linear head with a single output.
hidden_units = [8, 4]
input_size = x_train.shape[1]
all_layers = []
for hidden_unit in hidden_units:
    all_layers.extend([nn.Linear(input_size, hidden_unit), nn.ReLU()])
    # The next layer consumes this layer's outputs.
    input_size = hidden_unit
all_layers += [nn.Linear(hidden_units[-1], 1)]
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [16]:
# Stochastic gradient descent over all model parameters, with
# mean-squared error as the regression loss.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

In [17]:
# Train the regression model, reporting the mean per-batch loss of an epoch
# every `log_epochs` epochs.
torch.manual_seed(1)
num_epochs, log_epochs = 200, 20
for epoch in range(num_epochs):
    # Running sum of the batch losses for this epoch.
    loss_hist_train = 0
    for x_batch, y_batch in train_dl:
        # Take column 0 of the model output so it lines up with the 1-D
        # target batch.
        pred = model(x_batch)[:, 0]
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        loss_hist_train += loss.item()
    if epoch % log_epochs != 0:
        continue
    print(f"에포크 {epoch} 손실 {loss_hist_train / len(train_dl) : .4f}")


에포크 0 손실  255.7754
에포크 20 손실  8.3599
에포크 40 손실  7.4143
에포크 60 손실  7.0965
에포크 80 손실  6.7644
에포크 100 손실  6.7852
에포크 120 손실  6.4147
에포크 140 손실  6.8357
에포크 160 손실  7.3180
에포크 180 손실  6.4565


In [20]:
# Evaluate on the held-out split; no gradient tracking is needed for the
# forward pass.
with torch.no_grad():
    pred = model(x_test.float())[:, 0]
    loss = loss_fn(pred, y_test)

# Report mean-squared and mean-absolute error on the test split.
print(f'테스트 MSE : {loss.item(): .4f}')
print(f'테스트 MAE : {nn.L1Loss()(pred,y_test).item():.4f}')

테스트 MSE :  9.0232
테스트 MAE : 1.9787


MNIST 손글씨 숫자 분류하기

In [21]:
import torchvision
from torchvision import transforms

# Directory under which torchvision stores / looks for the MNIST files.
image_path = './'
# Convert every image to a tensor when it is fetched from the dataset.
transform = transforms.Compose([transforms.ToTensor()])

# Training split: downloaded on demand.
mnist_train_dataset = torchvision.datasets.MNIST(
    root=image_path,
    train=True,
    transform=transform,
    download=True,
)
# Test split: download=False, so the files must already be under image_path.
mnist_test_dataset = torchvision.datasets.MNIST(
    root=image_path,
    train=False,
    transform=transform,
    download=False,
)

batch_size = 64
# Seed right before creating the shuffling loader for a repeatable order.
torch.manual_seed(1)
train_dl = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 72824857.67it/s]


Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 72536343.61it/s]

Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz



100%|██████████| 1648877/1648877 [00:00<00:00, 22446030.65it/s]


Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 7459095.05it/s]


Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw



In [23]:
# Classifier for MNIST: flatten each image, then one Linear + ReLU pair per
# entry of hidden_units, then a Linear head with 10 outputs.
hidden_units = [32, 16]

# Shape of one sample image; the product of its three dimensions is the
# width of the flattened input.
image_size = mnist_train_dataset[0][0].shape
input_size = image_size[0] * image_size[1] * image_size[2]

all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    all_layers.extend([nn.Linear(input_size, hidden_unit), nn.ReLU()])
    # The next layer consumes this layer's outputs.
    input_size = hidden_unit
all_layers += [nn.Linear(hidden_units[-1], 10)]
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=10, bias=True)
)

In [26]:
# Cross-entropy loss on the raw model outputs, optimized with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Train for a fixed number of epochs and print the training accuracy of
# each epoch.
torch.manual_seed(1)
num_epochs = 20
for epoch in range(num_epochs):
    # Number of correctly classified training samples seen this epoch.
    accuracy_hist_train = 0
    for x_batch, y_batch in train_dl:
        pred = model(x_batch)
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        # Predicted class = index of the largest output per sample.
        is_correct = (pred.argmax(dim=1) == y_batch).float()
        accuracy_hist_train += is_correct.sum()
    # Turn the count into a fraction of the whole training set.
    accuracy_hist_train /= len(train_dl.dataset)
    print(f"에포크 {epoch} 정확도 {accuracy_hist_train: .4f}")

에포크 0 정확도  0.9616
에포크 1 정확도  0.9654
에포크 2 정확도  0.9679
에포크 3 정확도  0.9698
에포크 4 정확도  0.9723
에포크 5 정확도  0.9740
에포크 6 정확도  0.9759
에포크 7 정확도  0.9766
에포크 8 정확도  0.9778
에포크 9 정확도  0.9786
에포크 10 정확도  0.9801
에포크 11 정확도  0.9811
에포크 12 정확도  0.9819
에포크 13 정확도  0.9834
에포크 14 정확도  0.9835
에포크 15 정확도  0.9847
에포크 16 정확도  0.9854
에포크 17 정확도  0.9865
에포크 18 정확도  0.9869
에포크 19 정확도  0.9874


In [28]:
# Score the whole test set in a single forward pass. The raw pixel data is
# divided by 255 before being fed to the model (presumably to match the
# scaling applied by the ToTensor transform used for training — confirm).
# Gradients are not needed for evaluation, so the pass runs under
# torch.no_grad() to avoid building an autograd graph for every test image.
with torch.no_grad():
    pred = model(mnist_test_dataset.data / 255.)
# A prediction is correct when the index of the largest output equals the label.
is_correct = (torch.argmax(pred, dim=1) == mnist_test_dataset.targets).float()
print(f"테스트 정확도:{is_correct.mean():.4f}")

테스트 정확도:0.9675
