In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error

In [2]:
# 데이터 로드
housing = datasets.load_boston()
X = pd.DataFrame(housing['data'], columns=housing['feature_names'])
y = pd.DataFrame(housing['target'], columns=['Target'])


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [3]:
X = X.to_numpy()
y = y.to_numpy().reshape(-1,1)

In [5]:
# Custom Dataset class
class TensorData(Dataset):
  def __init__(self, x_data, y_data):
    self.x_data = torch.FloatTensor(x_data)
    self.y_data = torch.FloatTensor(y_data)
    self.len = self.y_data.shape[0]

  def __getitem__(self,index):
    return self.x_data[index], self.y_data[index]

  def __len__(self):
    return self.len

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7)
trainset = TensorData(X_train, y_train)
testset = TensorData(X_test, y_test)
# trainset은 교차검증을 위해 미리 DataLoader을 정의하지 않음
testloader = DataLoader(testset, batch_size=32, shuffle=False)

In [7]:
# 모델 구축
class Regressor(nn.Module):
  def __init__(self):
    super().__init__()
    self.fc1= nn.Linear(13,50,bias=True)
    self.fc2= nn.Linear(50,30,bias=True)
    self.fc3= nn.Linear(30,1, bias=True)

  def forward(self,x):
    x = self.fc1(x)
    x = self.fc2(x)
    x = self.fc3(x)
    return x

In [9]:
# 손실함수와 교차 검증 정의
kfold = KFold(n_splits=3, shuffle=True)
criterion = nn.MSELoss()

In [10]:
# 평가함수 정의
def evaluation(dataloader):

  predictions = torch.tensor([],dtype=torch.float)
  actual = torch.tensor([], dtype=torch.float)

  with torch.no_grad():
    model.eval()
    for data in dataloader:
      inputs, values = data
      outputs = model(inputs)

      predictions = torch.cat((predictions,outputs),0)
      actual = torch.cat((actual,values),0)
  
  predictions = predictions.numpy()
  actual = actual.numpy()
  rmse = np.sqrt(mean_squared_error(predictions,actual))
  model.train() # 학습시 다시 train()을 사용할 수 있도록
  return rmse

In [11]:
# 교차 검증을 이용한 학습 및 평가
validation_loss = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(trainset)):

  train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
  val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
  trainloader = DataLoader(trainset, batch_size=32, sampler=train_subsampler)
  valloader = DataLoader(trainset, batch_size=32, sampler=val_subsampler)

  model = Regressor()
  optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

  for epoch in range(400):
    for data in trainloader:
      inputs, values = data
      optimizer.zero_grad()

      outputs = model(inputs)
      loss = criterion(outputs,values)
      loss.backward()
      optimizer.step()

  train_rmse = evaluation(trainloader)
  val_rmse = evaluation(valloader)

  print(f'k-fold: {fold} | Train Loss: {train_rmse:.4f}, Valid Loss: {val_rmse:.4f}')
  validation_loss.append(val_rmse)

k-fold: 0 | Train Loss: 5.6593, Valid Loss: 6.1461
k-fold: 1 | Train Loss: 5.9469, Valid Loss: 6.8466
k-fold: 2 | Train Loss: 4.8507, Valid Loss: 7.2783


In [12]:
validation_loss = np.array(validation_loss)
mean = np.mean(validation_loss)
std = np.std(validation_loss)

print(f"Validation Score: {mean:.4f}  std: {std:.4f}")

Validation Score: 6.7570  std: 0.4665


In [14]:
# 모델 평가
test_rmse = evaluation(testloader)
print(test_rmse)

7.1130323


# 모델 구조 및 가중치 확인

In [15]:
import torch
from torch import nn
import torch.nn.functional as F
from torchsummary import summary

In [16]:
class Regressor(nn.Module):
  def __init__(self):
    super().__init__()
    self.fc1 = nn.Linear(13,50)
    self.fc2 = nn.Linear(50,30)
    self.fc3 = nn.Linear(30,1)
    self.dropout = nn.Dropout(0.5)

  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = self.dropout(F.relu(self.fc2(x)))
    x = F.relu(self.fc3(x))

    return x

In [17]:
model = Regressor()
print(model)

Regressor(
  (fc1): Linear(in_features=13, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=30, bias=True)
  (fc3): Linear(in_features=30, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [18]:
# 모델 변수
for parameter in model.parameters():
  print(parameter.size())

torch.Size([50, 13])
torch.Size([50])
torch.Size([30, 50])
torch.Size([30])
torch.Size([1, 30])
torch.Size([1])


In [19]:
for name, params in model.named_parameters():
  print(name, params.size())

fc1.weight torch.Size([50, 13])
fc1.bias torch.Size([50])
fc2.weight torch.Size([30, 50])
fc2.bias torch.Size([30])
fc3.weight torch.Size([1, 30])
fc3.bias torch.Size([1])


In [20]:
summary(model, (10,13))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 10, 50]             700
            Linear-2               [-1, 10, 30]           1,530
           Dropout-3               [-1, 10, 30]               0
            Linear-4                [-1, 10, 1]              31
Total params: 2,261
Trainable params: 2,261
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.01
Estimated Total Size (MB): 0.02
----------------------------------------------------------------
