<a href="https://colab.research.google.com/github/hikaruyaku/KIKAGAKU-/blob/master/5_3_%E3%83%87%E3%83%BC%E3%82%BF%E3%82%BB%E3%83%83%E3%83%88%E3%81%AE%E6%BA%96%E5%82%99.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# データセットの準備

### 1. データセットの読み込み

In [2]:
# PyTorch core, its neural-network module, and the functional API
import torch
import torch.nn as nn
import torch.nn.functional as F
# Display the installed PyTorch version so the run can be reproduced
torch.__version__

'1.7.0+cu101'

In [3]:
import sklearn
# Display the installed scikit-learn version so the run can be reproduced
sklearn.__version__

'0.22.2.post1'

In [4]:
from sklearn.datasets import load_breast_cancer

In [5]:
# Load the breast cancer dataset bundled with scikit-learn
breast_cancer = load_breast_cancer()

In [6]:
# Pull the input features and the target labels out of the loaded dataset
x, t = breast_cancer['data'], breast_cancer['target']

In [7]:
# Shape of the input array: one row per sample, one column per feature
x.shape

(569, 30)

In [8]:
# Shape of the target array: one label per sample
t.shape

(569,)

In [9]:
# Check the container types before conversion (both are NumPy arrays at this point)
type(x),type(t)

(numpy.ndarray, numpy.ndarray)

In [11]:
# ndarray -> Tensor
# Inputs become float32 and targets become int64.
x, t = (
    torch.tensor(x, dtype=torch.float32),
    torch.tensor(t, dtype=torch.int64),
)

In [12]:
# Confirm that both objects are now torch.Tensor instances
type(x),type(t)

(torch.Tensor, torch.Tensor)

In [13]:
# Confirm the dtypes requested in the conversion (float32 inputs, int64 targets)
x.dtype,t.dtype

(torch.float32, torch.int64)

In [14]:
# The shapes are unchanged by the conversion to tensors
x.shape, t.shape

(torch.Size([569, 30]), torch.Size([569]))

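PyTorch で学習に用いるテンソルのデータ型は、次のように使い分けます。

- 入力値: `float32`
- 目標値
  - 回帰: `float32`
  - 二値分類: `float32`
  - 多値分類: `int64`

このノートブックでは目標値を `int64` に変換しているため、2 クラスの問題を多値分類と同じ形式で扱っています。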

### 2. DataLoader

In [15]:
# Bundle the input tensor and the target tensor into a single dataset object;
# indexing it returns an (input, target) pair for that sample.
dataset = torch.utils.data.TensorDataset(x,t)
dataset

<torch.utils.data.dataset.TensorDataset at 0x7f60cd30d240>

In [16]:
# Inspect the class of the dataset object
type(dataset)

torch.utils.data.dataset.TensorDataset

In [17]:
# Indexing the dataset returns a tuple (input, target) for that sample
dataset[0]

(tensor([1.7990e+01, 1.0380e+01, 1.2280e+02, 1.0010e+03, 1.1840e-01, 2.7760e-01,
         3.0010e-01, 1.4710e-01, 2.4190e-01, 7.8710e-02, 1.0950e+00, 9.0530e-01,
         8.5890e+00, 1.5340e+02, 6.3990e-03, 4.9040e-02, 5.3730e-02, 1.5870e-02,
         3.0030e-02, 6.1930e-03, 2.5380e+01, 1.7330e+01, 1.8460e+02, 2.0190e+03,
         1.6220e-01, 6.6560e-01, 7.1190e-01, 2.6540e-01, 4.6010e-01, 1.1890e-01]),
 tensor(0))

In [18]:
# First element of the pair: the input feature vector of sample 0
dataset[0][0]

tensor([1.7990e+01, 1.0380e+01, 1.2280e+02, 1.0010e+03, 1.1840e-01, 2.7760e-01,
        3.0010e-01, 1.4710e-01, 2.4190e-01, 7.8710e-02, 1.0950e+00, 9.0530e-01,
        8.5890e+00, 1.5340e+02, 6.3990e-03, 4.9040e-02, 5.3730e-02, 1.5870e-02,
        3.0030e-02, 6.1930e-03, 2.5380e+01, 1.7330e+01, 1.8460e+02, 2.0190e+03,
        1.6220e-01, 6.6560e-01, 7.1190e-01, 2.6540e-01, 4.6010e-01, 1.1890e-01])

In [19]:
# Second element of the pair: the target label of sample 0
dataset[0][1]

tensor(0)

データセットは、役割の異なる次の 3 つに分割します。

- 訓練データ: ネットワークのパラメータの最適化に使用
- 検証データ: ネットワークのハイパーパラメータの最適化に使用
- テストデータ: 訓練済みネットワークの評価に使用

In [26]:
# Decide how many samples go into each subset.
# train : val : test = 60% : 20% : 20%
n_train, n_val = (int(len(dataset) * fraction) for fraction in (0.6, 0.2))
# The test subset takes the remainder, so the three sizes always sum to len(dataset).
n_test = len(dataset) - (n_train + n_val)
n_train, n_val, n_test

(341, 113, 115)

In [27]:
# The split below is random, so fix the seed to make the result reproducible.
torch.manual_seed(0)

<torch._C.Generator at 0x7f6133621bb8>

In [28]:
# Randomly split the dataset into train / validation / test subsets
# containing n_train, n_val and n_test samples respectively.
train,val,test = torch.utils.data.random_split(dataset,[n_train,n_val,n_test])

In [29]:
# Verify that the subset sizes match the requested counts
len(train),len(val),len(test)

(341, 113, 115)

In [30]:
# Batch size: number of samples per mini-batch
batch_size = 10

In [32]:
# Training loader: reshuffle the samples and drop the final incomplete batch so
# that every training step sees exactly `batch_size` samples.
train_loader = torch.utils.data.DataLoader(train, batch_size, shuffle=True, drop_last=True)

# Validation / test loaders: evaluate on every sample, in a fixed order.
# With drop_last=True the trailing incomplete batch (3 of the 113 validation
# samples and 5 of the 115 test samples at batch_size=10) would be silently
# excluded from the metrics, and shuffling brings no benefit for evaluation.
val_loader = torch.utils.data.DataLoader(val, batch_size, shuffle=False, drop_last=False)
test_loader = torch.utils.data.DataLoader(test, batch_size, shuffle=False, drop_last=False)

In [33]:
# Fetch one mini-batch from the training loader to inspect it.
# NOTE(review): this rebinds x and t, which previously held the full dataset
# tensors; cells re-run after this point will see the batch instead.
x,t = next(iter(train_loader))

In [34]:
# Inputs of one batch: (batch_size, number of features)
x.shape

torch.Size([10, 30])

In [35]:
# Targets of one batch: one label per sample in the batch
t.shape

torch.Size([10])

In [36]:
# Input values of the sampled batch
x

tensor([[1.1700e+01, 1.9110e+01, 7.4330e+01, 4.1870e+02, 8.8140e-02, 5.2530e-02,
         1.5830e-02, 1.1480e-02, 1.9360e-01, 6.1280e-02, 1.6010e-01, 1.4300e+00,
         1.1090e+00, 1.1280e+01, 6.0640e-03, 9.1100e-03, 1.0420e-02, 7.6380e-03,
         2.3490e-02, 1.6610e-03, 1.2610e+01, 2.6550e+01, 8.0920e+01, 4.8310e+02,
         1.2230e-01, 1.0870e-01, 7.9150e-02, 5.7410e-02, 3.4870e-01, 6.9580e-02],
        [1.0860e+01, 2.1480e+01, 6.8510e+01, 3.6050e+02, 7.4310e-02, 4.2270e-02,
         0.0000e+00, 0.0000e+00, 1.6610e-01, 5.9480e-02, 3.1630e-01, 1.3040e+00,
         2.1150e+00, 2.0670e+01, 9.5790e-03, 1.1040e-02, 0.0000e+00, 0.0000e+00,
         3.0040e-02, 2.2280e-03, 1.1660e+01, 2.4770e+01, 7.4080e+01, 4.1230e+02,
         1.0010e-01, 7.3480e-02, 0.0000e+00, 0.0000e+00, 2.4580e-01, 6.5920e-02],
        [1.3650e+01, 1.3160e+01, 8.7880e+01, 5.6890e+02, 9.6460e-02, 8.7110e-02,
         3.8880e-02, 2.5630e-02, 1.3600e-01, 6.3440e-02, 2.1020e-01, 4.3360e-01,
         1.3910e+00, 1.740

In [37]:
# Target labels of the sampled batch
t

tensor([1, 1, 1, 0, 1, 0, 1, 0, 1, 0])