adultdデータに対してMLPを用いて分類してみる

In [61]:
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import matplotlib.pyplot as plt
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [119]:
# Notebook-wide settings.
FEATURES_NUM = 81   # number of leading CSV columns used as model inputs
EPOCHS_NUM = 50     # number of passes over the training loader
BATCH_SIZE = 2000   # mini-batch size of the training loader

In [120]:
# Load the full table from CSV; the feature columns and the "y" / "s"
# columns are sliced out of this frame further down.
X = pd.read_csv(filepath_or_buffer="./data/adultd_bindata.csv")

In [121]:
def onehot_encode(y):
    """Encode integer class labels as one-hot row vectors.

    Implemented with numpy because ``OneHotEncoder(n_values=...)``, which the
    previous version relied on, has been removed from scikit-learn.

    Parameters
    ----------
    y : array-like of int
        Non-negative integer labels. Any shape is accepted; the input is
        flattened to one label per sample.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(n_samples, max(y) + 1)`` where row ``i``
        has a single 1.0 in column ``y[i]``. An empty input yields an array
        of shape ``(0, 0)``.

    Raises
    ------
    ValueError
        If any label is negative or not an integer value.
    """
    labels = np.asarray(y).reshape(-1)
    if labels.size == 0:
        return np.zeros((0, 0))

    as_int = labels.astype(np.int64)
    if np.any(as_int != labels) or np.any(as_int < 0):
        raise ValueError("labels must be non-negative integers")

    # Row k of the identity matrix is exactly the one-hot vector for class k,
    # so indexing the identity by label produces the encoded matrix.
    n_classes = int(as_int.max()) + 1
    return np.eye(n_classes)[as_int]

In [122]:
# Build the feature matrix and the "y" / "s" vectors from the loaded frame.
# NOTE(review): assumes the first FEATURES_NUM columns do not contain the
# "y" or "s" columns themselves -- confirm against the CSV layout.
x = np.array(X.iloc[:, 0:FEATURES_NUM])
y = np.array(X["y"])
s = np.array(X["s"])
# Fail early on a column-count mismatch: the model reshapes its input with
# view(-1, FEATURES_NUM), which would silently regroup values otherwise.
assert x.shape[1] == FEATURES_NUM, "expected {} feature columns, got {}".format(FEATURES_NUM, x.shape[1])
# y is kept as integer class indices (no one-hot encoding): that is the
# target format nn.CrossEntropyLoss expects.
print("data  :   x:{} y:{}".format(len(x),len(y)))
# A fixed random_state makes the 90/10 partition identical on every run.
x_train, x_test, y_train, y_test, s_train, s_test = train_test_split(
    x, y, s, test_size=0.1, random_state=0
)
print("train  :  x:{} y:{} s:{}".format(len(x_train),len(y_train),len(s_train)))
print("test   :   x:{} y:{} s:{}".format(len(x_test),len(y_test),len(s_test)))
print(x.shape)
print(y.shape)
# The test loader uses this to deliver the whole test split as one batch.
TEST_BATCH_SIZE = len(x_test)

data  :   x:16281 y:16281
train  :  x:14652 y:14652 s:14652
test   :   x:1629 y:1629 s:1629
(16281, 81)
(16281,)


In [123]:
# Convert the numpy splits to tensors: .float() for the features and
# .long() for the integer class labels.
x_train_t = torch.from_numpy(x_train).float()
y_train_t = torch.from_numpy(y_train).long()
x_test_t = torch.from_numpy(x_test).float()
y_test_t = torch.from_numpy(y_test).long()

train = torch.utils.data.TensorDataset(x_train_t, y_train_t)
test = torch.utils.data.TensorDataset(x_test_t, y_test_t)

# Training batches are reshuffled each epoch; the test split keeps its order
# and is served with batch_size=TEST_BATCH_SIZE.
train_loader = DataLoader(dataset=train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=TEST_BATCH_SIZE, shuffle=False)

In [124]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    The output is the raw (unnormalised) score per class; no softmax is
    applied inside the model.

    Parameters
    ----------
    in_features : int, optional
        Width of one input row. When omitted, the module-level
        ``FEATURES_NUM`` is used (looked up at construction time).
    hidden : int, optional
        Width of the hidden layer. Defaults to 40.
    n_classes : int, optional
        Number of output scores per sample. Defaults to 2.
    """

    def __init__(self, in_features=None, hidden=40, n_classes=2):
        super().__init__()
        if in_features is None:
            in_features = FEATURES_NUM
        self.l1 = nn.Linear(in_features, hidden)  # input layer -> hidden layer
        self.l2 = nn.Linear(hidden, n_classes)    # hidden layer -> class scores

    def forward(self, x):
        """Return class scores of shape ``(batch, n_classes)`` for input ``x``.

        ``x`` is first reshaped to ``(-1, in_features)``, so any input whose
        element count is a multiple of ``in_features`` is accepted.
        """
        # Reshape with the width this instance was built with rather than the
        # global constant, so the layer and the reshape cannot disagree.
        x = x.view(-1, self.l1.in_features)
        x = F.relu(self.l1(x))
        return self.l2(x)
    
# Seed torch's global RNG before building the model so the randomly
# initialised layer weights are the same on every fresh run.
torch.manual_seed(0)
model = MLP()
print(model)

MLP(
  (l1): Linear(in_features=81, out_features=40, bias=True)
  (l2): Linear(in_features=40, out_features=2, bias=True)
)


In [125]:
# Define the loss function and the optimisation method.
import torch.optim as optim

# Cross-entropy loss on the model's outputs; plain SGD over every model
# parameter with a fixed learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

In [126]:
# Mini-batch training: EPOCHS_NUM passes over train_loader.
# Batches are fed to the model as plain tensors; wrapping them in
# autograd.Variable is deprecated and no longer necessary.
for epoch in range(EPOCHS_NUM):
    # Sum (not mean) of the per-batch losses seen during this epoch.
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Reset the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, then the loss against the integer class labels.
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print("epoch{} loss={}".format(epoch+1,running_loss))

print('Finished Training')

epoch1 loss=181229.56649184227
epoch2 loss=5.459144055843353
epoch3 loss=5.3813722133636475
epoch4 loss=5.3097785115242
epoch5 loss=5.244743883609772
epoch6 loss=5.179376244544983
epoch7 loss=5.120151698589325
epoch8 loss=5.0642513036727905
epoch9 loss=5.021057307720184
epoch10 loss=4.9737274050712585
epoch11 loss=4.93659508228302
epoch12 loss=4.895706653594971
epoch13 loss=4.854106962680817
epoch14 loss=4.8385180830955505
epoch15 loss=4.807818830013275
epoch16 loss=4.769539475440979
epoch17 loss=4.749043703079224
epoch18 loss=4.723015904426575
epoch19 loss=4.694417774677277
epoch20 loss=4.675016760826111
epoch21 loss=4.652465879917145
epoch22 loss=4.626520037651062
epoch23 loss=4.62596070766449
epoch24 loss=4.607298195362091
epoch25 loss=4.581234931945801
epoch26 loss=4.56935727596283
epoch27 loss=4.569323182106018
epoch28 loss=4.552716374397278
epoch29 loss=4.546347081661224
epoch30 loss=4.533331751823425
epoch31 loss=4.518056869506836
epoch32 loss=4.5142635107040405
epoch33 loss=4.4

In [127]:
import torch

# Measure classification accuracy over test_loader.
correct = 0
total = 0
# Evaluation needs no gradients, so skip recording the autograd graph.
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # The second argument of torch.max is the dimension to reduce over
        # (like numpy's axis); dim 1 is the class-score axis, so the returned
        # indices are the predicted class per row.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        # .item() keeps `correct` a plain Python int, so the summary below
        # also works when the loader yields no batches.
        correct += (predicted == labels).sum().item()

# Report nan rather than raising ZeroDivisionError on an empty loader.
accuracy = correct / total if total else float("nan")
print('Accuracy %d / %d = %f' % (correct, total, accuracy))

Accuracy 1261 / 1629 = 0.774095
