In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
print(train_data.shape, test_data.shape)

(891, 12) (418, 11)


In [4]:
print(train_data.iloc[0:4, :])
print([c for c in train_data])
print([c for c in test_data])

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'Si

In [5]:
all_features = pd.concat((train_data.iloc[:, 2:], test_data.iloc[:, 1:]))
# 保留除Survived标签与ID以外的特征

In [6]:
print(all_features.shape)
print(all_features.iloc[:4, :])

(1309, 10)
   Pclass                                               Name     Sex   Age  \
0       3                            Braund, Mr. Owen Harris    male  22.0   
1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
2       3                             Heikkinen, Miss. Laina  female  26.0   
3       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   

   SibSp  Parch            Ticket     Fare Cabin Embarked  
0      1      0         A/5 21171   7.2500   NaN        S  
1      1      0          PC 17599  71.2833   C85        C  
2      0      0  STON/O2. 3101282   7.9250   NaN        S  
3      1      0            113803  53.1000  C123        S  


In [7]:
all_labels = train_data.iloc[:, 1]

In [8]:
# Extracting Numeric features
numeric_index = all_features.dtypes[all_features.dtypes != 'object'].index

# Feature Normalization: E = 0, S2 = 1
all_features[numeric_index] = all_features[numeric_index].apply(lambda x: (x - x.mean()) / x.std())

In [9]:
print(all_features.iloc[:4, :])

     Pclass                                               Name     Sex  \
0  0.841595                            Braund, Mr. Owen Harris    male   
1 -1.545507  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2  0.841595                             Heikkinen, Miss. Laina  female   
3 -1.545507       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   

        Age     SibSp     Parch            Ticket      Fare Cabin Embarked  
0 -0.546789  0.481104 -0.444829         A/5 21171 -0.503210   NaN        S  
1  0.563282  0.481104 -0.444829          PC 17599  0.733941   C85        C  
2 -0.269271 -0.478904 -0.444829  STON/O2. 3101282 -0.490169   NaN        S  
3  0.355144  0.481104 -0.444829            113803  0.382632  C123        S  


In [10]:
# Fill N/A with 0 if data is unbiased
all_features[numeric_index] = all_features[numeric_index].fillna(0)

In [11]:
# Set dummy features
all_features = pd.get_dummies(all_features, dummy_na=True)

In [12]:
print(all_features.shape)

(1309, 2437)


In [13]:
# Extract Numpy Metrix from Pandas Table
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(all_labels.values.reshape(-1, 1), dtype=torch.int64)

In [14]:
train_labels[:10]

tensor([[0],
        [1],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1]])

In [15]:
# Define Module components
loss = nn.CrossEntropyLoss(reduction='none')
dim_inputs = train_features.shape[1]  # The feature number of one sample
net = nn.Sequential(nn.Linear(dim_inputs, 2))


def init_weight(N):
    if isinstance(N, nn.Linear):
        nn.init.normal_(N.weight, std=0.01)


net.apply(init_weight)

Sequential(
  (0): Linear(in_features=2437, out_features=2, bias=True)
)

In [16]:
def accuracy(y_hat, y):
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    cmp = (y_hat.type(y.dtype) == y)
    return float(cmp.type(y.dtype).sum())

In [17]:
def evaluate(net, data_iter):
    if isinstance(net, nn.Module):
        net.eval()
    metric = d2l.Accumulator(2)
    with torch.no_grad():
        for X, y in data_iter:
            metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]

In [18]:
def train(net, train_features, train_labels, test_features, test_labels,
         num_epoches, lr, weight_dacay, batch_size):
    train_loss, train_acc, test_acc = [], [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    optim = torch.optim.SGD(net.parameters(), lr=lr, weight_decay=weight_dacay)
    for epoch in range(num_epoches):
        metric = d2l.Accumulator(3)
        net.train()
        for X, y in train_iter:
            optim.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y.reshape(-1))
            l.sum().backward()
            optim.step()
            with torch.no_grad():
                metric.add(l.mean() * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            train_acc.append(metric[1] / metric[2])
            train_loss.append(metric[0] / metric[2])
        if test_labels is not None:
            test_iter = d2l.load_array((test_features, test_labels), batch_size)
            test_acc.append(evaluate(net, test_iter))
    return train_acc, test_acc, train_loss

In [19]:
def fold_slice(k, fold_idx, X, y):
    assert k > 1
    X_train, y_train, X_valid, y_valid = None, None, None, None
    fold_size = X.shape[0] // k
    for i in range(k):
        idx = slice(i * fold_size, (i + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if i == fold_idx:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.concat([X_train, X_part], 0)
            y_train = torch.concat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

In [20]:
def k_fold_train(net, features, labels, num_epoches, lr, weight_decay, batch_size, k):
    """Train k fold on training dataset"""
    train_acc_sum, valid_acc_sum = 0, 0
    for fold in range(k):
        train_features, train_labels, test_features, test_labels = fold_slice(k, fold, features, labels)
        train_acc, test_acc, train_loss = train(net, train_features, train_labels, 
                                                test_features, test_labels, num_epoches, 
                                                lr, weight_decay, batch_size)
        train_acc_sum += train_acc[-1]
        valid_acc_sum += test_acc[-1]
        print(f"第{fold + 1}折：训练精度 {float(train_acc[-1]):f}, 训练损失 {float(train_loss[-1]):f}, 测试精度 {float(test_acc[-1]):f}")
    return train_acc_sum / k, valid_acc_sum / k

In [21]:
k = 5
num_epoches = 100
lr = 0.1
weight_decay = 2
batch_size = 32
train_acc, valid_acc = k_fold_train(net, train_features, train_labels, num_epoches, lr, weight_decay, batch_size, k)
print(f"{k} 折交叉验证，平均训练精度 {float(train_acc):f}，平均验证精度 {float(valid_acc):f}")

第1折：训练精度 16.553371, 训练损失 29.061514, 测试精度 17.483146
第2折：训练精度 17.171348, 训练损失 26.397211, 测试精度 16.876404
第3折：训练精度 16.926966, 训练损失 26.275846, 测试精度 16.707865
第4折：训练精度 17.061798, 训练损失 27.176772, 测试精度 16.393258
第5折：训练精度 16.553371, 训练损失 37.496035, 测试精度 17.943820
5 折交叉验证，平均训练精度 16.853371，平均验证精度 17.080899
