In [1]:
import os
import datetime

#打印时间
def printbar():
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n"+"=========="*8 + "%s"%nowtime)

#mac系统上pytorch和matplotlib在jupyter中同时跑需要更改环境变量
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" 

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset

# raw数据加载
dftrain_raw = pd.read_csv('../data/titanic/train.csv')
dftest_raw = pd.read_csv('../data/titanic/test.csv')
dftrain_raw.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,493,0,1,"Molson, Mr. Harry Markland",male,55.0,0,0,113787,30.5,C30,S
1,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
2,388,1,2,"Buss, Miss. Kate",female,36.0,0,0,27849,13.0,,S
3,192,0,2,"Carbines, Mr. William",male,19.0,0,0,28424,13.0,,S
4,687,0,3,"Panula, Mr. Jaako Arnold",male,14.0,4,1,3101295,39.6875,,S
5,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S
6,228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S
7,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
8,168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45.0,1,4,347088,27.9,,S
9,752,1,3,"Moor, Master. Meier",male,6.0,0,1,392096,12.475,E121,S


In [None]:
# EDA
%matplotlib inline
%config InlineBackend.figure_format = 'png'
ax = dftrain_raw['Survived'].value_counts().plot(kind = 'bar', figsize = (12,8), fontsize=15, rot = 0)
ax.set_ylabel('Counts', fontsize = 15)
ax.set_xlabel('Survived', fontsize = 15)
plt.show()

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
ax = dftrain_raw['Age'].plot(kind = 'hist', bins = 20, color= 'purple', figsize = (12,8), fontsize=15)

ax.set_ylabel('Frequency', fontsize = 15)
ax.set_xlabel('Age', fontsize = 15)
plt.show()

In [3]:
# raw数据预处理
def preprocessing(dfdata):

    dfresult= pd.DataFrame()

    #Pclass
    dfPclass = pd.get_dummies(dfdata['Pclass'])
    dfPclass.columns = ['Pclass_' +str(x) for x in dfPclass.columns ]
    dfresult = pd.concat([dfresult,dfPclass],axis = 1)

    #Sex
    dfSex = pd.get_dummies(dfdata['Sex'])
    dfresult = pd.concat([dfresult,dfSex],axis = 1)

    #Age
    dfresult['Age'] = dfdata['Age'].fillna(0)
    dfresult['Age_null'] = pd.isna(dfdata['Age']).astype('int32')

    #SibSp,Parch,Fare
    dfresult['SibSp'] = dfdata['SibSp']
    dfresult['Parch'] = dfdata['Parch']
    dfresult['Fare'] = dfdata['Fare']

    #Carbin
    dfresult['Cabin_null'] = pd.isna(dfdata['Cabin']).astype('int32')

    #Embarked
    dfEmbarked = pd.get_dummies(dfdata['Embarked'],dummy_na=True)
    dfEmbarked.columns = ['Embarked_' + str(x) for x in dfEmbarked.columns]
    dfresult = pd.concat([dfresult,dfEmbarked],axis = 1)

    return(dfresult)

x_train = preprocessing(dftrain_raw).values
y_train = dftrain_raw[['Survived']].values

x_test = preprocessing(dftest_raw).values
y_test = dftest_raw[['Survived']].values

print("x_train.shape =", x_train.shape )
print("x_test.shape =", x_test.shape )

print("y_train.shape =", y_train.shape )
print("y_test.shape =", y_test.shape )

x_train.shape = (712, 15)
x_test.shape = (179, 15)
y_train.shape = (712, 1)
y_test.shape = (179, 1)


In [4]:
# batch训练及验证数据
dl_train = DataLoader(
    TensorDataset(torch.tensor(x_train).float(), torch.tensor(y_train).float()), 
    shuffle = True, 
    batch_size = 8
)
dl_valid = DataLoader(
    TensorDataset(torch.tensor(x_test).float(), torch.tensor(y_test).float()), 
    shuffle = False, 
    batch_size = 8
)

# for features, labels in dl_train:
#     print(features, labels)
#     break

In [5]:
# sequence模型
def create_net():
    net = torch.nn.Sequential()
    net.add_module("linear1", torch.nn.Linear(15, 20))
    net.add_module("relu1", torch.nn.ReLU())
    net.add_module("linear2", torch.nn.Linear(20, 15))
    net.add_module("relu2", torch.nn.ReLU())
    net.add_module("linear3", torch.nn.Linear(15, 1))
    net.add_module("sigmoid", torch.nn.Sigmoid())
    return net
    
net = create_net()
print(net)

Sequential(
  (linear1): Linear(in_features=15, out_features=20, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=20, out_features=15, bias=True)
  (relu2): ReLU()
  (linear3): Linear(in_features=15, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [6]:
from sklearn.metrics import accuracy_score

# loss, optimizer, metrics
loss_func = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params = net.parameters(), lr = 0.01)
metric_func = lambda y_pred, y_true: accuracy_score(y_true.data.numpy(), y_pred.data.numpy() > 0.5)
metric_name = "accuracy"

In [7]:
epochs = 10
log_step_freq = 30

dfhistory = pd.DataFrame(columns = ["epoch", "loss", metric_name, "val_loss", "val_" + metric_name]) 
print("Start Training...")
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("==========" * 8 + "%s" % nowtime)

for epoch in range(1, epochs + 1):
    # 1，训练循环-------------------------------------------------
    net.train()
    loss_sum = 0.0
    metric_sum = 0.0
    step = 1
    
    for step, (features, labels) in enumerate(dl_train, 1):
        # 梯度清零
        optimizer.zero_grad()

        # 正向传播求损失
        predictions = net(features)
        loss = loss_func(predictions, labels)
        metric = metric_func(predictions, labels)
        
        # 反向传播求梯度
        loss.backward()
        optimizer.step()

        # 打印batch级别日志
        loss_sum += loss.item()
        metric_sum += metric.item()
        if step % log_step_freq == 0:   
            print(("[step = %d] loss: %.3f, " + metric_name + ": %.3f") % (step, loss_sum / step, metric_sum / step))
            
    # 2，验证循环-------------------------------------------------
    net.eval()
    val_loss_sum = 0.0
    val_metric_sum = 0.0
    val_step = 1

    for val_step, (features, labels) in enumerate(dl_valid, 1):
        # 关闭梯度计算
        with torch.no_grad():
            predictions = net(features)
            val_loss = loss_func(predictions, labels)
            val_metric = metric_func(predictions, labels)
        val_loss_sum += val_loss.item()
        val_metric_sum += val_metric.item()

    # 3，记录日志-------------------------------------------------
    info = (epoch, loss_sum / step, metric_sum / step, val_loss_sum / val_step, val_metric_sum / val_step)
    dfhistory.loc[epoch - 1] = info
    
    # 打印epoch级别日志
    print(("\nEPOCH = %d, loss = %.3f, " + metric_name + " = %.3f, val_loss = %.3f, " + "val_" + metric_name + " = %.3f") % info)
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n"+"=========="*8 + "%s" % nowtime)
        
print('Finished Training...')

Start Training...
[step = 30] loss: 0.641, accuracy: 0.650
[step = 60] loss: 0.618, accuracy: 0.650

EPOCH = 1, loss = 0.623, accuracy = 0.632, val_loss = 0.586, val_accuracy = 0.649

[step = 30] loss: 0.579, accuracy: 0.633
[step = 60] loss: 0.578, accuracy: 0.677

EPOCH = 2, loss = 0.576, accuracy = 0.683, val_loss = 0.553, val_accuracy = 0.676

[step = 30] loss: 0.537, accuracy: 0.762
[step = 60] loss: 0.544, accuracy: 0.744

EPOCH = 3, loss = 0.548, accuracy = 0.743, val_loss = 0.577, val_accuracy = 0.717

[step = 30] loss: 0.543, accuracy: 0.754
[step = 60] loss: 0.511, accuracy: 0.767

EPOCH = 4, loss = 0.541, accuracy = 0.753, val_loss = 0.453, val_accuracy = 0.777

[step = 30] loss: 0.510, accuracy: 0.758
[step = 60] loss: 0.522, accuracy: 0.773

EPOCH = 5, loss = 0.492, accuracy = 0.788, val_loss = 0.435, val_accuracy = 0.793

[step = 30] loss: 0.514, accuracy: 0.779
[step = 60] loss: 0.485, accuracy: 0.800

EPOCH = 6, loss = 0.484, accuracy = 0.795, val_loss = 0.424, val_accu

In [34]:
dfhistory

Unnamed: 0,epoch,loss,accuracy,val_loss,val_accuracy
0,1.0,0.463317,0.779494,0.391125,0.793478
1,2.0,0.432583,0.80618,0.420385,0.804348
2,3.0,0.436158,0.810393,0.431594,0.804348
3,4.0,0.443063,0.786517,0.435682,0.777174
4,5.0,0.433915,0.799157,0.403094,0.777174
5,6.0,0.452492,0.799157,0.401829,0.815217
6,7.0,0.435932,0.803371,0.436257,0.793478
7,8.0,0.443257,0.803371,0.413472,0.798913
8,9.0,0.444815,0.800562,0.451831,0.782609
9,10.0,0.435593,0.811798,0.419585,0.815217


In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import matplotlib.pyplot as plt

def plot_metric(dfhistory, metric):
    train_metrics = dfhistory[metric]
    val_metrics = dfhistory['val_' + metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    plt.title('Training and validation '+ metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_" + metric, 'val_' + metric])
    plt.show()

plot_metric(dfhistory, "loss")

In [8]:
y_pred_probs = net(torch.tensor(x_test[0:10]).float()).data
y_pred_probs

tensor([[0.4378],
        [0.7276],
        [0.5356],
        [0.9976],
        [0.7216],
        [0.9129],
        [0.2254],
        [0.9301],
        [0.6460],
        [0.2531]])

In [9]:
y_pred = torch.where(y_pred_probs > 0.5, torch.ones_like(y_pred_probs), torch.zeros_like(y_pred_probs))
y_pred

tensor([[0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.]])

In [32]:
print(net.state_dict().keys())

odict_keys(['linear1.weight', 'linear1.bias', 'linear2.weight', 'linear2.bias', 'linear3.weight', 'linear3.bias'])


In [33]:
torch.save(net.state_dict(), "./data/net_parameter.pkl")

net_clone = create_net()
net_clone.load_state_dict(torch.load("./data/net_parameter.pkl"))

net_clone.forward(torch.tensor(x_test[0:10]).float()).data

tensor([[0.1671],
        [0.5790],
        [0.3328],
        [0.7982],
        [0.5343],
        [0.9058],
        [0.2295],
        [0.9913],
        [0.4826],
        [0.2106]])