# Titanic数据集

### 数据字典
#### 变量            定义            键。
+ survival        生存            0=否，1=是。
+ pclass          船票等级        1=一等，2=二等，3=三等。
+ sex             性别
+ Age         年龄(以年为单位)
+ sibsp   泰坦尼克号上的兄弟姐妹/配偶人数
+ parch   泰坦尼克号上的父母/子女人数
+ ticket          车票号码
+ fare            乘客票价
+ cabin           客舱编号
+ embarked        登船港口        C=瑟堡，Q=皇后镇，S=南安普敦。
>
>
+ 注释
  + PCLASS：社会经济地位的代表(SES)
    + 一等=上层
    + 二等=中层
    + 三等=下层

+ 年龄：
    + 年龄小于1时以小数表示。如果年龄是估计值，则以xx.5的形式表示。

+ SIBSP：
    + 数据集以这种方式定义家庭关系……。
    + 兄弟姐妹=兄弟、姐妹、继兄弟、继姐妹。
    + 配偶=丈夫、妻子(情妇和未婚夫被忽略)。

+ Parch：数据集以这种方式定义家庭关系……。
    + 父母=母亲，父亲。
    + 孩子=女儿、儿子、继女、继子。
    + 一些孩子只和保姆一起旅行，因此对他们来说，parch=0。

### 读取数据

In [89]:
# Load the Titanic training CSV into a DataFrame.
import pandas as pd
train_data = pd.read_csv('data/titanic/train.csv')  # ',' is read_csv's default separator
# Bare last expression so the notebook renders the loaded frame.
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### 清洗数据归一化

In [90]:
import numpy as np
import torch

# Rows [0, TRAIN_ROWS) are used for fitting; every later row is held out.
TRAIN_ROWS=800

# Each feature is rescaled by a fixed constant; no statistics are fitted.
PassengerId=np.array(train_data['PassengerId'])
Pclass=np.array(train_data['Pclass']/3)
# One-hot encode sex. Categorical codes follow sorted category order, so
# column 0 is presumably "female" and column 1 "male" -- confirm on the data.
Sex=np.asarray(pd.get_dummies(pd.Categorical(train_data['Sex']).codes), dtype=np.int8)
# Missing ages are imputed with 27 years (0.27 after scaling), the value the
# original author recorded as the mean age of the data.
Age=np.nan_to_num(np.array(train_data['Age']/100),nan=0.27)
SibSp=np.array(train_data['SibSp']/10)
Parch=np.array(train_data['Parch']/10)
Fare=np.array(train_data['Fare']/1000)
# Cabin is not used as a feature.
# One-hot target: column 0 <-> Survived == 0, column 1 <-> Survived == 1.
Survived=np.asarray(pd.get_dummies(train_data['Survived']), dtype=np.int8)

# Build the (n_rows, 7) matrix with a single NumPy call. Passing a Python list
# of ndarrays to torch.Tensor converts element by element and is very slow.
feature_matrix=torch.tensor(
    np.column_stack([Pclass,Sex[:,0],Sex[:,1],Age,SibSp,Parch,Fare]),
    dtype=torch.float32)
label_matrix=torch.tensor(Survived,dtype=torch.float32)

# NOTE: train_data is rebound from the DataFrame to a (features, labels)
# tuple, so this cell cannot be re-run without reloading the CSV first.
train_data=(feature_matrix[:TRAIN_ROWS],label_matrix[:TRAIN_ROWS])
experimental_data=(feature_matrix[TRAIN_ROWS:],label_matrix[TRAIN_ROWS:])
# Sanity check: (train rows, features, hold-out rows, features).
len(train_data[0]),len(train_data[0][0]),len(experimental_data[0]),len(experimental_data[0][0])

(800, 7, 91, 7)

### 数据抽样

In [91]:
# Wrap both splits in mini-batch loaders: batch size 10, reshuffled each pass.
# The hold-out split has 91 rows according to the shapes printed earlier.
from torch.utils import data
train_set=data.TensorDataset(*train_data)
experimental_set=data.TensorDataset(*experimental_data)
train=data.DataLoader(train_set,batch_size=10,shuffle=True)
experimental=data.DataLoader(experimental_set,batch_size=10,shuffle=True)
# Peek at one batch from each loader.
next(iter(train)),next(iter(experimental))

([tensor([[1.0000, 1.0000, 0.0000, 0.1600, 0.0000, 0.0000, 0.0077],
          [1.0000, 0.0000, 1.0000, 0.0400, 0.3000, 0.2000, 0.0279],
          [0.6667, 0.0000, 1.0000, 0.2100, 0.2000, 0.0000, 0.0735],
          [1.0000, 0.0000, 1.0000, 0.4700, 0.0000, 0.0000, 0.0072],
          [0.3333, 0.0000, 1.0000, 0.5100, 0.0000, 0.1000, 0.0614],
          [1.0000, 0.0000, 1.0000, 0.3300, 0.0000, 0.0000, 0.0078],
          [0.3333, 0.0000, 1.0000, 0.5000, 0.1000, 0.0000, 0.0559],
          [0.3333, 1.0000, 0.0000, 0.5000, 0.0000, 0.0000, 0.0287],
          [0.3333, 1.0000, 0.0000, 0.2700, 0.0000, 0.1000, 0.0550],
          [1.0000, 0.0000, 1.0000, 0.1900, 0.0000, 0.0000, 0.0068]]),
  tensor([[0., 1.],
          [1., 0.],
          [1., 0.],
          [1., 0.],
          [1., 0.],
          [1., 0.],
          [1., 0.],
          [1., 0.],
          [0., 1.],
          [1., 0.]])],
 [tensor([[1.0000, 0.0000, 1.0000, 0.3050, 0.0000, 0.0000, 0.0081],
          [1.0000, 1.0000, 0.0000, 0.1500, 0.10

### 建立模型
+ 线性模型

In [92]:
# Single linear layer: 7 input features -> 2 class scores.
from torch import nn
net=nn.Sequential(nn.Linear(7,2))
def init_weight(m):
    """Re-initialise the weight of a linear layer from a normal with std 0.01.

    Meant to be passed to ``Module.apply``. Modules that are not linear
    layers are left untouched, and biases keep their default initialisation.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m,nn.Linear):
        nn.init.normal_(m.weight,std=0.01)
net.apply(init_weight)

Sequential(
  (0): Linear(in_features=7, out_features=2, bias=True)
)

### 损失函数

In [93]:
# Cross-entropy on raw scores with no reduction: one loss value per sample.
loss = nn.CrossEntropyLoss(reduction='none')

### 梯度下降

In [94]:
# Learning rate for plain stochastic gradient descent over all parameters.
lr = 0.1
updater = torch.optim.SGD(params=net.parameters(), lr=lr)

## 训练

In [95]:
# Put the model in training mode, then make one pass over the training batches.
if isinstance(net,torch.nn.Module):
    net.train()
for features,labels in train:
    # Clear gradients left over from the previous step.
    updater.zero_grad()
    # Forward pass, then per-sample losses averaged into one scalar.
    logits=net(features)
    batch_loss=loss(logits,labels).mean()
    # Backpropagate and apply one optimizer update.
    batch_loss.backward()
    updater.step()

## 验证

In [96]:
# Accuracy on the hold-out loader: share of rows whose arg-max class matches
# the arg-max of the one-hot label.
net.eval()  # evaluation mode; has no effect on a pure linear model but is standard practice
acc=0
allnum=0
# No gradients are needed for evaluation, so skip autograd bookkeeping.
with torch.no_grad():
    for x,y in experimental:
        cmp=net(x).argmax(axis=1)==y.argmax(axis=1)
        acc+=cmp.sum()
        allnum+=len(cmp)
print('验证正确率',np.array(acc/allnum*100),'%')

验证正确率 76.92308 %


## 预测

In [97]:
# Load the test CSV that the predictions are generated for.
test_data = pd.read_csv('data/titanic/test.csv')  # ',' is read_csv's default separator

In [98]:
# Apply the same fixed scaling as used for the training features.
PassengerId=np.array(test_data['PassengerId'])
Pclass=np.array(test_data['Pclass']/3)
# One-hot encode sex. Categorical codes follow sorted category order, so
# column 0 is presumably "female" and column 1 "male" -- confirm on the data.
Sex=np.asarray(pd.get_dummies(pd.Categorical(test_data['Sex']).codes), dtype=np.int8)
# Missing ages are imputed with 27 years (0.27 after scaling), the value the
# original author recorded as the mean age of the data.
Age=np.nan_to_num(np.array(test_data['Age']/100),nan=0.27)
SibSp=np.array(test_data['SibSp']/10)
Parch=np.array(test_data['Parch']/10)
# A missing fare would become NaN in the model output for that row, so fill
# gaps with the median fare of this file before scaling. Rows that already
# have a fare are unchanged.
Fare=np.array(test_data['Fare'].fillna(test_data['Fare'].median())/1000)
# Cabin is not used as a feature.

# Build the (n_rows, 7) matrix with a single NumPy call. Passing a Python list
# of ndarrays to torch.Tensor converts element by element and is very slow.
# NOTE: test_data is rebound from the DataFrame to the feature tensor.
test_data=torch.tensor(
    np.column_stack([Pclass,Sex[:,0],Sex[:,1],Age,SibSp,Parch,Fare]),
    dtype=torch.float32)

In [99]:
# Write one "PassengerId,Survived" row per test passenger.
import os

# Score every row in one batched forward pass without autograd bookkeeping.
# The arg-max column index (0 or 1) is written as the Survived value.
net.eval()
with torch.no_grad():
    predictions=net(test_data).argmax(dim=1).tolist()

# Create the output directory when it is missing so open() cannot fail on it.
os.makedirs('answer',exist_ok=True)
with open('answer/Titanic.csv','w') as file:
    file.write('PassengerId,Survived\n')
    file.writelines(
        f'{int(passenger_id)},{prediction}\n'
        for passenger_id,prediction in zip(PassengerId,predictions))