# 1. 分析输入数据的基本情况
- 数据类型：定类/定序/定距/定比，数值/非数值
- 数值的取值范围和分布
- 缺失值

In [1]:
# Relative paths of the training and test CSV files loaded in the next cell.
train_csv, test_csv = './data/train.csv', './data/test.csv'

In [2]:
import pandas as pd

# Load both CSV files into DataFrames; `df` holds the training data.
df, test_df = (pd.read_csv(path) for path in (train_csv, test_csv))

In [3]:
# List the column labels of the training DataFrame.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Preview the first 10 rows of the training data.
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Most frequent value of every column: df.mode() can return several rows when
# values tie, so only the first row is kept. Used later to fill missing values.
mod_df = df.mode().iloc[0]

In [7]:
# Report how many NaN values each column contains.
for column, c in df.isna().sum().items():
    print(f'{column}:{c}')

PassengerId:0
Survived:0
Pclass:0
Name:0
Sex:0
Age:177
SibSp:0
Parch:0
Ticket:0
Fare:0
Cabin:687
Embarked:2


In [8]:
# Sum of the Survived column, i.e. the number of rows labelled 1 (labels are 0/1).
df['Survived'].sum()

342

# 2. 获取基线
- 根据类别占比计算基线1
- 简单特征工程和模型计算基线2

## （1）baseline1
- 使用数量多的类别作为预测结果

In [9]:
# Baseline 1: accuracy obtained by always predicting the most frequent class.
# Taking the largest class share (instead of assuming class 0 is the majority)
# keeps the baseline correct whichever label dominates.
baseline1 = df['Survived'].value_counts(normalize=True).max()
print(baseline1)

0.6161616161616161


## （2）baseline2
- 简单的特征工程+模型获取基线

### 数据预处理
- 填充缺失值

In [10]:
# Replace every missing value with the most frequent value of its column.
df = df.fillna(mod_df)

### 特征工程
- Pclass: 转为one-hot
- Name: 字符串类型，先不使用该特征
- Sex: 转为one-hot
- Age: 浮点数，使用众数填充na
- SibSp: 整数（计数），直接作为数值特征
- Parch: 整数（计数），直接作为数值特征
- Ticket: 先不用
- Fare: 浮点数
- Cabin: 字符串类型，缺失值多，先不用
- Embarked: 转为one-hot，使用众数填充na

In [12]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Columns that get one-hot encoded vs. columns used directly as numeric features.
onehot_columns = ['Sex', 'Pclass', 'Embarked']
value_columns = ['Age', 'SibSp', 'Parch', 'Fare']

# Fit the encoder on the training data; with handle_unknown='ignore' a category
# not seen during fitting is encoded as all zeros instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore').fit(df[onehot_columns])

# Names of the final feature columns: encoded columns first, then numeric ones.
enc_names = enc.get_feature_names_out(onehot_columns).tolist()
fea_names = [*enc_names, *value_columns]

def baseline_fea(df):
    """Build the baseline feature matrix for a DataFrame.

    Args:
        df: DataFrame containing the columns listed in ``onehot_columns`` and
            ``value_columns``.

    Returns:
        2-D numpy array holding the one-hot encoded columns followed by the
        numeric columns, one row per row of ``df``.
    """
    # Pass the DataFrame slice itself (not ``.values``) so the encoder sees the
    # same column names it was fitted with; a bare array triggers sklearn's
    # "X does not have valid feature names" warning.
    onehot_fea = enc.transform(df[onehot_columns]).toarray()
    value_fea = df[value_columns].values

    return np.concatenate([onehot_fea, value_fea], axis=1)
    
    
# Target labels and feature matrix for the training set, as numpy arrays.
lab = df['Survived'].to_numpy()
fea = baseline_fea(df)



### 模型训练
- 选择xgb二分类

In [18]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import random


# Shuffle the row indices and split them into train/validation parts.
# A dedicated, seeded generator makes the split (and therefore the reported
# validation accuracy) reproducible across runs.
split_seed = 42
index = list(range(len(fea)))
random.Random(split_seed).shuffle(index)

# The split point is derived from train_ratio so the ratio is set in one place.
train_ratio = 0.8
n_train = int(len(fea) * train_ratio)
train_index, val_index = index[:n_train], index[n_train:]

train_fea, val_fea = fea[train_index], fea[val_index]
train_lab, val_lab = lab[train_index], lab[val_index]

# Booster hyper-parameters and the number of boosting rounds.
param = {
    'max_depth': 4,
    'eta': 0.3,
    'objective': 'binary:logistic',
}
num_round = 100

# Wrap the train/validation splits in xgboost DMatrix objects.
train_data = xgb.DMatrix(train_fea, label=train_lab)
val_data = xgb.DMatrix(val_fea, label=val_lab)


In [19]:
# Train the booster on the training split for num_round boosting rounds.
bst = xgb.train(param, train_data, num_round)



In [22]:
# Model outputs for the validation split; thresholded into 0/1 labels below.
val_prob = bst.predict(val_data)

In [29]:
# Turn the predicted probabilities into 0/1 labels at the decision threshold
# and report the fraction of validation rows predicted correctly.
thres = 0.5
val_predict = (val_prob >= thres).astype(int)
acc = np.mean(val_predict == val_lab)
print(acc)

0.7932960893854749


In [30]:
# Predictions on the test set: fill NaNs with the training-set modes, build the
# same features as for training, then threshold the predicted probabilities.
test_df = test_df.fillna(mod_df)
test_fea = xgb.DMatrix(baseline_fea(test_df))
test_prob = bst.predict(test_fea)

test_predict = (test_prob >= thres).astype(int)



In [31]:
# Show the raw predicted probabilities for the test set.
print(test_prob)

[0.04476339 0.01853532 0.06581522 0.38680032 0.10607986 0.09970482
 0.07255739 0.0109132  0.9338133  0.01177681 0.00629928 0.0725457
 0.98062193 0.05485411 0.95392275 0.9830035  0.04331405 0.45074102
 0.28768307 0.03187041 0.6214421  0.23131998 0.99838674 0.7515552
 0.9454073  0.04076429 0.9993747  0.61610794 0.7648462  0.5231955
 0.10891473 0.07808999 0.45231205 0.06838726 0.68820184 0.43530428
 0.09994216 0.03069982 0.1185584  0.39398468 0.17289448 0.36277404
 0.07909153 0.99200636 0.9826318  0.12935536 0.18154132 0.09578732
 0.9950008  0.72429353 0.41884598 0.1498711  0.9285936  0.64128274
 0.59156066 0.00488378 0.0157155  0.05147028 0.03061658 0.9983999
 0.02239395 0.34457585 0.03240509 0.90008444 0.58912075 0.9786189
 0.8918072  0.03085805 0.10522142 0.9029872  0.8589683  0.00609802
 0.43026605 0.07225567 0.99619675 0.54614496 0.02635033 0.9131548
 0.17348208 0.8589683  0.97507715 0.0561929  0.2573558  0.00629928
 0.12815185 0.07408544 0.8596726  0.71930903 0.8589683  0.94225883
 