# 数据分析

### datas 数据字段
- PassengerId 乘客ID
- Survived 是否幸存 0 = No, 1 = Yes
- Pclass 船票（舱位）等级 1 = 1st, 2 = 2nd, 3 = 3rd
- Name 乘客姓名
- Sex 乘客性别
- Age 乘客年龄
- SibSp 船上的兄弟姊妹/配偶数量
- Parch 船上的父母/孩子数量
- Ticket 船票号
- Fare 船票价格
- Cabin 客舱号
- Embarked 登船港口 C = Cherbourg, Q = Queenstown, S = Southampton

## 提出问题
 - 哪些因素决定了乘客的生存与否？这些因素的影响分别是正向的还是负向的，影响程度有多大？

In [6]:
# 引入需要的模块，读入数据

import numpy as np
import pandas as pd

# Path of the passenger CSV file (a relative path).
DATA_PATH = "datas.csv"

# Parse the CSV into the DataFrame that the rest of the analysis works on.
datas = pd.read_csv(filepath_or_buffer=DATA_PATH)

## 基本统计学值

In [28]:
######### 公共函数 ##########

def add_line(df, name, func, *args):
    """Add a column computed row by row and return the same DataFrame.

    Args:
        df: DataFrame to extend; it is modified in place.
        name: Label of the column to create or overwrite.
        func: Callable invoked once per row (``axis=1``) with the row as its
            first argument; its return value becomes the cell value.
        *args: Extra positional arguments passed to ``func`` after the row.

    Returns:
        The same ``df`` object, now containing column ``name``.
    """
    # Forward *args to func so the helper works with parameterized row
    # functions instead of silently dropping the extra arguments.
    df[name] = df.apply(func, axis=1, args=args)

    return df

def get_generation(row):
    """Map a row's ``Age`` to a ten-year bracket label.

    Args:
        row: Object exposing a numeric ``Age`` attribute (e.g. a DataFrame row).

    Returns:
        ``"0~10"`` through ``"90~100"`` for ages in [0, 100), ``"100+"`` for
        ages of 100 or more, and ``"unknow"`` for anything else (negative
        ages or NaN).
    """
    years = row.Age

    if years >= 100:
        return "100+"

    if 0 <= years < 100:
        # Floor to the start of the decade, e.g. 37.5 -> 30 -> "30~40".
        lower = int(years // 10) * 10
        return "{}~{}".format(lower, lower + 10)

    # NaN fails every comparison above; negative ages fail the range check.
    return "unknow"
    

In [115]:
# Number of passenger records; also the denominator of the overall survival rate.
total = len(datas)

#### Male : female ratio ####
# Both sides are divided by the male count, so the male side always prints as 1.00.
m_f_count = datas["Sex"].value_counts()
male_num = m_f_count["male"]
female_num = m_f_count["female"]
print("1.男女比例为：{:.2f} : {:.2f}\n".format(male_num / male_num, female_num / male_num))

#### Passenger count per age bracket ####
# Label every row with its age bracket in a new "Generation" column
# (row-wise apply of get_generation, written directly into `datas`).
datas["Generation"] = datas.apply(get_generation, axis=1)

# Frequency of each bracket label, most common first.
Generation_count = datas["Generation"].value_counts()
print("2.各个年龄段乘客频数：")
print(Generation_count, "\n")

#### Mean age ####
# Series.mean() skips missing values by default, so rows without an age are ignored.
avg_age = datas.Age.mean()
# int() truncates the mean toward zero for display.
print("3.平均年龄：", int(avg_age), "\n")

#### Mean number of siblings/spouses aboard ####
avg_sil = datas.SibSp.mean()
print("4.平均船上兄弟姊妹/配偶数量：", avg_sil, "\n")

#### Mean number of parents/children aboard ####
avg_parch = datas.Parch.mean()
print("5.平均船上父母/子女数量：", avg_parch, "\n")

#### Passengers per ticket class ####
# NOTE: the printed label says "ratio", but the values are raw counts per class.
pclass_count = datas.Pclass.value_counts()
print("6.船上各等级仓位比例：")
print(pclass_count, "\n")

#### Overall survival rate ####
# value_counts() is indexed by the Survived code (0 / 1). .get(1, 0) yields 0
# instead of raising KeyError when a subset contains no survivors, matching
# the per-group loops further down that also treat a missing 1 as zero.
survived_count = datas["Survived"].value_counts()
print("7.所有乘客生存率：{:.2f}%\n".format((survived_count.get(1, 0) / total) * 100))

#### Male survival rate ####
males = datas[datas.Sex == "male"]
males_survived_count = males["Survived"].value_counts()
print("8.男性乘客生存率：{:.2f}%\n".format((males_survived_count.get(1, 0) / len(males)) * 100))

#### Female survival rate ####
females = datas[datas.Sex == "female"]
females_survived_count = females["Survived"].value_counts()
print("9.女性乘客生存率：{:.2f}%\n".format((females_survived_count.get(1, 0) / len(females)) * 100))

#### Survival rate per age bracket ####
Generation_group = datas.groupby("Generation")
print("10. 各个年龄段的生存率：")
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
for gen_label, this_gen in Generation_group:
    gen_survived_count = this_gen["Survived"].value_counts()
    # A bracket with no survivors has no entry for code 1; count it as 0.
    # .get() covers exactly that case without a bare except that would hide
    # unrelated errors.
    gen_survived_num = gen_survived_count.get(1, 0)
    print("{}: {:.2f}%".format(gen_label, (gen_survived_num / len(this_gen)) * 100))
print("")
#### Survival rate per ticket class ####
pclass_group = datas.groupby("Pclass")
print("11.各个等级仓位的生存率：")
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
for pc_label, this_pc in pclass_group:
    pc_survived_count = this_pc["Survived"].value_counts()
    # A class with no survivors has no entry for code 1; count it as 0.
    # .get() covers exactly that case without a bare except that would hide
    # unrelated errors.
    pc_survived_num = pc_survived_count.get(1, 0)
    print("{}: {:.2f}%".format(pc_label, (pc_survived_num / len(this_pc)) * 100))
print("")

#### Survival rate of passengers travelling alone ####
# "Alone" means no siblings/spouse AND no parents/children aboard.
is_alone = (datas.SibSp == 0) & (datas.Parch == 0)
alones = datas[is_alone]
alones_survived_count = alones["Survived"].value_counts()
# .get(1, 0) yields 0 instead of raising KeyError when the group has no survivors.
print("12.独自上船的乘客生存率：{:.2f}%\n".format((alones_survived_count.get(1, 0) / len(alones)) * 100))

#### Survival rate of passengers travelling with company ####
# The complement of the mask above: at least one sibling/spouse/parent/child aboard.
not_alones = datas[~is_alone]
not_alones_survived_count = not_alones["Survived"].value_counts()
print("13.有同伴上船的乘客生存率：{:.2f}%\n".format((not_alones_survived_count.get(1, 0) / len(not_alones)) * 100))


1.男女比例为：1.00 : 0.54

2.各个年龄段乘客频数：
20~30     220
unknow    177
30~40     167
10~20     102
40~50      89
0~10       62
50~60      48
60~70      19
70~80       6
80~90       1
Name: Generation, dtype: int64 

3.平均年龄： 29 

4.平均船上兄弟姊妹/配偶数量： 0.5230078563411896 

5.平均船上父母/子女数量： 0.38159371492704824 

6.船上各等级仓位比例：
3    491
1    216
2    184
Name: Pclass, dtype: int64 

7.所有乘客生存率：38.38%

8.男性乘客生存率：18.89%

9.女性乘客生存率：74.20%

10. 各个年龄段的生存率：
0~10: 61.29%
10~20: 40.20%
20~30: 35.00%
30~40: 43.71%
40~50: 38.20%
50~60: 41.67%
60~70: 31.58%
70~80: 0.00%
80~90: 100.00%
unknow: 29.38%

11.各个等级仓位的生存率：
1: 62.96%
2: 47.28%
3: 24.24%

12.独自上船的乘客生存率：30.35%

13.有同伴上船的乘客生存率：50.56%



男女比例为：1.0 : 0.5441941074523396


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,generation
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,
