In [None]:
# kaggle的titanic入门比赛，训练集和测试集都下载在本文件的同一目录

In [113]:
# 数据总体分析
import pandas as pd
train = pd.read_csv('train.csv')
train.head()
# Survived => 1生存 0遇难
# PassengerId => 乘客ID
# Survived => 获救情况（1为获救，0为未获救）
# Pclass => 乘客等级(1/2/3等舱位)
# Name => 乘客姓名
# Sex => 性别
# Age => 年龄
# SibSp => 堂兄弟/妹个数
# Parch => 父母与小孩个数
# Ticket => 船票信息
# Fare => 票价
# Cabin => 客舱
# Embarked => 登船港口

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train.info()
# Age, Cabin缺失比较严重， Embarked缺失两个值，特征处理时要进行填充

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [33]:
train.describe()
# Survived的平均值为0.383，我们的预测准确率至少达到
# 1 - 0.383 = 0.617 才能接受，否则为什么不全部预测为 0 呢
# 一半乘客都是三等舱的 20到40岁的人占了一半 最大年龄为80
# SibSp和Parch两极分化严重，两个变量相似，考虑合并为一个

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
train[['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']].describe()
# name是唯一的，应该跟生存关系不大 男性占了577/891=65% 
# 像Cabin，Ticket等文本类，我们要做的是提取信息，简化信息
# Ticket没有缺失但只有681个值，说明有人共用一张船票，可以把共用船票的人数构造为一个特征值
# Cabin的204个值中大多数不一样，类太多考虑合并

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Levy, Mr. Rene Jacques",male,1601,C23 C25 C27,S
freq,1,577,7,4,644


In [114]:
# 基于前面分析进行特征处理
# preprocess函数是通用的处理，之后还要针对模型进行特定处理
class PreProcessor:
    def fit(self, data):
        # 复用训练集的age_mean，fare_mean
        self.age_mean = data['Age'].mean()
        self.fare_mean = data['Fare'].mean()
        return self
        
    def tranform(self, data):
        # 均值填充Age
        data['Age'].fillna(self.age_mean, inplace=True)
        # 用‘S’填充
        data['Embarked'].fillna('S', inplace=True)
        # 测试数据的Fare有一个空值
        data['Fare'].fillna(self.fare_mean, inplace=True)
        # 合并SibSp和Patch列
        data['FamilyNum'] = data.SibSp + data.Parch
        # Cabin有值的为一类，无值的为一类
        data['CabinClass'] = data.Cabin.map(lambda x: 0 if pd.isna(x) else 1)
        # 多少人共用一张船票做为一个数值特征
        data['TicketNum'] = data.Ticket.map(dict(data.groupby('Ticket').PassengerId.count()))
        # 我们舍弃PassengerId,Name，其他的列已经做了处理
        data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'], inplace=True)
        # 对分类特征one-hot,pandas虽然把Pclass当成数值特征，但我们应该清楚它属于分类特征
        return pd.get_dummies(
                    data,
                    columns=['Pclass', 'Sex', 'CabinClass', 'Embarked']
                    )

# 开始建立模型
# 再import一次pandas，是为了该代码块能作为一个py文件
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# 关掉一些版本变动提示
import warnings
warnings.filterwarnings('ignore')

# 读取数据集
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
result = {'PassengerId': test['PassengerId']}

# 处理特征
prepro = PreProcessor().fit(train)

train = prepro.tranform(train)
train_x = train.drop(columns=['Survived'])
train_x_std = normalize(train_x) #逻辑回归还需要标准化
train_y = train['Survived']

test_x = prepro.tranform(test)

# 初始化模型
lr = LogisticRegression()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()

# 输出10折交叉验证平均得分
print ('LinearRegression ', np.mean(cross_val_score(lr, train_x_std, train_y, cv=10)))
print ('DecisionTree ', np.mean(cross_val_score(dtc, train_x, train_y, cv=10)))
print ('RandomForest ', np.mean(cross_val_score(rfc, train_x, train_y, cv=10)))

LinearRegression  0.6915903416184316
DecisionTree  0.7846402224492113
RandomForest  0.8014561343774826


In [112]:
# lr得分惨不忍睹，我们调一下参数
for c in [0.01, 0.1, 1, 10, 100, 200, 300, 400]:
    lr = LogisticRegression(penalty='l1', C=c)
    print ('LinearRegression ', np.mean(cross_val_score(lr, train_x_std, train_y, cv=10)))

LinearRegression  0.6161701282487799
LinearRegression  0.6735115196912951
LinearRegression  0.7274716263761207
LinearRegression  0.7846396549767336
LinearRegression  0.7925053909885371
LinearRegression  0.793628986494155
LinearRegression  0.793628986494155
LinearRegression  0.793628986494155


In [107]:
# 调整决策树深度
for d in [1, 2, 3, 6, 7, 8, 9]:
    dtc = DecisionTreeClassifier()
    print ('DecisionTree ', np.mean(cross_val_score(dtc, train_x, train_y, cv=10)))

DecisionTree  0.7846654749744637
DecisionTree  0.7914076154806492
DecisionTree  0.7812824877993416
DecisionTree  0.7801208716377256
DecisionTree  0.7869254341164453
DecisionTree  0.7869004653274316
DecisionTree  0.7914073317444104


In [101]:
# 调整随机森林大小
for n in [120, 300, 500, 800]:
    rfc = RandomForestClassifier(n_estimators=n, max_depth=5)
    print ('RandomForest ', np.mean(cross_val_score(rfc, train_x, train_y, cv=10)))

RandomForest  0.8092089433662467
RandomForest  0.8148269208943366
RandomForest  0.8114686187719895
RandomForest  0.8103450232663716


In [115]:
# 根据交叉验证平均得分以及调参结果，最终选择训练n_etimators参数为300的rfc模型
rfc = RandomForestClassifier(n_estimators=300)
rfc.fit(train_x, train_y)
# 预测并保存结果
test_y = rfc.predict(test_x)
result['Survived'] = test_y
result = pd.DataFrame(result)
result.to_csv('submission.csv',index=False)

In [None]:
# 总结
# kaggle好像在中国没有服务器，总是出问题，submission.csv上传不了
# 不过没关系，过程比结果重要（这句话在数据挖掘方面怎么有点难说出口(￣▽￣)"）
# 数据分析没有标准答案，重要的是要有自己的一套方法，然后不断在实践中改进

In [None]:
#提供一份完整的代码，复制到py文件即可
class PreProcessor:
    def fit(self, data):
        # 复用训练集的age_mean，fare_mean
        self.age_mean = data['Age'].mean()
        self.fare_mean = data['Fare'].mean()
        return self
        
    def tranform(self, data):
        # 均值填充Age
        data['Age'].fillna(self.age_mean, inplace=True)
        # 用‘S’填充
        data['Embarked'].fillna('S', inplace=True)
        # 测试数据的Fare有一个空值
        data['Fare'].fillna(self.fare_mean, inplace=True)
        # 合并SibSp和Patch列
        data['FamilyNum'] = data.SibSp + data.Parch
        # Cabin有值的为一类，无值的为一类
        data['CabinClass'] = data.Cabin.map(lambda x: 0 if pd.isna(x) else 1)
        # 多少人共用一张船票做为一个数值特征
        data['TicketNum'] = data.Ticket.map(dict(data.groupby('Ticket').PassengerId.count()))
        # 我们舍弃PassengerId,Name，其他的列已经做了处理
        data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'], inplace=True)
        # 对分类特征one-hot,pandas虽然把Pclass当成数值特征，但我们应该清楚它属于分类特征
        return pd.get_dummies(
                    data,
                    columns=['Pclass', 'Sex', 'CabinClass', 'Embarked']
                    )

# 开始建立模型
# 再import一次pandas，是为了该代码块能作为一个py文件
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# 关掉一些版本变动提示
import warnings
warnings.filterwarnings('ignore')

# 读取数据集
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
result = {'PassengerId': test['PassengerId']}

# 处理特征
prepro = PreProcessor().fit(train)

train = prepro.tranform(train)
train_x = train.drop(columns=['Survived'])
train_x_std = normalize(train_x) #逻辑回归还需要标准化
train_y = train['Survived']

test_x = prepro.tranform(test)

# 根据交叉验证平均得分以及调参结果，最终选择训练n_etimators参数为300的rfc模型
rfc = RandomForestClassifier(n_estimators=300)
rfc.fit(train_x, train_y)
# 预测并保存结果
test_y = rfc.predict(test_x)
result['Survived'] = test_y
result = pd.DataFrame(result)
result.to_csv('submission.csv',index=False)