In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import csv

In [25]:
import warnings
warnings.filterwarnings(action='ignore')

In [26]:
def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    print '%s正确率: %.3f%%' % (tip, acc_rate)
    return acc_rate

In [27]:
def load_data(file_name, is_train):
    data = pd.read_csv(file_name)
    pd.set_option('display.width', 200)
    print 'data.describe()=\n', data.describe()
    
    # [性别] 转换为分类变量（可计算）
    data['Sex'] = pd.Categorical(data['Sex']).codes
    
    # 补齐船 [票价] 格缺失值
    if len(data.Fare[data.Fare == 0]) > 0:
        # 计算不同社会层级各自的平均票价
        fare = np.zeros(3)
        for f in range(0,3):
            fare[f] = data[data['Pclass'] == f + 1]['Fare'].dropna().median()
        print fare
        # 按社会层级不同，补充缺失值
        for i in range(0, 3):
            data.loc[(data.Fare == 0) & (data.Pclass == f+1), 'Fare'] = fare[f]
            
    print 'data.describe() =\n', data.describe()
    
    # [年龄] 使用均值代替缺失值
    # mean_age = data['Age'].dropna().mean()
    # data.loc[(data.Age.isnull()), 'Age'] = mean_age
    if is_train:
        # [年龄] 使用随机森林预测年龄缺失值
        print '随机森林预测缺失年龄：--start--'
        data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]
        age_null = data_for_age.loc[(data.Age.isnull())]
        print age_exist
        
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=20)
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print '随机森林预测缺失年龄：--over--'
    else:
        print '随机森林预测缺失年龄2：--start--'
        data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]
        age_null = data_for_age.loc[(data.Age.isnull())]
        
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=1000)
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print '随机森林预测缺失年龄2：--over--'
    data['Age'] = pd.cut(data['Age'], bins=6, labels=np.arange(6))
    
    # [起始城市] 缺失值打上标记，然后转换为分类变量
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'
    embarked_data = pd.get_dummies(data.Embarked)
    print 'embarked_data =', embarked_data
    # embarked_data = embarked_data.rename(columns={'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown', 'U': 'UnknownCity'})
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
    data = pd.concat([data, embarked_data], axis=1)
    print u'data.describe() =', data.describe()
    
    data.to_csv('New_Data.csv')
    
    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    y = None
    if 'Survived' in data:
        y = data['Survived']
        
    x = np.array(x)
    y = np.array(y)
    
    # 这里在做啥
    x = np.tile(x, (5,1))
    y = np.tile(y, (5,))
    if is_train:
        return x, y
    return x, data['PassengerId']

In [28]:
def write_result(c, c_type):
    file_name = 'Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)
    
    if c_type == 3:
        x = xgb.DMatrix(x)
    y = c.predict(x)
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0
    
    predictions_file = open('Prediction_%d.csv' % c_type, 'wb')
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(['PassengerId', 'Survived'])
    open_file_object.writerows(zip(passenger_id, y))
    predictions_file.close()

## 读取数据

In [29]:
x, y = load_data('Titanic.train.csv', True)
print 'x =', x

x_train, x_test, y_train, y_test = train_test_split(x, y, \
                                                   test_size=0.25, \
                                                   random_state=1)

data.describe()=
       PassengerId    Survived      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
[ 60.2875  14.25     8.05  ]
data.describe() =
       PassengerId    Survived      Pclass         Sex         Age       SibSp       Parch        Fare
count   891.00

## 三种分类模型预测

In [30]:
lr = LogisticRegression(penalty='l2')
lr.fit(x_train, y_train)
y_hat = lr.predict(x_test)
lr_acc = accuracy_score(y_test, y_hat)
# write_result(lr, 1)

In [31]:
rfc = RandomForestRegressor(n_estimators=100)
rfc.fit(x_train, y_train)
y_hat = rfc.predict(x_test).astype(np.int)
rfc_acc = accuracy_score(y_test, y_hat)
# write_result(rfc, 2)

In [32]:
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {
    'max_depth': 6,
    'eta': 0.8,
    'silent': 1,
    'objective': 'binary:logistic'
}
bst = xgb.train(param, data_train, num_boost_round=20, evals=watch_list)
y_hat = bst.predict(data_test)
# write_result(bst, 3)
y_hat[y_hat > 0.5] = 1
y_hat[~(y_hat > 0.5)] = 0
xgb_acc = accuracy_score(y_test, y_hat)

[0]	eval-error:0.13465	train-error:0.130201
[1]	eval-error:0.126571	train-error:0.102963
[2]	eval-error:0.106822	train-error:0.0871
[3]	eval-error:0.105027	train-error:0.081712
[4]	eval-error:0.105925	train-error:0.07842
[5]	eval-error:0.09605	train-error:0.071236
[6]	eval-error:0.091562	train-error:0.06974
[7]	eval-error:0.087074	train-error:0.06525
[8]	eval-error:0.087074	train-error:0.06525
[9]	eval-error:0.087971	train-error:0.064951
[10]	eval-error:0.087074	train-error:0.063753
[11]	eval-error:0.085278	train-error:0.062855
[12]	eval-error:0.087971	train-error:0.060461
[13]	eval-error:0.087074	train-error:0.06076
[14]	eval-error:0.083483	train-error:0.058964
[15]	eval-error:0.078097	train-error:0.057767
[16]	eval-error:0.078995	train-error:0.058964
[17]	eval-error:0.076302	train-error:0.056869
[18]	eval-error:0.076302	train-error:0.056869
[19]	eval-error:0.070916	train-error:0.054175


In [33]:
print 'Logistic回归: %.3f%%' % (100 * lr_acc)
print '随机森林: %.3f%%' % (100 * rfc_acc)
print 'XGBoost: %.3f%%' % (100 * xgb_acc)

Logistic回归: 79.892%
随机森林: 83.842%
XGBoost: 92.908%
