In [155]:
import numpy as np
import pandas as pd
from pandas import  DataFrame
from patsy import dmatrices
import string
from operator import itemgetter
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split,StratifiedShuffleSplit,StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.externals import joblib

from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  # work around Chinese text not rendering in plots
# Read train.csv into `data`; every feature-engineering cell below mutates this frame.
data = pd.read_csv('train.csv')

In [156]:
# Print dtypes and non-null counts for every column of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [157]:
# Collapse Cabin into a presence flag: 'Yes' when a cabin value is recorded, 'NO' when it is missing.
data['Cabin'] = data['Cabin'].notnull().map({True: 'Yes', False: 'NO'})

In [159]:
# Count how many rows carry each value of the Cabin flag.
data['Cabin'].value_counts()

NO     687
Yes    204
Name: Cabin, dtype: int64

In [160]:
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``big_string``.

    Candidates are tried in the order given, so when one candidate is a prefix
    of another (e.g. 'Mr' and 'Mrs') the caller must list the longer one first.

    Parameters
    ----------
    big_string : str
        Text to search.
    substrings : iterable of str
        Candidate substrings, in priority order.

    Returns
    -------
    str or float
        The first matching candidate, or ``np.nan`` when none matches.
        In the no-match case ``big_string`` is printed so the unmatched
        value can be inspected.
    """
    for substring in substrings:
        # `in` replaces string.find(), which only exists in Python 2.
        if substring in big_string:
            return substring
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(big_string)
    return np.nan

In [161]:
# Process the passenger names to produce the Title column.
# substrings_in_string returns the first list entry found in the name, so the
# order of this list matters ('Mrs' is listed before 'Mr').
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
            'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
            'Don', 'Jonkheer']
data['Title']=data['Name'].map(lambda x: substrings_in_string(x, title_list))

# Collapse the unusual titles so that every passenger ends up as Mr, Mrs, Miss or Master.
def replace_titles(x):
    """Map a passenger's raw title onto one of 'Mr', 'Mrs', 'Miss', 'Master'.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``x['Title']`` (raw title, possibly NaN or '') and
        ``x['Sex']``.

    Returns
    -------
    The normalized title. A title that is not in any known group is
    returned unchanged.

    Notes
    -----
    * Sex is compared case-insensitively. The previous check against the
      literal 'Male' never matched the data's lowercase 'male', so every
      'Dr' was labelled 'Mrs'.
    * A missing title is recognised whether it is NaN (what the title
      extraction produces when nothing matches) or an empty string; it falls
      back to 'Master' for males and 'Miss' for females.
    """
    title = x['Title']
    is_male = str(x['Sex']).strip().lower() == 'male'

    # No usable title: decide from sex alone.
    if pd.isnull(title) or title == '':
        return 'Master' if is_male else 'Miss'

    if title in ['Mr', 'Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    if title in ['Master']:
        return 'Master'
    if title in ['Countess', 'Mme', 'Mrs']:
        return 'Mrs'
    if title in ['Mlle', 'Ms', 'Miss']:
        return 'Miss'
    if title == 'Dr':
        # 'Dr' says nothing about sex, so split on the Sex column.
        return 'Mr' if is_male else 'Mrs'
    return title

# Overwrite Title row by row (axis=1) with the value returned by replace_titles.
data['Title']=data.apply(replace_titles, axis=1)

In [162]:
# Age_avg is Age with the gaps filled by the mean known age of passengers sharing the same Title.
# mean_ages keeps the four means in the order Miss, Mrs, Mr, Master.
data['Age_avg'] = data['Age']
age_missing = data['Age'].isnull()
mean_ages = np.zeros(4)
for slot, title in enumerate(['Miss', 'Mrs', 'Mr', 'Master']):
    has_title = data['Title'] == title
    # The mean is taken over the original Age column, so filling Age_avg never feeds back into it.
    mean_ages[slot] = np.average(data.loc[has_title, 'Age'].dropna())
    data.loc[age_missing & has_title, 'Age_avg'] = mean_ages[slot]

In [163]:
# Seed the `child` column with the imputed age; the next cell turns it into a 0/1 flag.
data['child'] = data['Age_avg']

In [164]:
# Binarize `child` in place: values up to and including 12 become 1, larger values become 0.
# Order matters: rows rewritten to 1 by the first line are not > 12, so the second line leaves them alone.
data.loc[(data.child<=12),'child'] = 1
data.loc[(data.child>12),'child'] = 0

In [165]:
# Treat a missing fare as a fare of 0.
data['Fare'] = data['Fare'].fillna(0)

In [166]:
# See how large each passenger's family is.
# Family_Size counts relatives aboard (SibSp + Parch), Family is the SibSp*Parch product,
# and Fare_Per_Person splits the fare over the passenger plus those relatives.
relatives_aboard = data['SibSp'] + data['Parch']
data['Family_Size'] = relatives_aboard
data['Family'] = data['SibSp'] * data['Parch']
data['Fare_Per_Person'] = data['Fare'] / (relatives_aboard + 1)

In [167]:
# Create the `mother` column as a copy of Parch; the following cell overwrites it with a 0/1 flag.
data['mother'] = data['Parch']

In [168]:
# mother = 1 for passengers titled 'Mrs' with Parch of at least 2, and 0 for everyone else.
is_mother = (data['Parch'] >= 2) & (data['Title'] == 'Mrs')
data.loc[is_mother, 'mother'] = 1
data.loc[~is_mother, 'mother'] = 0

In [169]:
# Check how many rows were flagged as mothers.
data['mother'].value_counts()

0    867
1     24
Name: mother, dtype: int64

In [170]:
# Interaction label combining sex and passenger class, e.g. 'male_3'.
data['Sex_Pclass'] = data['Sex'] + '_' + data['Pclass'].astype(str)

In [171]:
import sklearn.preprocessing as preprocessing
# Standardize Fare_Per_Person and Age_avg to zero mean / unit variance.
#
# Changes from the previous version of this cell:
#   * each feature gets its own fitted StandardScaler, so the two *_scale_param names
#     no longer alias one object whose statistics are overwritten by the last fit;
#   * the scaler is fed a 2-D single-column frame (data[['col']]); 1-D input is
#     deprecated / rejected by scikit-learn;
#   * the fitted scaler is no longer passed as the `y` argument of fit_transform;
#   * the duplicated Age_avg lines are gone.
Fare_Per_Person_scale_param = preprocessing.StandardScaler().fit(data[['Fare_Per_Person']])
data['Fare_Per_Person_scale'] = Fare_Per_Person_scale_param.transform(data[['Fare_Per_Person']]).ravel()

Age_avg_scale_param = preprocessing.StandardScaler().fit(data[['Age_avg']])
data['Age_avg_scale'] = Age_avg_scale_param.transform(data[['Age_avg']]).ravel()

# Keep the `scaler` name bound for any code that still refers to it; as before,
# it ends this cell fitted on Age_avg.
scaler = Age_avg_scale_param




In [172]:

# One-hot encode the categorical features (column prefix = source column name)
# and append the indicator columns to `data`.
dummy_frames = [
    pd.get_dummies(data[column], prefix=column)
    for column in ['Sex', 'Pclass', 'child', 'mother', 'Sex_Pclass']
]
(dummies_Sex, dummies_Pclass, dummies_child,
 dummies_mother, dummies_Sex_Pclass) = dummy_frames
data = pd.concat([data] + dummy_frames, axis=1)

In [173]:
# One-hot encode the normalized Title and append the Title_* columns to `data`.
dummies_Title  = pd.get_dummies(data['Title'], prefix= 'Title')
data = pd.concat([data,dummies_Title],axis=1)

In [174]:
# One-hot encode the Cabin presence flag and append the Cabin_* columns to `data`.
dummies_Cabin = pd.get_dummies(data['Cabin'], prefix= 'Cabin')
data = pd.concat([data,dummies_Cabin],axis=1)

In [175]:
# Display the engineered training frame (original columns plus all derived and dummy columns).
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Sex_Pclass_female_3,Sex_Pclass_male_1,Sex_Pclass_male_2,Sex_Pclass_male_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Cabin_NO,Cabin_Yes
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,0,0,0,1,0,0,1,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,0,0,0,0,0,1,0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,1,0,0,0,0,1,0,0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,0,0,0,0,0,0,0,1,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,0,0,0,1,0,0,1,0,1,0
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,...,0,0,0,1,0,0,1,0,1,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,...,0,1,0,0,0,0,1,0,0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,...,0,0,0,1,1,0,0,0,1,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,...,1,0,0,0,0,0,0,1,1,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,...,0,0,0,0,0,0,0,1,1,0


In [197]:
from sklearn import linear_model

# Select the label plus the model features by column-name regex, then fit an
# L1-penalized logistic regression on them.
#
# NOTE(review): the backslash line continuation below sits INSIDE the string
# literal, so there is no '|' between 'Age_avg_scale' and 'Sex_.*'. The two are
# fused (with the continuation line's leading spaces) into one alternative that
# matches no column. The info() output of this cell confirms it: 25 columns, with
# no Age_avg_scale, Sex_female or Sex_male. The same pattern is reused when the
# test features are selected, so the training and test patterns must be
# corrected together or the feature counts will no longer agree.
train_df = data.filter(regex='Survived|SibSp|Parch|Cabin_.*|Title_.*|Family_Size|Family|Fare_Per_Person_scale|Age_avg_scale \
                              Sex_.*|Pclass_.*|child_.*|mother_.*|Sex_Pclass_.*')
train_np = train_df.as_matrix()


# Column 0 of the selection is Survived (see the info() listing), i.e. the label.
train_y = train_np[:,0]

# X holds the feature values (every column after the label).
train_x = train_np[:,1:]


clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
clf.fit(train_x, train_y)
# Not the cell's last expression, so this shape is evaluated but never displayed.
train_df.shape
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
Survived                 891 non-null int64
SibSp                    891 non-null int64
Parch                    891 non-null int64
Family_Size              891 non-null int64
Family                   891 non-null int64
Fare_Per_Person_scale    891 non-null float64
Pclass_1                 891 non-null uint8
Pclass_2                 891 non-null uint8
Pclass_3                 891 non-null uint8
child_0.0                891 non-null uint8
child_1.0                891 non-null uint8
mother_0                 891 non-null uint8
mother_1                 891 non-null uint8
Sex_Pclass_female_1      891 non-null uint8
Sex_Pclass_female_2      891 non-null uint8
Sex_Pclass_female_3      891 non-null uint8
Sex_Pclass_male_1        891 non-null uint8
Sex_Pclass_male_2        891 non-null uint8
Sex_Pclass_male_3        891 non-null uint8
Title_Master             891 non-null uint8
Title_Miss   

In [178]:
# Read test.csv into `data_test`; the cells below repeat the training feature engineering on it.
data_test = pd.read_csv('test.csv')

In [179]:
# Extract and normalize titles for the test rows, reusing title_list,
# substrings_in_string and replace_titles defined in the earlier cells.
data_test['Title']=data_test['Name'].map(lambda x: substrings_in_string(x, title_list))
data_test['Title']=data_test.apply(replace_titles, axis=1)

In [180]:
# Repeat the training-set feature engineering on the test frame.

# Age_avg: Age with gaps filled by the mean known age (within the test frame) of the same Title.
# mean_ages is re-bound here and keeps the order Miss, Mrs, Mr, Master.
data_test['Age_avg'] = data_test['Age']
test_age_missing = data_test['Age'].isnull()
mean_ages = np.zeros(4)
for slot, title in enumerate(['Miss', 'Mrs', 'Mr', 'Master']):
    test_has_title = data_test['Title'] == title
    mean_ages[slot] = np.average(data_test.loc[test_has_title, 'Age'].dropna())
    data_test.loc[test_age_missing & test_has_title, 'Age_avg'] = mean_ages[slot]

# child: 1 for imputed age <= 12, otherwise 0. The two assignments must stay in this
# order; rows already rewritten to 1 are not > 12 and are therefore left alone.
data_test['child'] = data_test['Age_avg']
data_test.loc[data_test['child'] <= 12, 'child'] = 1
data_test.loc[data_test['child'] > 12, 'child'] = 0

# Cabin becomes a presence flag; a missing Fare is treated as 0.
data_test['Cabin'] = data_test['Cabin'].notnull().map({True: 'Yes', False: 'NO'})
data_test['Fare'] = data_test['Fare'].fillna(0)

# See how large each passenger's family is.
test_relatives_aboard = data_test['SibSp'] + data_test['Parch']
data_test['Family_Size'] = test_relatives_aboard
data_test['Family'] = data_test['SibSp'] * data_test['Parch']
data_test['Fare_Per_Person'] = data_test['Fare'] / (test_relatives_aboard + 1)

# mother: 1 for passengers titled 'Mrs' with Parch of at least 2, otherwise 0.
data_test['mother'] = data_test['Parch']
test_is_mother = (data_test['Parch'] >= 2) & (data_test['Title'] == 'Mrs')
data_test.loc[test_is_mother, 'mother'] = 1
data_test.loc[~test_is_mother, 'mother'] = 0

# Interaction label combining sex and passenger class, e.g. 'male_3'.
data_test['Sex_Pclass'] = data_test['Sex'] + '_' + data_test['Pclass'].astype(str)

In [181]:
# Standardize the test features with the TRAINING set's mean and standard deviation.
#
# The previous version refit the scaler on data_test, so a given fare or age mapped
# to a different scaled value in test than in the rows the model was trained on.
# Scalers are fitted here on the training frame `data` (which still carries the
# unscaled Fare_Per_Person and Age_avg columns) and only applied to data_test.
# Input is a 2-D single-column frame; 1-D input is deprecated / rejected by
# scikit-learn. The duplicated Age_avg lines are removed.
Fare_Per_Person_scale_param = preprocessing.StandardScaler().fit(data[['Fare_Per_Person']])
data_test['Fare_Per_Person_scale'] = Fare_Per_Person_scale_param.transform(data_test[['Fare_Per_Person']]).ravel()

Age_avg_scale_param = preprocessing.StandardScaler().fit(data[['Age_avg']])
data_test['Age_avg_scale'] = Age_avg_scale_param.transform(data_test[['Age_avg']]).ravel()



In [182]:
# One-hot encode the categorical test features (column prefix = source column name)
# and append the indicator columns to `data_test`. The dummies_* names are re-bound
# to the test-set frames.
test_dummy_frames = [
    pd.get_dummies(data_test[column], prefix=column)
    for column in ['Sex', 'Pclass', 'child', 'mother', 'Sex_Pclass', 'Title', 'Cabin']
]
(dummies_Sex, dummies_Pclass, dummies_child, dummies_mother,
 dummies_Sex_Pclass, dummies_Title, dummies_Cabin) = test_dummy_frames
data_test = pd.concat([data_test] + test_dummy_frames, axis=1)

In [183]:
# Summarize the engineered test frame. The 0 is passed positionally as info()'s
# first argument (presumably `verbose`; confirm against the installed pandas).
data_test.info(0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 42 columns):
PassengerId              418 non-null int64
Pclass                   418 non-null int64
Name                     418 non-null object
Sex                      418 non-null object
Age                      332 non-null float64
SibSp                    418 non-null int64
Parch                    418 non-null int64
Ticket                   418 non-null object
Fare                     418 non-null float64
Cabin                    418 non-null object
Embarked                 418 non-null object
Title                    418 non-null object
Age_avg                  418 non-null float64
child                    418 non-null float64
Family_Size              418 non-null int64
Family                   418 non-null int64
Fare_Per_Person          418 non-null float64
mother                   418 non-null int64
Sex_Pclass               418 non-null object
Fare_Per_Person_scale    418 non-null floa

In [198]:
# Build the test matrix from exactly the feature columns, in exactly the order,
# that `clf` was fitted on, then write the logistic-regression submission.
#
# The previous version re-typed the training regex. That copy had the same flaw
# (a backslash continuation inside the string literal, which fuses 'Age_avg_scale'
# and 'Sex_.*' into one alternative that matches nothing) and lined up with the
# training columns only because both frames happened to produce the same dummy
# columns. Taking the column list from train_df removes the duplication: whatever
# features training used, prediction uses the same ones.
feature_columns = [column for column in train_df.columns if column != 'Survived']

# A feature absent from data_test (e.g. a dummy level that never occurs in the
# test set) is created and filled with 0 instead of shifting every later column.
test_df = data_test.reindex(columns=feature_columns, fill_value=0)

# .values replaces the deprecated DataFrame.as_matrix().
test_x = test_df.values

predictions = clf.predict(test_x)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv('lr_prediction_2.csv', index=False)


In [204]:
from sklearn.ensemble import BaggingRegressor


# Bag 10 copies of the logistic-regression model, each fitted on a bootstrap sample
# of 80% of the training rows, and write a second submission.
bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)

# Previously fit(X, y): neither name is defined anywhere in this notebook, so the
# cell only ran on leftover kernel state. train_x / train_y are the matrices the
# base model was fitted on and have the same feature layout as test_x.
# NOTE(review): confirm X / y were not meant to be a different feature set.
bagging_clf.fit(train_x, train_y)

# BaggingRegressor averages the estimators' 0/1 predictions, so each value is the
# share of estimators voting "survived".
predictions = bagging_clf.predict(test_x)

# The old .astype(np.int32) on that average truncated toward zero, giving 1 only
# when all estimators agreed. Use a majority vote instead; an exact 0.5 tie is 0.
survived = (predictions > 0.5).astype(np.int32)

result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': survived})
result.to_csv("logistic_regression_predictions3.csv", index=False)


In [205]:
# NOTE(review): `train_df1` is not defined in any cell visible in this notebook, so
# this call depends on leftover kernel state and will fail on Restart & Run All.
train_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 43 columns):
PassengerId              891 non-null int64
Survived                 891 non-null int64
Pclass                   891 non-null int64
Name                     891 non-null object
Sex                      891 non-null object
Age                      714 non-null float64
SibSp                    891 non-null int64
Parch                    891 non-null int64
Ticket                   891 non-null object
Fare                     891 non-null float64
Cabin                    891 non-null object
Embarked                 889 non-null object
Title                    891 non-null object
Age_avg                  891 non-null float64
child                    891 non-null float64
Family_Size              891 non-null int64
Family                   891 non-null int64
Fare_Per_Person          891 non-null float64
mother                   891 non-null int64
Sex_Pclass               891 non-null objec

In [112]:
# Alternative test-feature selection using looser patterns ('Cabin', 'child',
# 'mother', 'Family' without the '_.*' suffix).
# NOTE(review): this cell's execution count (112) is lower than that of the cells
# above, so it looks like a leftover from an earlier session. Confirm it is still
# needed.
# NOTE(review): as in the other filter calls, the backslash continuation is inside
# the string literal, so there is no '|' between 'Age_avg_scale' and 'Sex_.*'.
test_df1 = data_test.filter(regex='SibSp|Parch|Cabin|Title_.*|Family|Fare_Per_Person_scale|Age_avg_scale \
                              Sex_.*|Pclass_.*|child|mother|Sex_Pclass_.*')
test_x1 = test_df1.as_matrix()
# Display the dtype of the resulting matrix.
test_x1.dtype

dtype('float64')

In [113]:
# Predict with the bagged model on the alternative test matrix test_x1.
# NOTE(review): test_x1 is built from a different column pattern than the matrix
# used to build test_x. Verify its feature count matches what bagging_clf was fitted on.
predictions = bagging_clf.predict(test_x1)
