In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import lightgbm as lgbm
from sklearn.model_selection import  GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier




This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train_data = pd.read_csv("data/titanic/train.csv")
test_data = pd.read_csv("data/titanic/test.csv")

In [3]:
train_data.describe()
train_data.head()
train_data.info()
test_data.info()

train_corr = train_data.drop('PassengerId', axis=1).corr()
train_corr
a = plt.subplots(figsize = (15,9))
sns.heatmap(train_corr,vmin=-1, vmax=1,annot=True, square=True)


train_data.groupby("Pclass")['Survived'].mean().plot.bar()

train_data.groupby(['Sex'])['Sex','Survived'].mean().plot.bar()

g = sns.FacetGrid(train_data, col='Survived',size=5)
g.map(plt.hist, 'Age', bins=40)

g = sns.FacetGrid(train_data, col='Survived',size=5)
g.map(plt.hist, 'Fare', bins=40)
sns.countplot("Embarked", hue = 'Survived', data=train_data)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


<matplotlib.axes._subplots.AxesSubplot at 0x117667518>

<matplotlib.axes._subplots.AxesSubplot at 0x117667518>

<matplotlib.axes._subplots.AxesSubplot at 0x11771bb00>



<seaborn.axisgrid.FacetGrid at 0x11777ac18>



<seaborn.axisgrid.FacetGrid at 0x117705780>

<matplotlib.axes._subplots.AxesSubplot at 0x119d206a0>

In [4]:
def feature_parse(df):
    df['SibSp_Parch'] = df['SibSp'] + df['Parch']
    df.Embarked.fillna(df.Embarked.mode()[0], inplace=True)

    df["Fare"].fillna(14.435422,inplace=True)

    df['Name1'] = df['Name'].str.extract('.+,(.+)', expand=False).str.extract('^(.+?)\.', expand=False).str.strip()

    df['Name1'].replace(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer' , inplace = True)
    df['Name1'].replace(['Jonkheer', 'Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty' , inplace = True)
    df['Name1'].replace(['Mme', 'Ms', 'Mrs'], 'Mrs', inplace = True)
    df['Name1'].replace(['Mlle', 'Miss'], 'Miss', inplace = True)
    df['Name1'].replace(['Mr'], 'Mr' , inplace = True)
    df['Name1'].replace(['Master'], 'Master' , inplace = True)
    df = pd.get_dummies(df,columns=['Pclass','Sex','SibSp','Parch','SibSp_Parch', "Embarked", 'Name1'])

    df['Name2'] = df['Name'].apply(lambda x: x.split('.')[1])
    Name2_sum = df['Name2'].value_counts().reset_index()
    Name2_sum.columns=['Name2','Name2_sum']
    df = pd.merge(df,Name2_sum,how='left',on='Name2')

    #由于出现一次时该特征时无效特征,用one来代替出现一次的姓
    df.loc[df['Name2_sum'] == 1 , 'Name2_new'] = 'one'
    df.loc[df['Name2_sum'] > 1 , 'Name2_new'] = df['Name2']
    del df['Name2']
    df = pd.get_dummies(df,columns=['Name2_new'])
    del df['Name']



    df['Ticket_Letter'] = df['Ticket'].str.split().str[0]
    df['Ticket_Letter'] = df['Ticket_Letter'].apply(lambda x:np.nan if x.isnumeric() else x)
    df.drop('Ticket',inplace=True,axis=1)
    df = pd.get_dummies(df,columns=['Ticket_Letter'],drop_first=True)




    df.loc[df["Age"].isnull() ,"age_nan"] = 1
    df.loc[df["Age"].notnull() ,"age_nan"] = 0
    df = pd.get_dummies(df,columns=['age_nan'])

    missing_age = df.drop(['Survived','Cabin'],axis=1)

    #分列处理


    #将Age完整的项作为训练集、将Age缺失的项作为测试集。
    missing_age_train = missing_age[missing_age['Age'].notnull()]
    missing_age_test = missing_age[missing_age['Age'].isnull()]
    missing_age_X_train = missing_age_train.drop(['Age'], axis=1)
    missing_age_Y_train = missing_age_train['Age']
    missing_age_X_test = missing_age_test.drop(['Age'], axis=1)

    # 先将数据标准化
    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler()
    #用测试集训练并标准化


    ss.fit(missing_age_X_train)
    missing_age_X_train = ss.transform(missing_age_X_train)
    missing_age_X_test = ss.transform(missing_age_X_test)
    lin = linear_model.BayesianRidge()
    lin.fit(missing_age_X_train,missing_age_Y_train)
    df.loc[(df['Age'].isnull()), 'Age'] = lin.predict(missing_age_X_test)

    df['Age'] = pd.cut(df['Age'], bins=[0,10,18,30,50,100],labels=[1,2,3,4,5])
    df = pd.get_dummies(df,columns=['Age'])


    df['Cabin_nan'] = df['Cabin'].apply(lambda x:str(x)[0] if pd.notnull(x) else x)
    df = pd.get_dummies(df,columns=['Cabin_nan'])
    df.loc[df["Cabin"].isnull() ,"Cabin_nan"] = 1
    df.loc[df["Cabin"].notnull() ,"Cabin_nan"] = 0
    df = pd.get_dummies(df,columns=['Cabin_nan'])
    df.drop(['Cabin', 'PassengerId'],axis=1,inplace=True)
    
    return df


In [6]:
def split_data(df, train_num):
    train_data = df[:train_num]
    test_data = df[train_num:]
    train_data_X = train_data.drop(['Survived'],axis=1)
    train_data_Y = train_data['Survived']
    test_data_X = test_data.drop(['Survived'],axis=1)
    return train_data_X, train_data_Y, test_data_X

test_data['Survived'] = 0
train_test = train_data.append(test_data)
train_test.info()
parsed_data = feature_parse(train_test)


train_data_X, train_data_Y, test_data_X = split_data(parsed_data, train_data.shape[0])
train_data_X.head()
train_data_X.info()

ss2 = StandardScaler()
ss2.fit(train_data_X)
train_data_X_sd = ss2.transform(train_data_X)
test_data_X_sd = ss2.transform(test_data_X)
train_data_X_sd


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       1309 non-null int64
Ticket         1309 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 132.9+ KB


  return self.partial_fit(X, y)


Unnamed: 0,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,...,Cabin_nan_A,Cabin_nan_B,Cabin_nan_C,Cabin_nan_D,Cabin_nan_E,Cabin_nan_F,Cabin_nan_G,Cabin_nan_T,Cabin_nan_0.0,Cabin_nan_1.0
0,7.25,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,71.2833,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
2,7.925,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,53.1,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,8.05,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Columns: 195 entries, Fare to Cabin_nan_1.0
dtypes: float64(1), int64(1), uint8(193)
memory usage: 188.8 KB


  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)



array([[-0.50244517, -0.56568542, -0.51015154, ..., -0.03352008,
        -0.54492498,  0.54492498],
       [ 0.78684529,  1.76776695, -0.51015154, ..., -0.03352008,
         1.835115  , -1.835115  ],
       [-0.48885426, -0.56568542, -0.51015154, ..., -0.03352008,
        -0.54492498,  0.54492498],
       ...,
       [-0.17626324, -0.56568542, -0.51015154, ..., -0.03352008,
        -0.54492498,  0.54492498],
       [-0.04438104,  1.76776695, -0.51015154, ..., -0.03352008,
         1.835115  , -1.835115  ],
       [-0.49237783, -0.56568542, -0.51015154, ..., -0.03352008,
        -0.54492498,  0.54492498]])

In [7]:
train_data_X.head()
train_data_X.info()

Unnamed: 0,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,...,Cabin_nan_A,Cabin_nan_B,Cabin_nan_C,Cabin_nan_D,Cabin_nan_E,Cabin_nan_F,Cabin_nan_G,Cabin_nan_T,Cabin_nan_0.0,Cabin_nan_1.0
0,7.25,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,71.2833,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
2,7.925,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,53.1,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,8.05,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Columns: 195 entries, Fare to Cabin_nan_1.0
dtypes: float64(1), int64(1), uint8(193)
memory usage: 188.8 KB


In [None]:
def eval_result(model, x, y):
    pred = model.predict(x)
    acc = accuracy_score(pred, y)
    auc = roc_auc_score(pred, y)
    
    return dict(acc=acc,auc=auc)

eval_result(best_model, train_data_X_sd, train_data_Y)

In [16]:
def grid_train(clazz, best_params, grid_params, n_fold=5, scoring= 'roc_auc'):
    best_params = dict(n_estimators = 115, max_depth = 15, learning_rate=0.05,
                   subsample= 0.1, reg_alpha=0.05, reg_lambda=0.0, num_leaves=48)
    model = clazz(**best_params)
    clf = GridSearchCV(model, grid_params, cv=n_fold, n_jobs=1, verbose=1, scoring=scoring)
    clf.fit(train_data_X_sd, train_data_Y)
    detail = clf.cv_results_
    best_params.update(clf.best_params_)
    best_model = clazz(**best_params)
    return detail, best_params, best_model
    


In [18]:
best_params = {'learning_rate': 0.05,
  'max_depth': 15,
  'n_estimators': 100,
  'num_leaves': 48,
  'reg_alpha': 0.05,
  'reg_lambda': 0.0,
  'subsample': 0.1},

grid_params = dict(n_estimators = [50,75,100, 150,200],
                  subsample= [0.1,0.2,0.3,0.4,0.5],
                   max_depth = [5,10,15,20,25])
clazz = lgbm.LGBMClassifier
detail, nest_param, best_lgbm = grid_train(clazz, best_params, grid_params)



Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 625 out of 625 | elapsed: 10.3min finished


({'mean_fit_time': array([0.18048449, 0.14693427, 0.14192662, 0.15140514, 0.16030822,
         0.19293785, 0.1913074 , 0.22345662, 0.25505118, 0.21579518,
         0.23977504, 0.22718439, 0.22665749, 0.24056678, 0.29609962,
         0.36091838, 0.37112679, 0.34546738, 0.36721506, 0.33169899,
         0.41961823, 0.44675999, 0.45850124, 0.45239191, 0.46020679,
         0.25366445, 0.23899188, 0.23527775, 0.24234452, 0.24955258,
         0.3628376 , 0.35122385, 0.34037585, 0.37172174, 0.39317956,
         0.395086  , 0.40511475, 0.45792074, 0.4497056 , 0.4398284 ,
         0.67002144, 0.67839317, 0.68353372, 0.59023185, 0.56567154,
         0.77704201, 0.75448503, 0.7577548 , 0.8584466 , 0.78418365,
         0.22914047, 0.23799672, 0.22862005, 0.22126923, 0.22194157,
         0.32851152, 0.35017743, 0.36000419, 0.33453588, 0.32858839,
         0.43997941, 0.47943783, 0.44619222, 0.46690359, 0.43763986,
         0.64204993, 0.70734868, 0.70109739, 0.71646714, 0.71245837,
         0.889398

In [122]:
from sklearn.linear_model import LogisticRegression

best_params = dict(max_iter=80, C= 0.01)
model = LogisticRegression(**best_params)

param = dict(C=[0.01, 0.001, 0.02, 0.005])

clf = GridSearchCV(model, param, cv=5, n_jobs=1, verbose=1, scoring="roc_auc")
clf.fit(train_data_X_sd, train_data_Y)

# 打印参数的得分情况
clf.cv_results_
# 打印最佳参数
clf.best_params_
clf.best_score_
best_lr = clf.best_estimator_




Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=80, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'C': [0.01, 0.001, 0.02, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)



{'mean_fit_time': array([0.02083015, 0.00884614, 0.01865702, 0.00990977]),
 'mean_score_time': array([0.00327182, 0.00207925, 0.00203381, 0.00151067]),
 'mean_test_score': array([0.86934507, 0.85611224, 0.86873922, 0.86797817]),
 'mean_train_score': array([0.92173275, 0.89340411, 0.92756923, 0.91425499]),
 'param_C': masked_array(data=[0.01, 0.001, 0.02, 0.005],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.01}, {'C': 0.001}, {'C': 0.02}, {'C': 0.005}],
 'rank_test_score': array([1, 4, 2, 3], dtype=int32),
 'split0_test_score': array([0.84505929, 0.82134387, 0.84374177, 0.84453228]),
 'split0_train_score': array([0.92263886, 0.89482006, 0.92812085, 0.91554649]),
 'split1_test_score': array([0.82503294, 0.82747036, 0.82252964, 0.82608696]),
 'split1_train_score': array([0.93183809, 0.90213355, 0.93718658, 0.92416164]),
 'split2_test_score': array([0.88983957, 0.87513369, 0.89064171, 0.88756684]),
 'split2_train_s

{'C': 0.01}

0.869345065786924

In [126]:

best_params = dict(n_estimators=80,min_samples_leaf=2,max_depth=7,oob_score=True)
model = RandomForestClassifier(**best_params)

param = dict(min_samples_leaf=[1,2,3,4,5])

clf = GridSearchCV(model, param, cv=5, n_jobs=1, verbose=1, scoring="roc_auc")
clf.fit(train_data_X_sd, train_data_Y)

# 打印参数的得分情况
clf.cv_results_
# 打印最佳参数
clf.best_params_
clf.best_score_
best_rf = clf.best_estimator_



Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    3.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'min_samples_leaf': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)



{'mean_fit_time': array([0.12622166, 0.10513282, 0.10966315, 0.11049008, 0.09939613]),
 'mean_score_time': array([0.00973563, 0.00945997, 0.0097127 , 0.00999851, 0.00905838]),
 'mean_test_score': array([0.86897652, 0.87158194, 0.8679634 , 0.86881835, 0.86486321]),
 'mean_train_score': array([0.9269335 , 0.90976297, 0.90032629, 0.89670601, 0.89153403]),
 'param_min_samples_leaf': masked_array(data=[1, 2, 3, 4, 5],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'min_samples_leaf': 1},
  {'min_samples_leaf': 2},
  {'min_samples_leaf': 3},
  {'min_samples_leaf': 4},
  {'min_samples_leaf': 5}],
 'rank_test_score': array([2, 1, 4, 3, 5], dtype=int32),
 'split0_test_score': array([0.85      , 0.85652174, 0.85461133, 0.85375494, 0.84018445]),
 'split0_train_score': array([0.92888016, 0.91581767, 0.90283862, 0.90041052, 0.8982995 ]),
 'split1_test_score': array([0.83017128, 0.83214756, 0.83234519, 0.83043478, 0.8326087 ])

{'min_samples_leaf': 2}

0.8715819360838856

In [70]:

rf = RandomForestClassifier(n_estimators=150,min_samples_leaf=3,max_depth=6,oob_score=True)
rf.fit(train_data_X,train_data_Y)

test_data["Survived"] = rf.predict(test_data_X)
RF = test_data[['PassengerId','Survived']].set_index('PassengerId')
RF.to_csv('RF.csv')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [139]:
from sklearn import svm

best_params = dict(C=0.5,max_iter=350, probability=True)
model = svm.SVC(**best_params)

param = dict(C=[0.5,1,1.5])

clf = GridSearchCV(model, param, cv=5, n_jobs=1, verbose=1, scoring="roc_auc")
clf.fit(train_data_X_sd, train_data_Y)

# 打印参数的得分情况
clf.cv_results_
# 打印最佳参数
clf.best_params_
clf.best_score_
best_svm = clf.best_estimator_


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    8.3s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=350, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'C': [0.5, 1, 1.5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=1)



{'mean_fit_time': array([0.45916414, 0.46238079, 0.44962864]),
 'mean_score_time': array([0.02009602, 0.02110949, 0.01882019]),
 'mean_test_score': array([0.8382128 , 0.83616546, 0.83446574]),
 'mean_train_score': array([0.9011646 , 0.9198269 , 0.92507828]),
 'param_C': masked_array(data=[0.5, 1, 1.5],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.5}, {'C': 1}, {'C': 1.5}],
 'rank_test_score': array([1, 2, 3], dtype=int32),
 'split0_test_score': array([0.79789196, 0.80513834, 0.80948617]),
 'split0_train_score': array([0.90741112, 0.9243911 , 0.92929735]),
 'split1_test_score': array([0.81541502, 0.81607378, 0.81620553]),
 'split1_train_score': array([0.90666433, 0.91674385, 0.9264312 ]),
 'split2_test_score': array([0.85508021, 0.84705882, 0.85427807]),
 'split2_train_score': array([0.90347173, 0.92063914, 0.92755599]),
 'split3_test_score': array([0.83596257, 0.83034759, 0.82299465]),
 'split3_train_score': array([0.

{'C': 0.5}

0.8382127976251005

In [19]:
def get_voting_model(model_dict, x, y, voting= 'hard'):
    vot_model = VotingClassifier(estimators=model_dict.items(),
                           voting='soft')
    vot_model.fit(x,y)
    return vot_model
    
# model_dict.update(vot_hard = vot_hard)

In [206]:
def predict(model, x, adjust=0.37320):    
    arr = model.predict(x)
    print("origin rate:", sum(arr)/arr.shape[0])
    if adjust:
        tmp = model.predict_proba(x)[:,-1]
        rank = int(tmp.shape[0] * adjust)
        threshold = np.sort(tmp)[::-1][rank]
        arr = np.array([1 if e>=threshold else 0 for e in tmp])
    rs_df = pd.DataFrame(arr, columns=['Survived'], dtype=np.int)   
    return rs_df


df = predict(vot, train_data_X_sd)
rs_df = get_result(test_data, df, label=  True)
rs_df.describe()
df.describe()



    
    
    
    
    
    
    

0.35353535353535354


Unnamed: 0,PassengerId,Survived
count,418.0,891.0
mean,1100.5,0.373737
std,120.810458,0.484067
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


Unnamed: 0,Survived
count,891.0
mean,0.373737
std,0.484067
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [150]:
from imp import reload
from utils import  get_today_str






def get_result(df, label_arr, label, record=False):
    id_series = df['PassengerId'].astype(np.int)
    label_series = label_arr
    rs_df = pd.concat([id_series, label_series], axis=1)
    if record:
        path = 'submit/titanic/{label}-{day}.csv'.format(day=get_today_str(), label=label)
        rs_df.to_csv(path,index=False)
    return rs_df

test_y_df = predict(best_model, test_data_X_sd)
# test_y = np.ones(test_data_X.shape[0])

# test_y
# test_data








0.37320574162679426


In [170]:
# 划分train数据集,调用代码,把数据集名字转成和代码一样
def staking_model(X, y, X_predict, clfs, n_folds-5):
    

    skf = StratifiedKFold(n_splits=n_folds)
    skf.get_n_splits(X, y)

    secode_x = np.zeros((X.shape[0], len(clfs)))
    secode_test_x = np.zeros((X_predict.shape[0], len(clfs)))

    for i, m in enumerate(clfs):    
        pred_matrix = np.zeros((X_predict.shape[0], n_folds))

        for j, (train_idx, test_idx) in enumerate(skf.split(X,y)):
            tmp_x = X[train_idx]
            tmp_y = y[train_idx]
            pred_x = X[test_idx]
            m = m.fit(tmp_x, tmp_y)
            secode_x[test_idx, i] = m.predict_proba(pred_x)[:,1]
            pred_matrix[:,j] = m.predict_proba(X_predict)[:,1]

        secode_test_x[:,i] = pred_matrix.mean(axis=1)
        clf2.fit(secode_x, y)
        tmp = predict(clf2,secode_test_x)
        return tmp
        
                
        
        
        
        
        
        
        
        
        
        
        



5



array([[0.21834427, 0.1958945 , 0.15457846, 0.07003827],
       [0.93222684, 0.89787837, 0.91714896, 0.97558928],
       [0.45517885, 0.47935707, 0.31742547, 0.37637343],
       ...,
       [0.5043876 , 0.46928658, 0.45154411, 0.29441349],
       [0.43776052, 0.36467748, 0.20917513, 0.57883151],
       [0.13221553, 0.11764616, 0.17126467, 0.01318203]])

array([[0.118446  , 0.10776044, 0.15508261, 0.01205484],
       [0.65619667, 0.54203836, 0.83882433, 0.42008801],
       [0.22668474, 0.14184665, 0.15494618, 0.03299659],
       ...,
       [0.13857309, 0.14245641, 0.14519169, 0.02071692],
       [0.08675219, 0.10816176, 0.16687677, 0.06045393],
       [0.65652502, 0.55865447, 0.79386104, 0.84697805]])

In [177]:
clf2 = LogisticRegression(C=0.1,max_iter=100)
clf2.fit(secode_x, train_data_Y)
tmp = predict(clf2,secode_test_x)
tmp.shape

get_result(test_data, tmp, label='stacking', record=True)





LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

0.3708133971291866


(418, 1)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [211]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Activation
from keras import regularizers

train_data_X_sd.shape
train_data_Y.reshape((-1,1)).shape


k_model = Sequential()
k_model.add(Dense(units=256, input_dim = train_data_X_sd.shape[1], activation="relu"))
k_model.add(Dense(units=128,activation="relu"))
k_model.add(Dense(units=64,activation="relu"))
k_model.add(Dense(units=64,activation="relu"))
k_model.add(Dense(units=1,activation="sigmoid",activity_regularizer=regularizers.l2(0.02)))



k_model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
k_model.fit(train_data_X_sd, train_data_Y, validation_split=0.1, epochs=50, batch_size=32,
            callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=0,
                              verbose=1, mode='auto')])








# k_model = 

(891, 195)

  


(891, 1)

ValueError: Unknown metric function:roc-auc

In [213]:
tmp = predict(k_model, test_data_X_sd)
get_result(test_data, tmp, 'keras', True)

# k_model.predict_proba(train_data_X_sd)

[0.47081625]


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
5,897,1
6,898,0
7,899,1
8,900,1
9,901,1
