In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.facecolor']=(1,1,1,1) # pycharm 绘图白底，看得清坐标
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

train = pd.read_csv("./train_set.csv")
test = pd.read_csv("./test_set.csv")

In [3]:
# 对 'default','housing','loan' 3列二值(yes,no)特征转为 0，1
def binaryFeature(data):
    data['default_']=0
    data['default_'][data['default']=='yes'] = 1
    data['housing_']=0
    data['housing_'][data['housing']=='yes'] = 1
    data['loan_']=0
    data['loan_'][data['loan']=='yes'] = 1
    return data.drop(['default','housing','loan'], axis=1)

X_train = binaryFeature(train)
X_test = binaryFeature(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [4]:
X_train = X_train.drop(['ID'], axis=1)
X_test = X_test.drop(['ID'], axis=1)

# 将训练集拆分一些出来做验证, 分层抽样
from sklearn.model_selection import StratifiedShuffleSplit
splt = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
for train_idx, vaild_idx in splt.split(X_train, X_train['y']):
    train_part = X_train.loc[train_idx]
    valid_part = X_train.loc[vaild_idx]

# 训练集拆成两部分 本地测试
train_part_y = train_part['y']
valid_part_y = valid_part['y']
train_part = train_part.drop(['y'], axis=1)
valid_part = valid_part.drop(['y'], axis=1)

In [5]:
def num_cat_splitor(X_train):
    s = (X_train.dtypes == 'object')
    object_cols = list(s[s].index)
    num_cols = list(set(X_train.columns) - set(object_cols))
    return num_cols, object_cols

num_cols, object_cols = num_cat_splitor(X_train)
num_cols.remove('y')

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_cols)),
#         ('imputer', SimpleImputer(strategy="median")),
#         ('std_scaler', StandardScaler()),
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(object_cols)),
        ('cat_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ])
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [6]:
# 本地测试，选模型
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier()
knn = KNeighborsClassifier()
lr = LogisticRegression()
svc = SVC(probability=True)
gbdt = GradientBoostingClassifier()

In [None]:
models = [knn, lr, svc, rf, gbdt]
param_grid_list = [
    # knn
    [{
        'model__n_neighbors' : [5,15,35,50,100],
        'model__leaf_size' : [10,20,30,40,50]
    }],
    # lr
    [{
        'model__penalty' : ['l1', 'l2'],
        'model__C' : [0.2, 0.5, 1, 1.2, 1.5],
        'model__max_iter' : [10000]
    }],
    # svc
    [{
        'model__C' : [0.2, 0.5, 1, 1.2],
        'model__kernel' : ['rbf']
    }],
    # rf
    [{
    #     'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
        'model__n_estimators' : [200,250,300,330,350],
        'model__max_features' : [20,30,40,50],
        'model__max_depth' : [5,7]
    }],
    # gbdt
    [{
        'model__learning_rate' : [0.1, 0.5],
        'model__n_estimators' : [130, 200, 300],
        'model__max_features' : ['sqrt'],
        'model__max_depth' : [5,7],
        'model__min_samples_split' : [500,1000,1200],
        'model__min_samples_leaf' : [60, 100],
        'model__subsample' : [0.8, 1]
    }],
]

for i, model in enumerate(models):
    pipe = Pipeline([
        ('preparation', full_pipeline),
        ('model', model)
    ])
    grid_search = GridSearchCV(pipe, param_grid_list[i], cv=3,
                                    scoring='roc_auc', verbose=2, n_jobs=-1)
    grid_search.fit(train_part, train_part_y)
    print(grid_search.best_params_)
    final_model = grid_search.best_estimator_
    pred = final_model.predict_proba(valid_part)[:,1] # roc 必须使用概率预测
    print("auc score: ", roc_auc_score(valid_part_y, pred))

In [None]:
# 全量训练，网格搜索参数，提交
y_train = X_train['y']
X_train_ = X_train.drop(['y'], axis=1)

select_model = [rf, gbdt]
param_grid_list = [
    # rf
    [{
    #     'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
        'model__n_estimators' : [250,300,350,400],
        'model__max_features' : [7,8,10,15,20],
        'model__max_depth' : [7,9,10,11]
    }],
    # gbdt
    [{
        'model__learning_rate' : [0.03, 0.05, 0.1],
        'model__n_estimators' : [200, 300, 350],
        'model__max_features' : ['sqrt'],
        'model__max_depth' : [7,9,11],
        'model__min_samples_split' : [300, 400, 500],
        'model__min_samples_leaf' : [50,60,70],
        'model__subsample' : [0.8, 1, 1.2]
    }],
]

for i, model in enumerate(select_model):
    pipe = Pipeline([
        ('preparation', full_pipeline),
        ('model', model)
    ])
    grid_search = GridSearchCV(pipe, param_grid_list[i], cv=3,
                                    scoring='roc_auc', verbose=2, n_jobs=-1)
    grid_search.fit(X_train_, y_train)
    print(grid_search.best_params_)
    final_model = grid_search.best_estimator_
    pred = final_model.predict_proba(X_test)[:,1] # roc 必须使用概率预测
    print(model,'\n finished!')
    result = pd.DataFrame()
    result['ID'] = test['ID']
    result['pred'] = pred
    result.to_csv('{}_pred.csv'.format(i), index=False)

In [9]:
# 随机搜索参数
y_train = X_train['y']
X_train_ = X_train.drop(['y'], axis=1)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
select_model = [rf, gbdt]
param_distribs = [
    # rf
    [{
    #     'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
        'model__n_estimators' : randint(low=250, high=500),
        'model__max_features' : randint(low=10, high=30),
        'model__max_depth' : randint(low=8, high=20)
    }],
    # gbdt
    [{
        'model__learning_rate' : np.linspace(0.01, 0.1, 10),
        'model__n_estimators' : randint(low=250, high=500),
        'model__max_features' : ['sqrt'],
        'model__max_depth' : randint(low=8, high=20),
        'model__min_samples_split' : randint(low=400, high=1000),
        'model__min_samples_leaf' : randint(low=40, high=80),
        'model__subsample' : np.linspace(0.5, 1.5, 10)
    }],
]

for i, model in enumerate(select_model):
    pipe = Pipeline([
        ('preparation', full_pipeline),
        ('model', model)
    ])
    rand_search = RandomizedSearchCV(pipe, param_distributions=param_distribs[i], cv=3,
                                    n_iter=20,scoring='roc_auc', verbose=2, n_jobs=-1)
    rand_search.fit(X_train_, y_train)
    print(rand_search.best_params_)
    final_model = rand_search.best_estimator_
    pred = final_model.predict_proba(X_test)[:,1] # roc 必须使用概率预测
    print(model,'\n finished!')
    result = pd.DataFrame()
    result['ID'] = test['ID']
    result['pred'] = pred
    result.to_csv('{}_pred.csv'.format(i), index=False)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min


KeyboardInterrupt: 

In [22]:
# 全量训练，提交
y_train = X_train['y']
X_train_ = X_train.drop(['y'], axis=1)

select_model = [rf, gbdt]
param_grid_list = [
    # rf
    [{
    #     'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
        'model__n_estimators' : [250,300,350,400],
        'model__max_features' : [10,15,20,30,40],
        'model__max_depth' : [5,7,9]
    }],
    # gbdt
    [{
        'model__learning_rate' : [0.05, 0.1, 0.2],
        'model__n_estimators' : [130, 200, 300],
        'model__max_features' : ['sqrt'],
        'model__max_depth' : [5,7,9],
        'model__min_samples_split' : [400, 500,1000],
        'model__min_samples_leaf' : [50,60],
        'model__subsample' : [0.8, 1, 1.2]
    }],
]

for i, model in enumerate(select_model):
    pipe = Pipeline([
        ('preparation', full_pipeline),
        ('model', model)
    ])
    grid_search = GridSearchCV(pipe, param_grid_list[i], cv=3,
                                    scoring='roc_auc', verbose=2, n_jobs=-1)
    grid_search.fit(X_train_, y_train)
    print(grid_search.best_params_)
    final_model = grid_search.best_estimator_
    pred = final_model.predict_proba(X_test)[:,1] # roc 必须使用概率预测
    print(model,'\n finished!')
    result = pd.DataFrame()
    result['ID'] = test['ID']
    result['pred'] = pred
    result.to_csv('{}_pred.csv'.format(i), index=False)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
{'model__max_depth': 9, 'model__max_features': 10, 'model__n_estimators': 300}
RandomForestClassifier() 
 finished!
Fitting 3 folds for each of 486 candidates, totalling 1458 fits
{'model__learning_rate': 0.05, 'model__max_depth': 9, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 60, 'model__min_samples_split': 400, 'model__n_estimators': 300, 'model__subsample': 1}
GradientBoostingClassifier() 
 finished!


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   54.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 11.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 23.8min
[Parallel(n_jobs=-1)]: Done 1458 out of 1458 | elapsed: 24.3min finished
