# 数据预处理

In [1]:
import pandas as pd

In [2]:
#删除缺失值比较高的列
def drop_col(df,null_ratio):
    total = len(df.index)
    for i in df.columns:
        null_cnt = total - df[i].count()
        if 1.0*null_cnt/total > 0.3:
            df = df.drop([i],axis=1)
    return df

#嵌入式选取特征
def Select_Best_X(X,y,C_):
    from sklearn.svm import LinearSVC
    from sklearn.feature_selection import SelectFromModel
    lsvc = LinearSVC(C = C_, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = pd.DataFrame(model.transform(X))
    col_lst = []
    for i in range(len(lsvc.coef_[0])):
        if lsvc.coef_[0][i] != 0:
            col_lst.append(X.columns[i])
    return X_new,col_lst 

In [3]:
#获取数据集
data = pd.read_csv('model_data.csv')
train = data[data['dataset'] == 'train']
x = train.drop(['label','id','dataset'],axis=1)
y = train['label']

In [4]:
#删除缺失值比较高的列
X = drop_col(x,0.3)

In [5]:
#缺失值填充
import copy
X_ = copy.copy(X)
X_.fillna(X_.median(),inplace=True)

In [6]:
#特征筛选
y = train['label']
X_new,X_col = Select_Best_X(X_, y,0.05)

In [7]:
#数据集划分
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_new,y,test_size = 0.3,random_state = 1)

# Xgboost模型

In [8]:
#导入相关包
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


In [9]:
#Xgboost模型
def modelfit(alg, X_train, y_train, X_test, y_test, useTrainCV=True, cv_folds = 5, early_stopping_rounds=50):
    
    #使用CV交叉验证
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label = y_train )
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    #模型训练
    alg.fit(X_train, y_train, eval_metric='auc')

    #模型预测
    dtrain_predictions = alg.predict(X_test)
    dtrain_predprob = alg.predict_proba(X_test)[:,1]

    #模型检验
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, dtrain_predictions))
    print("AUC Score (test): %f" % metrics.roc_auc_score(y_test, dtrain_predprob))
    return alg

In [10]:
#最优深度及权重
param_test_xgboost = {
 'max_depth':list(range(3,11,2)),
 'min_child_weight':list(range(1,6,1))
}
gsearch_xg = GridSearchCV(estimator = XGBClassifier( learning_rate =0.05, n_estimators=1000,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test_xgboost, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch_xg.fit(X_train,y_train)
gsearch_xg.grid_scores_,gsearch_xg.best_params_, gsearch_xg.best_score_  



([mean: 0.89511, std: 0.03565, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.88699, std: 0.03336, params: {'max_depth': 3, 'min_child_weight': 2},
  mean: 0.88363, std: 0.03527, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.88414, std: 0.03547, params: {'max_depth': 3, 'min_child_weight': 4},
  mean: 0.88433, std: 0.03737, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.90415, std: 0.03401, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.89914, std: 0.03496, params: {'max_depth': 5, 'min_child_weight': 2},
  mean: 0.89227, std: 0.03951, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.88953, std: 0.03802, params: {'max_depth': 5, 'min_child_weight': 4},
  mean: 0.88542, std: 0.04038, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.90884, std: 0.03425, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.90196, std: 0.03910, params: {'max_depth': 7, 'min_child_weight': 2},
  mean: 0.89666, std: 0.03866, params: {

In [11]:
#最终参数模型训练
xgb1 = XGBClassifier(
 learning_rate =0.05,
 n_estimators=1000,
 max_depth=gsearch_xg.best_params_['max_depth'],
 min_child_weight=gsearch_xg.best_params_['min_child_weight'],
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

Xgboost_model = modelfit(xgb1, X_train, y_train, X_test, y_test)

  if getattr(data, 'base', None) is not None and \



Model Report
Accuracy : 0.9411
AUC Score (test): 0.918764


  if diff:


In [12]:
#测试数据集处理
test = data[data['dataset'] == 'test']
X_pred = test[X_col]
X_pred.fillna(X_.median(),inplace=True)
X_pred.columns = X_new.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [13]:
#测试数据集预测
Xgboost_pred = Xgboost_model.predict_proba(X_pred)[:,1]

# 随机森林

In [14]:
#随机森林 
import pandas as pd  
import numpy as np  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.grid_search import GridSearchCV  
from sklearn import cross_validation, metrics  



In [15]:
#使用默认参数做拟合
rf0 = RandomForestClassifier(oob_score=True, random_state=10)  
rf0.fit(X_train,y_train)
rf_y_pred = rf0.predict(X_test)
rf_y_predprob = rf0.predict_proba(X_test)[:,1]
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, rf_y_pred))
print("AUC Score (test): %f" % metrics.roc_auc_score(y_test,rf_y_predprob)) 

Accuracy : 0.9222
AUC Score (test): 0.901398


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [16]:
#最优深度及权重
param_test_rf= {'max_depth':list(range(3,14,2)), 'min_samples_split':list(range(50,201,20))}  
gsearch_rf= GridSearchCV(estimator = RandomForestClassifier(n_estimators= 100,  
                                 min_samples_leaf=20,max_features='sqrt' ,oob_score=True,random_state=10),  
   param_grid = param_test_rf,scoring='roc_auc',iid=False, cv=5)  
gsearch_rf.fit(X_new,y)  
gsearch_rf.grid_scores_,gsearch_rf.best_params_, gsearch_rf.best_score_  

([mean: 0.84736, std: 0.02738, params: {'max_depth': 3, 'min_samples_split': 50},
  mean: 0.84529, std: 0.02867, params: {'max_depth': 3, 'min_samples_split': 70},
  mean: 0.84590, std: 0.02882, params: {'max_depth': 3, 'min_samples_split': 90},
  mean: 0.84469, std: 0.02829, params: {'max_depth': 3, 'min_samples_split': 110},
  mean: 0.84431, std: 0.02893, params: {'max_depth': 3, 'min_samples_split': 130},
  mean: 0.84396, std: 0.02923, params: {'max_depth': 3, 'min_samples_split': 150},
  mean: 0.84328, std: 0.02918, params: {'max_depth': 3, 'min_samples_split': 170},
  mean: 0.84319, std: 0.02912, params: {'max_depth': 3, 'min_samples_split': 190},
  mean: 0.86073, std: 0.02533, params: {'max_depth': 5, 'min_samples_split': 50},
  mean: 0.85883, std: 0.02258, params: {'max_depth': 5, 'min_samples_split': 70},
  mean: 0.85522, std: 0.02284, params: {'max_depth': 5, 'min_samples_split': 90},
  mean: 0.85534, std: 0.02231, params: {'max_depth': 5, 'min_samples_split': 110},
  mean: 0.

In [17]:
#调参结果不理想，使用默认参数，测试数据集预测
RF_pred = rf0.predict_proba(X_pred)[:,1]

# LR模型

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import stochastic_gradient
from sklearn.metrics import classification_report

In [19]:
#标准化数据，保证每个维度的特征数据方差为1，均值为0。使得预测结果不会被某些维度过大的特征值而主导。
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test=ss.transform(X_test)
 
#使用逻辑斯蒂回归
lr = LogisticRegression()   
lr.fit(X_train,y_train)   
lr_y_pred = lr.predict(X_test)
lr_y_predprob = lr.predict_proba(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, rf_y_pred))
print("AUC Score (test): %f" % metrics.roc_auc_score(y_test,rf_y_predprob))

Accuracy : 0.9222
AUC Score (test): 0.901398


In [20]:
#调参
#容忍度以及惩罚系数
param_test_lr= {'penalty':['l1','l2'],'tol':list(np.arange(1e-5,1e-3,5e-5)),'C':list(np.arange(0.1,2,0.2))}  
gsearch_lr= GridSearchCV(estimator = LogisticRegression(),
    param_grid = param_test_lr,scoring='roc_auc',iid=False, cv=5)
gsearch_lr.fit(X_train,y_train)  
gsearch_lr.grid_scores_,gsearch_lr.best_params_, gsearch_lr.best_score_  

([mean: 0.84014, std: 0.02467, params: {'C': 0.1, 'penalty': 'l1', 'tol': 1e-05},
  mean: 0.84015, std: 0.02466, params: {'C': 0.1, 'penalty': 'l1', 'tol': 6e-05},
  mean: 0.84014, std: 0.02466, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00011},
  mean: 0.84015, std: 0.02466, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00016},
  mean: 0.84013, std: 0.02469, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00021},
  mean: 0.84014, std: 0.02468, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00026000000000000003},
  mean: 0.84013, std: 0.02469, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00031000000000000005},
  mean: 0.84012, std: 0.02468, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00036},
  mean: 0.84011, std: 0.02468, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00041000000000000005},
  mean: 0.84013, std: 0.02469, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00046000000000000007},
  mean: 0.84014, std: 0.02468, params: {'C': 0.1, 'penalty': 'l1', 'tol': 0.00051},
  mean: 0.84015, std

In [21]:
#调参结果不理想，使用默认参数，测试数据集预测
X_pred=ss.transform(X_pred)
Lr_pred = lr.predict_proba(X_pred)[:,1]

# 输出最终预测结果

In [22]:
result = pd.DataFrame(index=test.index)
result['lr'] = Lr_pred
result['xgboost'] = Xgboost_pred
result['rf'] = RF_pred

In [25]:
result

Unnamed: 0,lr,xgboost,rf
3000,0.080635,0.013583,0.0
3001,0.065846,0.232370,0.1
3002,0.064648,0.165907,0.5
3003,0.034393,0.000860,0.0
3004,0.034393,0.000860,0.0
3005,0.078036,0.006548,0.1
3006,0.048746,0.004114,0.1
3007,0.032291,0.095668,0.1
3008,0.020689,0.001728,0.0
3009,0.020689,0.001728,0.0


In [26]:
result.to_csv('task_1_results.csv')