In [1]:
import pandas as pd

# 原始数据导入

In [2]:
# Load the raw training and test tables and report their (rows, columns) shapes.
train_path = './data/jiushu_train.csv'
test_path = './data/jiushu_test.csv'
train_org = pd.read_csv(train_path)
test_org = pd.read_csv(test_path)
print(train_org.shape, test_org.shape)

(210000, 146) (90000, 146)


In [3]:
# Summarise the training frame: index range, column count, dtypes and memory use.
train_org.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210000 entries, 0 to 209999
Columns: 146 entries, user_pin to label
dtypes: int64(145), object(1)
memory usage: 233.9+ MB


In [4]:
# Features: skip the first column (user_pin) and the last two columns; target: 'label'.
# NOTE(review): the slice 1:-2 also drops the column just before 'label' -- confirm
# that column is meant to be excluded from the features.
feature_cols = slice(1, -2)
train_x, test_x = train_org.iloc[:, feature_cols], test_org.iloc[:, feature_cols]
train_y, test_y = train_org['label'], test_org['label']

In [5]:
# Sanity check: (rows, columns) of the training feature matrix.
train_x.shape

(210000, 143)

In [6]:
# Sum of the training labels (the number of positives, assuming labels are 0/1 -- TODO confirm).
train_y.sum()

69857

In [7]:
# Sum of the test labels (the number of positives, assuming labels are 0/1 -- TODO confirm).
test_y.sum()

30143

# 数据预处理

## 每个特征缺失值比例查看

In [8]:
# Plain Python list of the feature column names, in column order.
name_list = train_x.columns.tolist()

In [9]:
# Fraction of missing values in a single feature
def get_null_percent(chara_name, df=None):
    """Return the fraction of missing (NaN/None) entries in one column.

    Parameters
    ----------
    chara_name : column label to inspect.
    df : optional DataFrame to read the column from. When omitted, the
        notebook-level ``train_org`` frame is used, as before.

    Returns
    -------
    float in [0.0, 1.0]; 0.0 for a column with no rows.
    """
    if df is None:
        df = train_org
    null_mask = df[chara_name].isnull()
    # An empty column has no missing values; avoid dividing by zero.
    if len(null_mask) == 0:
        return 0.0
    # The mask holds booleans, so its mean is the share of True entries.
    # (Comparing the entries to the string 'true' never matches.)
    return float(null_mask.mean())

In [10]:
# Spot-check the helper on the first feature column.
get_null_percent(name_list[0])

0.0

In [11]:
# Fraction of missing values for every feature
def get_total_null_character(character_name_list):
    """Map each name in ``character_name_list`` to ``get_null_percent(name)``.

    Returns a dict keyed by feature name, in the order the names were given.
    """
    return {name: get_null_percent(name) for name in character_name_list}

In [12]:
# Print the features that have missing values
def get_null_character(dic_chara):
    """Print every key of ``dic_chara`` whose value is not equal to 0.0.

    ``dic_chara`` maps feature name -> missing-value fraction. Returns None.
    """
    with_missing = (name for name, fraction in dic_chara.items() if fraction != 0.0)
    for name in with_missing:
        print(name)

In [13]:
# Compute the missing-value fraction of every feature, then print the names
# of the features whose fraction is non-zero.
dic_chara = get_total_null_character(name_list)
get_null_character(dic_chara)

## 去掉取值变化小的特征

In [14]:
from sklearn.feature_selection import VarianceThreshold

# Drop near-constant features: keep only columns whose variance exceeds
# 0.8 * (1 - 0.8), i.e. the variance of a 0/1 feature with an 80/20 split.
# NOTE(review): the columns are integer-typed, not necessarily binary -- confirm
# this threshold is appropriate for them.
dominant_share = .8
sel = VarianceThreshold(threshold=dominant_share * (1 - dominant_share))
sel.fit(train_x)
result = sel.transform(train_x)
result.shape

(210000, 17)

In [15]:
# Reuse the selector fitted on the training data. Refitting it on the test set
# would recompute the variances there and could keep a different set of columns
# than the ones the models are trained on.
test_result = sel.transform(test_x)
test_result.shape

(90000, 17)

## 特征选择

### 单个特征的差异性

In [16]:
from sklearn.feature_selection import SelectKBest

In [17]:
from sklearn.feature_selection import chi2

In [18]:
# Keep the 10 features with the highest chi-squared score against the training labels.
kbest = SelectKBest(score_func=chi2, k=10)
X_new = kbest.fit_transform(result, train_y)

In [19]:
# Sanity check: (rows, columns) of the selected training features.
X_new.shape

(210000, 10)

In [20]:
# Same shape check as the previous cell (duplicate; safe to remove).
X_new.shape

(210000, 10)

In [21]:
# Select the test columns with a chi2 selector fitted on the TRAINING data only.
# Fitting on (test_result, test_y) would leak the test labels into feature
# selection and could pick different columns than the ones kept in X_new.
test_X_new = SelectKBest(chi2, k=10).fit(result, train_y).transform(test_result)
test_X_new.shape

(90000, 10)

# 模型选择

## SVM

In [22]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### matching

In [23]:
# Baseline: linear-kernel SVC fitted on the selected training features.
# NOTE(review): scikit-learn documents SVC fit time as at least quadratic in the
# number of samples; with this many rows LinearSVC may be a more practical choice.
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_new, train_y)

In [24]:
# Score the baseline SVC on the held-out test set (arguments in y_true, y_pred order).
y_pred = svm_model.predict(test_X_new)
print(accuracy_score(test_y, y_pred))

0.9957222222222222


### adapting

In [25]:
# SVC search space: an RBF kernel over gamma x C, and a linear kernel over C.
c_values = [1, 10, 100, 1000]
rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': list(c_values)}
linear_grid = {'kernel': ['linear'], 'C': list(c_values)}
tuned_parameters = [rbf_grid, linear_grid]

In [None]:
# Exhaustive search over tuned_parameters with 2-fold cross-validation.
clf = GridSearchCV(estimator=svm.SVC(), param_grid=tuned_parameters, cv=2)
clf.fit(X_new, train_y)

In [None]:
# Best parameter combination found by the search.
print(clf.best_params_)
# Score of the refitted best estimator on the held-out test set.
print(clf.score(test_X_new, test_y))
# Mean cross-validated score of the best combination.
print(clf.best_score_)

In [None]:
# Validate the chosen parameters: refit an RBF SVC with them and score it on the test set
svm_model = svm.SVC(C=1, gamma=1e-3, kernel='rbf')
svm_model.fit(X_new, train_y)
y_pred = svm_model.predict(test_X_new)
rbf_accuracy = accuracy_score(test_y, y_pred)
print(rbf_accuracy)

## GBDT

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline

### matching

In [None]:
# Baseline GBDT: library-default hyper-parameters with a fixed random_state.
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X_new, train_y)

In [None]:
# Score the baseline GBDT on the held-out test set (arguments in y_true, y_pred order).
gbm_predict = gbm0.predict(test_X_new)
print(accuracy_score(test_y, gbm_predict))

### adapting

In [None]:
# Step size (learning_rate) and number of boosting iterations (n_estimators):
# search n_estimators at a fixed learning_rate of 0.1, scored by ROC AUC over 5 folds.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises TypeError); the default since 0.22 matches iid=False.
param_test1 = {'n_estimators': range(20, 81, 10)}
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        min_samples_split=300,
        min_samples_leaf=20,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# 'n_estimators': 50

In [None]:
# Jointly search max_depth and min_samples_split, scored by ROC AUC over 5 folds.
# NOTE(review): n_estimators is fixed at 60 here although the note in the previous
# cell records 50 -- confirm which value is intended.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises TypeError); the default since 0.22 matches iid=False.
param_test2 = {'max_depth': range(3, 11, 2),
               'min_samples_split': range(5, 21, 3)}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=60,
        min_samples_leaf=20,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test2,
    scoring='roc_auc',
    cv=5,
)
gsearch2.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# 'max_depth': 5, 'min_samples_split': 10

In [None]:
# Jointly search min_samples_split and min_samples_leaf, scored by ROC AUC over 5 folds.
# NOTE(review): max_depth is fixed at 7 here although the note in the previous cell
# records max_depth 5 -- confirm which value is intended.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises TypeError); the default since 0.22 matches iid=False.
param_test3 = {'min_samples_split': range(10, 101, 20),
               'min_samples_leaf': range(10, 101, 20)}
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=60,
        max_depth=7,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    cv=5,
)
gsearch3.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# 'min_samples_leaf': 90, 'min_samples_split': 10

In [None]:
# Fit once with the parameters chosen so far and look at the test accuracy
gbm1 = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=90,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
gbm1.fit(X_new, train_y)
y_pred = gbm1.predict(test_X_new)
gbm1_accuracy = metrics.accuracy_score(test_y, y_pred)
print("Accuracy : %.4g" % gbm1_accuracy)

In [None]:
# Grid-search the maximum number of features per split (max_features),
# scored by ROC AUC over 5 folds.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises TypeError); the default since 0.22 matches iid=False.
param_test4 = {'max_features': range(3, 10, 2)}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=50,
        max_depth=7,
        min_samples_leaf=90,
        min_samples_split=10,
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    cv=5,
)
gsearch4.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Grid-search the row sub-sampling ratio (subsample), scored by ROC AUC over 5 folds.
# NOTE(review): min_samples_leaf=60 and min_samples_split=1200 differ from the values
# used in the neighbouring cells (90 / 10) -- confirm they are intended.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises TypeError); the default since 0.22 matches iid=False.
param_test5 = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}
gsearch5 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=60,
        max_depth=7,
        min_samples_leaf=60,
        min_samples_split=1200,
        max_features=9,
        random_state=10,
    ),
    param_grid=param_test5,
    scoring='roc_auc',
    cv=5,
)
gsearch5.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Halve the step size and double the maximum number of iterations
gbm2 = GradientBoostingClassifier(
    n_estimators=120,
    learning_rate=0.05,
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=60,
    max_features=9,
    subsample=0.6,
    random_state=10,
)
gbm2.fit(X_new, train_y)
y_pred = gbm2.predict(test_X_new)
gbm2_accuracy = metrics.accuracy_score(test_y, y_pred)
print("Accuracy : %.4g" % gbm2_accuracy)

In [None]:
# Shrink the step size a further 5x and raise the maximum number of iterations 5x
gbm3 = GradientBoostingClassifier(
    n_estimators=600,
    learning_rate=0.01,
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=60,
    max_features=9,
    subsample=0.7,
    random_state=10,
)
gbm3.fit(X_new, train_y)
y_pred = gbm3.predict(test_X_new)
gbm3_accuracy = metrics.accuracy_score(test_y, y_pred)
print("Accuracy : %.4g" % gbm3_accuracy)

In [None]:
# Halve the step size once more and double the maximum number of iterations
gbm4 = GradientBoostingClassifier(
    n_estimators=1200,
    learning_rate=0.005,
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=60,
    max_features=9,
    subsample=0.7,
    random_state=10,
)
gbm4.fit(X_new, train_y)
y_pred = gbm4.predict(test_X_new)
gbm4_accuracy = metrics.accuracy_score(test_y, y_pred)
print("Accuracy : %.4g" % gbm4_accuracy)

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

### matching

In [None]:
# Baseline random forest with default hyper-parameters. random_state is fixed
# (10, the seed used by the other models in this notebook) so that the baseline
# scores are reproducible across re-runs.
rfc = RandomForestClassifier(random_state=10)
rfc.fit(X_new, train_y)

In [None]:
# Predict on the test set and show the sum of the predicted labels
# (the number of predicted positives, assuming labels are 0/1 -- TODO confirm).
rfc_y_predict = rfc.predict(test_X_new)
rfc_y_predict.sum()

In [None]:
# Test accuracy of the baseline forest (arguments in y_true, y_pred order).
print(accuracy_score(test_y, rfc_y_predict))

In [None]:
# Same metric via the estimator's own score method (mean accuracy for classifiers).
print(rfc.score(test_X_new, test_y))

In [None]:
# Per-class precision / recall / F1 on the test set; classes are displayed as "0" and "1".
print(classification_report(test_y, rfc_y_predict, target_names=["0", "1"]))

### adapting

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics

import matplotlib.pylab as plt
%matplotlib inline

In [None]:
# First grid-search n_estimators to find the best number of weak learners (trees)
param_test1 = {'n_estimators': range(10, 101, 10)}
rf_base = RandomForestClassifier(
    max_depth=8,
    min_samples_split=100,
    min_samples_leaf=20,
    max_features='sqrt',
    random_state=10,
)
gsearch1 = GridSearchCV(
    estimator=rf_base,
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# 'n_estimators': 60

In [None]:
# Grid-search the maximum tree depth (max_depth) together with the minimum number
# of samples needed to split an internal node (min_samples_split); ROC AUC, 5 folds.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises TypeError); the default since 0.22 matches iid=False.
param_test2 = {'max_depth': range(3, 11, 2), 'min_samples_split': range(10, 51, 10)}
gsearch2 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=60,
        min_samples_leaf=20,
        max_features='sqrt',
        oob_score=True,
        random_state=10,
    ),
    param_grid=param_test2,
    scoring='roc_auc',
    cv=5,
)
gsearch2.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Check the out-of-bag score of the model with the parameters chosen so far
rf1 = RandomForestClassifier(
    n_estimators=60,
    max_depth=7,
    min_samples_leaf=20,
    min_samples_split=50,
    max_features='sqrt',
    random_state=10,
    oob_score=True,
)
rf1.fit(X_new, train_y)
rf1_oob = rf1.oob_score_
print(rf1_oob)

In [None]:
# Tune min_samples_split (minimum samples to split an internal node) together with
# min_samples_leaf (minimum samples per leaf); ROC AUC, 5 folds.
# NOTE(review): max_depth is fixed at 13 here while the earlier out-of-bag check
# used max_depth=7 -- confirm which value is intended.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises TypeError); the default since 0.22 matches iid=False.

param_test3 = {'min_samples_split': range(80, 150, 20), 'min_samples_leaf': range(10, 60, 10)}
gsearch3 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=60,
        max_depth=13,
        max_features='sqrt',
        oob_score=True,
        random_state=10,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    cv=5,
)
gsearch3.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Tune the maximum number of features considered per split (max_features); ROC AUC, 5 folds.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises TypeError); the default since 0.22 matches iid=False.
param_test4 = {'max_features': range(3, 11, 2)}
gsearch4 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=60,
        max_depth=13,
        min_samples_split=120,
        min_samples_leaf=20,
        oob_score=True,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    cv=5,
)
gsearch4.fit(X_new, train_y)

In [None]:
# Full per-candidate CV results, the best parameter set, and its mean CV score.
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Fit the final model with the best parameters found by the searches and check its out-of-bag score
rf2 = RandomForestClassifier(
    n_estimators=60,
    max_depth=13,
    min_samples_leaf=20,
    min_samples_split=120,
    max_features=7,
    random_state=10,
    oob_score=True,
)
rf2.fit(X_new, train_y)
rf2_oob = rf2.oob_score_
print(rf2_oob)