说明：这个主题有意思的是，只有250个训练数据，而测试集差不多有20000个，需要从这250个数据中得到一个不过拟合模型来进行预测，这是很有挑战的。

总共有300个特征，我们可以先来看下数据的具体情况！

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def loaddata(file, train=True):
    if train:
        data_train = pd.read_csv(file)
        X_train = data_train[data_train.columns[2:]]
        y_train = data_train[data_train.columns[1]]
        return X_train, y_train
    else:
        data_test = pd.read_csv(file)
        X_test = data_test[data_test.columns[1:]]
        return X_test

In [3]:
X_train, y_train = loaddata('./数据集/train.csv')
X_test = loaddata('./数据集/test.csv', train=False)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)

In [4]:
X_train.shape

(250, 300)

In [None]:
y_train.shape

(250,)

In [None]:
X_test.shape

(19750, 300)

这些数据，其实都已经做了标准化处理，这边关系的就是通过何种技巧来使得得到的模型不过拟合！！

我们先使用简单的逻辑回归来试试~

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
Xtrain, Xval, ytrain, yval = train_test_split(X_train, y_train, test_size=0.25)

In [None]:
lr = LogisticRegression(solver='lbfgs')
gscv = GridSearchCV(lr, {'tol': [1e-4, 1e-5, 1e-6],
                         'C': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
                         'random_state': range(101),
                         'max_iter': [100, 200, 500, 1000, 2000, 5000]}, cv=10, n_jobs=-1, iid=False)
gscv.fit(Xtrain, ytrain)
lr = LogisticRegression(solver='lbfgs', tol=gscv.best_params_['tol'], C=gscv.best_params_['C'],
                        random_state=gscv.best_params_['random_state'], max_iter=gscv.best_params_['max_iter'])
lr.fit(Xtrain, ytrain)
ypred = lr.predict(Xval)
roc_auc_score(yval, ypred)

0.7227564102564102

再来看下SVC

In [None]:
from sklearn.svm import SVC

In [None]:
svm_c = SVC()
gscv = GridSearchCV(svm_c, {'C': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
                            'kernel': ['linear', 'rbf'],
                            'tol': [1e-3, 1e-4, 1e-5, 1e-6],
                            'random_state': range(101)}, cv=10, n_jobs=-1, iid=False)
gscv.fit(Xtrain, ytrain)
svm_c = SVC(C=gscv.best_params_['C'], kernel=gscv.best_params_['kernel'],
            tol=gscv.best_params_['tol'], random_state=gscv.best_params_['random_state'])
svm_c.fit(Xtrain, ytrain)
ypred = svm_c.predict(Xval)
roc_auc_score(yval, ypred)

再用一下随机森林

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
rfc = RandomForestClassifier()
gscv = GridSearchCV(rfc, {'n_estimators': [50, 80, 100, 200],
                          'max_depth': [3, 10, 30, 50],
                          'min_samples_split': [2, 5, 10],
                          'min_samples_leaf': [1, 2, 5],
                          'random_state': range(101)}, cv=10, n_jobs=-1, iid=False)
gscv.fit(Xtrain, ytrain)
rfc = RandomForestClassifier(n_estimators=gscv.best_params_['n_estimators'],
                             max_depth=gscv.best_params_['max_depth'],
                             min_samples_split=gscv.best_params_['min_samples_split'],
                             min_samples_leaf=gscv.best_params_['min_samples_leaf'],
                             random_state=gscv.best_params_['random_state'])
rfc.fit(Xtrain, ytrain)
ypred = rfc.predict(Xval)
roc_auc_score(yval, ypred)

In [None]:
gbc = GradientBoostingClassifier()
gscv = GridSearchCV(gbc, {'n_estimators': [50, 80, 100, 200],
                          'max_depth': [3, 5, 10, 30, 50],
                          'learning_rate': [0.05, 0.1, 0.2],
                          'min_samples_split': [2, 5, 10],
                          'min_samples_leaf': [1, 2, 5],
                          'subsample': [0.6, 0.8, 1.0],
                          'random_state': range(101)}, cv=10, n_jobs=-1, iid=False)
gscv.fit(Xtrain, ytrain)
gbc = GradientBoostingClassifier(loss='exponential',
                                 n_estimators=gscv.best_params_['n_estimators'],
                                 max_depth=gscv.best_params_['max_depth'],
                                 learning_rate=gscv.best_params_['learning_rate'],
                                 min_samples_split=gscv.best_params_['min_samples_split'],
                                 min_samples_leaf=gscv.best_params_['min_samples_leaf'],
                                 subsample=gscv.best_params_['subsample'],
                                 random_state=gscv.best_params_['random_state'])
gbc.fit(Xtrain, ytrain)
ypred = gbc.predict(Xval)
roc_auc_score(yval, ypred)

最后，再构造以下DNN来试一下，看看有什么不同没

In [None]:
from keras import models, layers, regularizers, initializers, callbacks

In [None]:
max_val_acc = []
max_auc_score = []
params = []

early_stopping = callbacks.EarlyStopping(patience=50) 

for hidden_size in range(32, 512, 32):
    for activation in ['relu', 'elu', 'selu']:
        for l1 in [0.001, 0.005, 0.01, 0.02, 0.05, 0.1]:
            for l2 in [0.001, 0.01, 0.02, 0.05, 0.1, 0.15, 0.2]:
                for epochs in range(10, 200, 10):
                    for batch_size in range(10, 50, 10):
                        model = models.Sequential()
                        model.add(layers.Dense(hidden_size, activation=activation, kernel_regularizer=regularizers.l1_l2(l1=l1, l2=l2)))
                        model.add(layers.Dense(1, activation='sigmoid'))

                        model.compile(optimizer='adam', metrics=['accuracy'], loss='binary_crossentropy')
                        history = model.fit(Xtrain, ytrain, epochs=epochs, batch_size=batch_size, validation_data=(Xval, yval), verbose=0, callbacks=[early_stopping])
                        ypred = model.predict(Xval)
                                            
                        max_val_acc.append(max(history.history['val_acc']))
                        max_auc_score.append(roc_auc_score(yval, ypred))
                        params.append([hidden_size, activation, l1, l2, epochs, batch_size])

In [None]:
max_val_acc

In [None]:
max_auc_score

In [None]:
max(max_auc_score)

In [None]:
best_params = params[np.argmax(max_auc_score)]

model = models.Sequential()
model.add(layers.Dense(best_params[0], activation=best_params[1], kernel_regularizer=regularizers.l1_l2(l1=best_params[2], l2=best_params[3])))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', metrics=['accuracy'], loss='binary_crossentropy')
history = model.fit(Xtrain, ytrain, epochs=best_params[4], batch_size=best_params[5], validation_data=(Xval, yval), callbacks=[early_stopping])
ypred = model.predict(Xval)
roc_auc_score(yval, ypred)