# 导入模块

In [2]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from toad.detector import detect
from toad.transform import Combiner
from toad.metrics import PSI, AUC, KS_bucket
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier

warnings.filterwarnings("ignore")
pd.set_option("display.width", 10000)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
%matplotlib inline

# 加载数据

In [17]:
data = np.load("../data/phase1_gdata.npz")
features = pd.DataFrame(data["x"], columns=['x{}'.format(i) for i in range(1, 18)])
features["y"] = data['y']

# 数据预处理

## 提取训练集与测试集

In [35]:
colnames = ['mask', 'y', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17']

train = features[features["y"].isin([0, 1])]
train['mask'] = data['train_mask']
train = train[colnames]
train.index = range(len(train))

test = features[features["y"].isin([-100])]
test['mask'] = data['test_mask']
test = test[colnames]
test.index = range(len(test))

## 查看数据

### 训练集

In [36]:
train.head()

Unnamed: 0,mask,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17
0,452420,0,0.0,2.0,0.68,0.505,0.742647,0.7,0.319,0.015,0.281,0.880878,1.0,0.257353,0.119122,0.069307,0.857143,0.007353,0.142857
1,1935260,0,0.0,2.0,0.55,0.485,0.881818,0.5,0.326,0.045,0.313,0.960123,0.0,0.118182,0.039877,0.051546,0.6,-1.0,-1.0
2,1210972,0,0.0,2.0,0.055,0.055,1.0,-1.0,0.076,-1.0,0.076,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,2719604,0,0.0,0.0,2.835,2.53,0.892416,1.7,4.316,0.108,4.234,0.981001,2.0,0.107584,0.018999,0.033597,0.764706,0.003527,0.117647
4,1162821,0,0.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### 测试集

In [37]:
test.head()

Unnamed: 0,mask,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17
0,3804756,-100,0.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,690432,-100,0.0,3.0,3.16,3.02,0.955696,1.9,2.713,0.09,2.674,0.985625,2.0,0.044304,0.014375,0.031457,0.789474,0.003165,0.105263
2,1250623,-100,0.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,3555130,-100,0.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,1228284,-100,1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## 基本属性

# 特征工程