In [1]:
import pandas as pd
import numpy as np
import os
from abc import abstractmethod
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn import metrics
os.chdir('D:/工作/data/titanic')
df = pd.read_csv('train.csv')

In [14]:
class PreprocessME:
    '''特征处理,对类别特征进行编码，使得可以进行后续建模'''

    @abstractmethod
    def trainpreprocess(self, df_cat, df_num, **kwargs):
        '''训练集的特征处理方式，子类实现。'''
        pass
    
    @abstractmethod
    def testpreprocess(self, df_cat, df_num, **kwargs):
        '''测试集的特征处理方式，子类实现'''
        pass

    def select_cat(self, X, columns=None):
        '''选取类别特征，默认所有的object类型的特征为类别，也可以自定义一些特征
        作为类别特征。'''
        result = X.select_dtypes(include='object')
        cat = result.columns
        if columns is not None:
            cat = cat | set(columns)
        return cat

    def delete_columns(self, X, columns=None):
        '''删除不满足条件的特征，这些特征将不用于后续的处理和建模。'''
        result = X.copy()
        if columns is not None:
            result = result.drop(columns, axis=1)
        return result
    
    def get_cat_num(self, X, drop_col=None, cat_col=None):
        '''将数据集的特征分为数值特征和类别特征，在不同的模型中采用不同的处理方式，
        默认情况下所有的object类型都为类别特征，所有int和float类型为数值特征，用户
        可以自身知识设置cat_col将一部分数值特征加入到类别特征中'''
        result = self.delete_columns(X, drop_col)
        cat_col = self.select_cat(result, cat_col)
        num_col = result.columns.difference(cat_col)
        return num_col, cat_col
    
    def get_train_data(self, X, y, drop_col=None, cat_col=None):
        self.num_col, self.cat_col = self.get_cat_num(X, drop_col, cat_col)
        df_num, df_cat = X.loc[:, self.num_col], X.loc[:, self.cat_col]
        Xtrain = self.trainpreprocess(df_cat, df_num)
        return Xtrain

    def get_test_data(self, X):
        '''得到最终用于建模的数据。'''
        if hasattr(self, 'num_col') and hasattr(self, 'cat_col'):
            df_cat = X.loc[:, self.cat_col].copy()
            df_num = X.loc[:, self.num_col].copy()
            Xtrain = self.testpreprocess(df_cat, df_num)
            return Xtrain
        else:
            raise ValueError('模型没有训练，不能用于测试！')


class ProcessMethod:

    def fillcat(self, X):   # 使用None填补类别特征的缺失值。
        df_cat = X.copy()
        for feature in df_cat.columns:
            if df_cat[feature].dtype == 'object':
                df_cat[feature] = df_cat[feature].fillna('None')
            else:
                df_cat[feature] = df_cat[feature].fillna(np.ceil(df_cat[feature].max())+1)
        return df_cat

    def fillnum(self, X, method='mean'):  # 填补数值特征的缺失值。
        X = X.copy()
        if method == 'mean':
            X = X.fillna(X.mean())
        elif method == 'median':
            X = X.fillna(X.median())
        elif method == 'mode':
            X = X.fillna(X.mode().iloc[0])
        return X

    def labelenc(self, df_cat):
        '''自然数编码。'''
        df_cat = self.fillcat(df_cat)
        result = df_cat.copy()
        encs = {}
        for feature in df_cat.columns:
            enc = LabelEncoder()
            enc.fit(df_cat[feature])
            encs[feature] = enc
            result[feature] = enc.transform(df_cat[feature])
        return encs, result
    
    def onehotenc(self, df_cat):
        df_cat = self.fillcat(df_cat)
        encs = {}
        for feature in df_cat.columns:
            enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
            enc.fit(df_cat.loc[:,[feature]])
            encs[feature] = enc
        return encs

    def onehottrans(self, df_cat, encs):
        df_cat = self.fillcat(df_cat)
        result = []
        for feature in df_cat.columns:
            enc = encs[feature]
            data = enc.transform(df_cat.loc[:,[feature]]).toarray()
            columns = enc.get_feature_names()
            func = partial(re.sub, 'x0', feature)
            columns = list(map(func, columns))
            res = pd.DataFrame(data, columns=columns, index=df_cat.index)
            result.append(res)
        result = pd.concat(result, axis=1)
        return result

    
class LGBPreprocess(PreprocessME, ProcessMethod):

    def trainpreprocess(self, df_cat, df_num):
        self.encs, result = self.labelenc(df_cat)
        result = pd.concat([result, df_num], axis=1)
        return result

    def testpreprocess(self, df_cat, df_num):
        df_cat = self.fillcat(df_cat)
        cat = df_cat.copy()
        assert hasattr(self, 'encs')
        for feature in self.encs.keys():
            cat[feature] = self.encs[feature].transform(df_cat[feature])
        result = pd.concat([cat, df_num], axis=1)
        return result


class LGBOnehot(PreprocessME, ProcessMethod):

    def trainpreprocess(self, df_cat, df_num):
        self.encs = self.onehotenc(df_cat)
        result = self.onehottrans(df_cat, self.encs)
        result = pd.concat([result, df_num], axis=1)
        return result

    def testpreprocess(self, df_cat, df_num):
        assert hasattr(self, 'encs')
        result = self.onehottrans(df_cat, self.encs)
        result = pd.concat([result, df_num], axis=1)
        return result

In [3]:
drop =['Name','Ticket','PassengerId']

In [15]:
lgpre = LGBOnehot()

In [5]:
y = df.pop('Survived')

In [18]:
from functools import partial 
import re
Z = lgpre.get_train_data(df,y,drop_col=drop)

In [96]:
est = xgb.XGBClassifier()
params = est.get_xgb_params()

In [97]:
params.pop('n_estimators')
params['eval_metric'] = 'auc'

In [98]:
data = xgb.DMatrix(Z, y)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [99]:
def get_label(data):
    label = data.get_label()
    return pd.Series(label) 
def _ks(y, preds):
    fpr, tpr, _ = roc_curve(y, preds)
    return np.abs(fpr-tpr).max()
def ks(preds, data):
    y = get_label(data)
    res = _ks(y, preds)
    return ('KS', res)
dic = {}
res = xgb.train(params,data,feval=ks,evals=[(data,'train')],
                evals_result=dic)

[0]	train-auc:0.858656	train-KS:0.60929
[1]	train-auc:0.862874	train-KS:0.629134
[2]	train-auc:0.863002	train-KS:0.629134
[3]	train-auc:0.865889	train-KS:0.629134
[4]	train-auc:0.866951	train-KS:0.631004
[5]	train-auc:0.866781	train-KS:0.631004
[6]	train-auc:0.868472	train-KS:0.631004
[7]	train-auc:0.868663	train-KS:0.631004
[8]	train-auc:0.877782	train-KS:0.649986
[9]	train-auc:0.878359	train-KS:0.649986


In [100]:
res.predict(Z,ntree_limit=6)

AttributeError: 'DataFrame' object has no attribute 'feature_names'

In [80]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [75]:
temp['KS'].idxmax()

8

In [78]:
temp.idxmax().to_dict()

{'auc': 9, 'KS': 8}

In [82]:
est2 = lgb.LGBMClassifier()
params = est2.get_params()

In [83]:
data = lgb.Dataset(Z, y)
params.pop('n_estimators')
params['metric'] = 'auc'

In [85]:
res = {}
result = lgb.train(params, data, num_boost_round=200,
        valid_sets=[data], valid_names=['train'], verbose_eval=False, evals_result=res)

In [21]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_curve
from sklearn import clone

In [38]:
est = lgb.LGBMClassifier()

In [39]:
est.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}