In [3]:
import pandas as pd
import numpy as np
import os
from abc import abstractmethod
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn import metrics
# Switch the working directory so the relative CSV path below resolves.
# NOTE(review): hard-coded absolute path; presumably machine-specific — confirm
# before running elsewhere.
os.chdir('D:/工作/data/titanic')
# Load the training data into a DataFrame.
df = pd.read_csv('train.csv')

In [26]:
class PreprocessME:
    """Base class for feature preprocessing.

    Splits a DataFrame into categorical and numeric columns and hands the two
    parts to ``trainpreprocess`` / ``testpreprocess``, which subclasses are
    expected to override.

    NOTE(review): the class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` markers are not enforced at instantiation time.
    """

    @abstractmethod
    def trainpreprocess(self, df_cat, df_num, **kwargs):
        """Preprocess training features; to be overridden by subclasses."""
        pass

    @abstractmethod
    def testpreprocess(self, df_cat, df_num, **kwargs):
        """Preprocess test features; to be overridden by subclasses."""
        pass

    def fillcat(self, X):
        """Return a copy of ``X`` with missing values replaced by ``'None'``.

        The input frame is never modified.
        """
        df_cat = X.copy()
        if df_cat.isnull().any().any():
            df_cat = df_cat.fillna('None')
        return df_cat

    def set_params(self, **kwargs):
        """Forward user-supplied parameters to the wrapped estimator.

        Relies on ``self.est`` having been assigned elsewhere; this class
        itself never sets it.
        """
        self.est.set_params(**kwargs)

    def select_cat(self, X, columns=None):
        """Return the names of the categorical features of ``X``.

        Every ``object``-dtype column counts as categorical. ``columns`` may
        name additional columns to treat as categorical as well.

        Returns:
            A pandas ``Index`` of column names.
        """
        cat = X.select_dtypes(include='object').columns
        if columns is not None:
            # Use an explicit set union. The ``|`` operator on an Index was a
            # deprecated alias for union and is an element-wise logical
            # operation in pandas >= 2.0. Duplicates are removed first so the
            # result matches the previous ``set(columns)`` semantics.
            cat = cat.union(list(dict.fromkeys(columns)))
        return cat

    def delete_columns(self, X, columns=None):
        """Return a copy of ``X`` without ``columns``.

        Dropped columns take no part in later processing or modelling. With
        ``columns=None`` an unmodified copy is returned.
        """
        result = X.copy()
        if columns is not None:
            result = result.drop(columns, axis=1)
        return result

    def get_cat_num(self, X, drop_col=None, cat_col=None):
        """Partition the column names of ``X`` into numeric and categorical.

        ``drop_col`` columns are removed first. Of the remainder, the
        ``object``-dtype columns plus any names in ``cat_col`` are categorical
        and everything else is numeric.

        Returns:
            ``(num_col, cat_col)`` as two pandas ``Index`` objects.
        """
        result = self.delete_columns(X, drop_col)
        cat_col = self.select_cat(result, cat_col)
        num_col = result.columns.difference(cat_col)
        return num_col, cat_col

    def get_train(self, X, y=None, drop_col=None, cat_col=None):
        """Build the final feature matrix used for modelling.

        ``y`` only acts as a mode switch. When it is given, the column split
        is computed, stored on the instance and ``trainpreprocess`` is called.
        When it is ``None``, the stored split is reused and ``testpreprocess``
        is called.

        Raises:
            ValueError: if called in test mode before any training call.
        """
        if y is not None:
            self.num_col, self.cat_col = self.get_cat_num(X, drop_col, cat_col)
            df_num, df_cat = X.loc[:, self.num_col], X.loc[:, self.cat_col]
            return self.trainpreprocess(df_cat, df_num)

        # Test mode: the column split must come from an earlier training call.
        if not (hasattr(self, 'num_col') and hasattr(self, 'cat_col')):
            raise ValueError('模型没有训练，不能用于测试！')
        df_cat = X.loc[:, self.cat_col].copy()
        df_num = X.loc[:, self.num_col].copy()
        return self.testpreprocess(df_cat, df_num)

class LGBPreprocess(PreprocessME):
    """Preprocessor that label-encodes categorical columns.

    Numeric columns are passed through untouched and appended after the
    encoded categorical columns.
    """

    def labelenc(self, df_cat):
        """Fit one ``LabelEncoder`` per column of ``df_cat`` and encode it.

        Missing values are filled via ``fillcat`` before fitting.

        Returns:
            ``(encoders, encoded)``: a dict mapping column name to its fitted
            encoder, and the encoded copy of the frame.
        """
        filled = self.fillcat(df_cat)
        encoded = filled.copy()
        encoders = {}
        for name in filled.columns:
            encoder = LabelEncoder().fit(filled[name])
            encoded[name] = encoder.transform(filled[name])
            encoders[name] = encoder
        return encoders, encoded

    def trainpreprocess(self, df_cat, df_num):
        """Fit the encoders on ``df_cat``, store them and return the features."""
        encoders, encoded = self.labelenc(df_cat)
        self.encs = encoders
        return pd.concat([encoded, df_num], axis=1)

    def testpreprocess(self, df_cat, df_num):
        """Encode ``df_cat`` with the stored encoders and return the features.

        Requires ``trainpreprocess`` to have populated ``self.encs``.
        """
        filled = self.fillcat(df_cat)
        encoded = filled.copy()
        assert hasattr(self, 'encs')
        for name, encoder in self.encs.items():
            encoded[name] = encoder.transform(filled[name])
        return pd.concat([encoded, df_num], axis=1)

In [41]:
import xgboost as xgb

In [42]:
# Create an XGBoost classifier without passing any constructor arguments.
est = xgb.XGBClassifier()

In [43]:
# Set n_estimators to 200 on the existing estimator.
est.set_params(n_estimators=200)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [46]:
# Scratch check: looking up a key that was never inserted raises KeyError.
a = {}
a['a']

KeyError: 'a'