In [167]:
import pandas as pd
import numpy as np
import os
from abc import abstractmethod
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn import metrics
# Switch the working directory to the folder holding the Titanic files, then load
# the training CSV relative to it.
# NOTE(review): the path is absolute and machine-specific; consider a configurable
# data directory so the notebook can be re-run elsewhere.
os.chdir('D:/工作/data/titanic')
df = pd.read_csv('train.csv')

In [215]:
class PreprocessME:
    '''Base class for feature preprocessing.

    Splits a DataFrame into categorical and numeric columns, offers
    missing-value helpers, and delegates the actual encoding to the
    ``trainpreprocess`` / ``testpreprocess`` hooks that subclasses provide.
    '''

    def fillcat(self, X):
        '''Return a copy of X with missing values in categorical columns filled.

        Object columns are filled with the string 'None'. Any other column is
        filled with ``ceil(column max) + 1``, i.e. a value above everything
        observed in that column of the frame passed in.
        '''
        df_cat = X.copy()
        for feature in df_cat.columns:
            if df_cat[feature].dtype == 'object':
                df_cat[feature] = df_cat[feature].fillna('None')
            else:
                # The sentinel is derived from the frame being filled, not from
                # any previously seen data.
                df_cat[feature] = df_cat[feature].fillna(np.ceil(df_cat[feature].max())+1)
        return df_cat

    def fillnum(self, X, method='mean'):
        '''Return a copy of X with missing numeric values filled.

        ``method`` selects the per-column statistic: 'mean', 'median' or
        'mode' (first mode). Any other value leaves the copy unfilled.
        '''
        X = X.copy()
        if method == 'mean':
            X = X.fillna(X.mean())
        elif method == 'median':
            X = X.fillna(X.median())
        elif method == 'mode':
            X = X.fillna(X.mode().iloc[0])
        return X

    @abstractmethod
    def trainpreprocess(self, df_cat, df_num, **kwargs):
        '''Feature processing for the training set; implemented by subclasses.'''
        pass

    @abstractmethod
    def testpreprocess(self, df_cat, df_num, **kwargs):
        '''Feature processing for the test set; implemented by subclasses.'''
        pass

    def select_cat(self, X, columns=None):
        '''Return the names of the categorical columns of X as a pandas Index.

        Every object-dtype column counts as categorical; ``columns`` may name
        additional columns to treat as categorical as well.
        '''
        cat = X.select_dtypes(include='object').columns
        if columns is not None:
            # Use the explicit set operation: ``Index | other`` is deprecated as a
            # union and is a logical OR in newer pandas releases.
            cat = cat.union(pd.Index(list(columns)))
        return cat

    def delete_columns(self, X, columns=None):
        '''Return a copy of X without ``columns``; those features are excluded
        from later processing and modelling.'''
        result = X.copy()
        if columns is not None:
            result = result.drop(columns, axis=1)
        return result

    def get_cat_num(self, X, drop_col=None, cat_col=None):
        '''Split the column names of X into numeric and categorical groups.

        Columns in ``drop_col`` are removed first. Object columns are
        categorical by default and ``cat_col`` can move further columns into
        the categorical group; everything left over is numeric.

        Returns ``(num_col, cat_col)``.
        '''
        result = self.delete_columns(X, drop_col)
        cat_col = self.select_cat(result, cat_col)
        num_col = result.columns.difference(cat_col)
        return num_col, cat_col

    def get_train_data(self, X, y, drop_col=None, cat_col=None):
        '''Determine the column split on X, remember it on the instance and
        return the result of ``trainpreprocess``.

        ``y`` is accepted for interface symmetry but is not used here.
        '''
        self.num_col, self.cat_col = self.get_cat_num(X, drop_col, cat_col)
        df_num, df_cat = X.loc[:, self.num_col], X.loc[:, self.cat_col]
        return self.trainpreprocess(df_cat, df_num)

    def get_test_data(self, X):
        '''Apply the column split remembered by ``get_train_data`` to X and
        return the result of ``testpreprocess``.

        Raises ValueError if ``get_train_data`` has not been called yet.
        '''
        if not (hasattr(self, 'num_col') and hasattr(self, 'cat_col')):
            raise ValueError('模型没有训练，不能用于测试！')
        df_cat = X.loc[:, self.cat_col].copy()
        df_num = X.loc[:, self.num_col].copy()
        return self.testpreprocess(df_cat, df_num)


class LGBPreprocess(PreprocessME):
    '''Label-encode the categorical columns and pass numeric columns through.'''

    def labelenc(self, df_cat):
        '''Fit one LabelEncoder per categorical column.

        Missing values are filled first. Returns ``(encoders, encoded)`` where
        ``encoders`` maps column name to its fitted encoder and ``encoded`` is
        the filled frame with every column replaced by its integer codes.
        '''
        filled = self.fillcat(df_cat)
        encoded = filled.copy()
        encoders = {}
        for name in filled.columns:
            encoder = LabelEncoder()
            encoder.fit(filled[name])
            encoders[name] = encoder
            encoded[name] = encoder.transform(filled[name])
        return encoders, encoded

    def trainpreprocess(self, df_cat, df_num):
        '''Fit the encoders on the training categoricals, keep them on the
        instance and return encoded categoricals joined with the numerics.'''
        self.encs, encoded = self.labelenc(df_cat)
        return pd.concat([encoded, df_num], axis=1)

    def testpreprocess(self, df_cat, df_num):
        '''Encode test categoricals with the encoders stored by
        ``trainpreprocess`` and join them with the numerics.'''
        filled = self.fillcat(df_cat)
        encoded = filled.copy()
        assert hasattr(self, 'encs')
        for name, encoder in self.encs.items():
            encoded[name] = encoder.transform(filled[name])
        return pd.concat([encoded, df_num], axis=1)


class LGBOnehot(PreprocessME):
    '''One-hot encode the categorical columns and pass numeric columns through.'''

    def onehotenc(self, df_cat):
        '''Fit one OneHotEncoder per categorical column.

        Missing values are filled first. Categories unseen at fit time are
        ignored at transform time (``handle_unknown='ignore'``).

        Returns a dict mapping column name to its fitted encoder.
        '''
        df_cat = self.fillcat(df_cat)
        encs = {}
        for feature in df_cat.columns:
            enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
            enc.fit(df_cat.loc[:, [feature]])
            encs[feature] = enc
        return encs

    def onehottrans(self, df_cat, encs):
        '''Transform ``df_cat`` with the fitted encoders in ``encs``.

        Returns a DataFrame indexed like ``df_cat`` whose columns are named
        ``<feature>_<category>``, one per category learned for each feature.
        '''
        df_cat = self.fillcat(df_cat)
        pieces = []
        for feature in df_cat.columns:
            enc = encs[feature]
            data = enc.transform(df_cat.loc[:, [feature]]).toarray()
            # Derive the names straight from the fitted categories. This avoids
            # the un-imported ``re``/``partial`` helpers, does not depend on
            # ``get_feature_names`` (absent from recent scikit-learn), and cannot
            # rewrite an 'x0' that happens to occur inside a category value.
            columns = ['{}_{}'.format(feature, category)
                       for category in enc.categories_[0]]
            pieces.append(pd.DataFrame(data, columns=columns, index=df_cat.index))
        if not pieces:
            # No categorical columns: pd.concat([]) would raise.
            return pd.DataFrame(index=df_cat.index)
        return pd.concat(pieces, axis=1)

    def trainpreprocess(self, df_cat, df_num):
        '''Fit the encoders on the training categoricals, keep them on the
        instance and return the one-hot columns joined with the numerics.'''
        self.encs = self.onehotenc(df_cat)
        result = self.onehottrans(df_cat, self.encs)
        return pd.concat([result, df_num], axis=1)

    def testpreprocess(self, df_cat, df_num):
        '''Encode test categoricals with the encoders stored by
        ``trainpreprocess`` and join them with the numerics.'''
        assert hasattr(self, 'encs')
        # onehottrans fills missing values itself, so no separate fill is needed.
        result = self.onehottrans(df_cat, self.encs)
        return pd.concat([result, df_num], axis=1)


In [169]:
# Columns excluded from feature processing (passed as ``drop_col``).
drop = ['Name', 'Ticket', 'PassengerId']

In [216]:
# Preprocessor that one-hot encodes categorical features.
lgpre = LGBOnehot()

In [170]:
# Split the target off the feature frame. ``pop`` removes the column from ``df``
# in place, so the guard keeps this cell safe to run more than once.
if 'Survived' in df.columns:
    y = df.pop('Survived')

In [220]:
# Fit the preprocessor on the training frame: the columns in ``drop`` are excluded
# and Pclass / SibSp are treated as categorical in addition to the object columns.
Z = lgpre.get_train_data(df,y,drop_col=drop,cat_col=['Pclass','SibSp'])

In [223]:
# Show the categorical columns after missing-value filling.
lgpre.fillcat(df.loc[:,lgpre.cat_col])

Unnamed: 0,Cabin,Embarked,Pclass,Sex,SibSp
0,,S,3,male,1
1,C85,C,1,female,1
2,,S,3,female,0
3,C123,S,1,female,1
4,,S,3,male,0
...,...,...,...,...,...
886,,S,2,male,0
887,B42,S,1,female,0
888,,S,3,female,1
889,C148,C,1,male,0


In [10]:
# Names of categorical columns.
# NOTE(review): not referenced by any later cell visible here.
cat = ['Sex', 'Cabin', 'Embarked']

In [71]:
import lightgbm as lgb
from sklearn.metrics import roc_curve
from sklearn import clone

In [147]:
# LightGBM classifier using gradient-boosted decision trees.
est = lgb.LGBMClassifier(boosting_type='gbdt')

In [35]:
# Class probabilities using 50 boosting iterations. The keyword ``pred_contrib``
# is left at its default so plain probabilities are returned (a value-less
# ``pred_contrib=`` does not parse).
# NOTE(review): ``X`` and the fitting of ``est`` are not defined in the visible cells.
ypred = est.predict_proba(X, num_iteration=50)

In [17]:
from sklearn.metrics import roc_auc_score

In [36]:
# ROC AUC of the second probability column (index 1) against the target.
# NOTE(review): if ``X`` is the training data this is an in-sample score — confirm.
roc_auc_score(y, ypred[:,1])

0.9751461988304093