In [14]:
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, _fit_transform_one, _transform_one
from sklearn.preprocessing import OneHotEncoder, Imputer
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
from sklearn.externals.joblib import Parallel, delayed
from scipy import sparse
import numpy as np

os.chdir(r'C:\Users\naimesh.chaudhari\Downloads')
df  = pd.read_csv('previous_application.csv')

In [111]:
class ColumnExtractor(TransformerMixin):
    def __init__(self, typ = "num",group = "" ):
        self.typ = typ
        self.group = group
    def fit(self, X, y=None):
        return self
    def transform (self,X):
        cat  = []
        num = []
        Xs  = X
        for col in X.columns:
            if Xs[col].dtypes == 'object':
                cat.append(col)
            else:
                num.append(col)
        if self.group != "":
            if self.group in num:
                cat.append(self.group)
            else:
                num.append(self.group)
        if self.typ == 'num':
            final = Xs[num]
        else:
            final = Xs[cat]
        return final
    

In [16]:
class NumericAggeregator(TransformerMixin):
    def __init__(self,group="",drop=""):
        self.group = group
        self.drop = drop
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        Xs = X
        if self.drop in Xs.columns:
            Xs = Xs.drop(self.drop,axis = 1)
        Xs = Xs.groupby(self.group).agg([np.sum, np.mean,max,min])  
        Xs.columns = Xs.columns.map('_'.join)
        return Xs
   

In [17]:
#might need an encoder class before an aggeregator class
class CatAggeregator(TransformerMixin):
    def __init__(self, group="",drop=""):
        self.group = group
        self.drop = drop
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        Xs  = X
        if self.drop in Xs.columns:
            Xs.drop(self.drop,axis = 1, inplace= True)
        Xs = Xs.groupby(self.group).agg([np.mean])  
        Xs.columns = Xs.columns.map('_'.join)
        return Xs

In [18]:

class PandasFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(
                transformer=trans,
                X=X,
                y=y,
                weight=weight,
                **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(
                transformer=trans,
                X=X,
                y=None,
                weight=weight)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [19]:
class dfSimpleImputer(TransformerMixin):
    def __init__(self, missing_values=np.nan,strategy='constant'):
        self.missing_values = missing_values
        self.strategy  = strategy
    def fit(self, X):
        self.imp = SimpleImputer(missing_values=np.nan, strategy='constant').fit(X)
        return self
    def transform(self , X):
        dat = self.imp.transform(X)
        final = pd.DataFrame(dat, columns = X.columns)
        return final
    def fit_transform(self, X, y= None):
        self.imp = SimpleImputer(missing_values=np.nan, strategy='constant').fit(X)
        dat = self.imp.transform(X)
        final = pd.DataFrame(dat, columns = X.columns)
        return final
        

In [94]:
class dfOneHotEncoder(TransformerMixin):
    def fit(self, X):
        self.imp = OneHotEncoder(handle_unknown = 'ignore').fit(X)
        return self
    
    def transform(self , X):
        dat = self.imp.transform(X)
        final = pd.DataFrame(dat, columns = self.imp.get_feature_names())
        return final
    
    def fit_transform(self, X, y= None ):
        self.imp = OneHotEncoder(handle_unknown = 'ignore').fit(X)
        dat = self.imp.transform(X).toarray()
        final = pd.DataFrame(dat, columns = self.imp.get_feature_names())
        return final
        

In [140]:
class dfOneHotEncoder(TransformerMixin):
    def fit(self, X):
        self.imp = ColumnTransformer([('tst',OneHotEncoder(handle_unknown = 'ignore'),[0:15])
                                           , remainder='passthrough']).fit(X)
        return self
    
    def transform(self , X):
        dat = self.imp.transform(X).toarray()
        final = pd.DataFrame(dat, columns = self.imp.get_feature_names())
        return final
    
    def fit_transform(self, X, y= None ):
        self.imp = OneHotEncoder(handle_unknown = 'ignore').fit(X)
        dat = self.imp.transform(X).toarray()
        final = pd.DataFrame(dat, columns = self.imp.get_feature_names())
        return final
        

SyntaxError: invalid syntax (<ipython-input-140-5af1c479e3e0>, line 3)

In [143]:
np.array(0,15)

TypeError: data type not understood

In [113]:
#Pipeline for previous Application
df = pd.read_csv('previous_application.csv')
Npipe_prevapp = Pipeline([('cl,', ColumnExtractor(typ  = 'num',group ='SK_ID_CURR')),
                          ('imp',dfSimpleImputer(missing_values=np.nan, strategy='constant')),
                          ('agg', NumericAggeregator(group = 'SK_ID_CURR',drop = 'SK_ID_PREV'))])

Cpipe_prevapp = Pipeline([('cl,', ColumnExtractor(typ  = 'cat', group ='SK_ID_CURR' )),
                          #('imp',dfSimpleImputer(missing_values=np.nan, strategy='constant')),
                          ('agg', CatAggeregator(group = 'SK_ID_CURR'))])

combined_features_prevapp = PandasFeatureUnion([("num", Npipe_prevapp),
                                                ("cat", Cpipe_prevapp)])

#Npipe_prevapp.fit_transform(df)

#agg_prev_app = combined_features_prevapp.fit_transform(df)
#agg_p

In [39]:
tst = ColumnExtractor(typ  = 'num', group ='SK_ID_CURR' ).fit_transform(df)
tst2 = dfSimpleImputer(missing_values=np.nan, strategy='constant').fit_transform(tst)
NumericAggeregator(group = 'SK_ID_CURR',drop = 'SK_ID_PREV').fit_transform(tst2)

Unnamed: 0_level_0,AMT_ANNUITY_sum,AMT_ANNUITY_mean,AMT_ANNUITY_max,AMT_ANNUITY_min,AMT_APPLICATION_sum,AMT_APPLICATION_mean,AMT_APPLICATION_max,AMT_APPLICATION_min,AMT_CREDIT_sum,AMT_CREDIT_mean,...,DAYS_LAST_DUE_max,DAYS_LAST_DUE_min,DAYS_TERMINATION_sum,DAYS_TERMINATION_mean,DAYS_TERMINATION_max,DAYS_TERMINATION_min,NFLAG_INSURED_ON_APPROVAL_sum,NFLAG_INSURED_ON_APPROVAL_mean,NFLAG_INSURED_ON_APPROVAL_max,NFLAG_INSURED_ON_APPROVAL_min
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001.0,3951.000,3951.000000,3951.000,3951.000,24835.500,24835.500000,24835.5,24835.500,23787.000,23787.000000,...,-1619.0,-1619.0,-1612.0,-1612.000000,-1612.0,-1612.0,0.0,0.000000,0.0,0.0
100002.0,9251.775,9251.775000,9251.775,9251.775,179055.000,179055.000000,179055.0,179055.000,179055.000,179055.000000,...,-25.0,-25.0,-17.0,-17.000000,-17.0,-17.0,0.0,0.000000,0.0,0.0
100003.0,169661.970,56553.990000,98356.995,6737.310,1306309.500,435436.500000,900000.0,68809.500,1452573.000,484191.000000,...,-536.0,-1980.0,-3142.0,-1047.333333,-527.0,-1976.0,2.0,0.666667,1.0,0.0
100004.0,5357.250,5357.250000,5357.250,5357.250,24282.000,24282.000000,24282.0,24282.000,20106.000,20106.000000,...,-724.0,-724.0,-714.0,-714.000000,-714.0,-714.0,0.0,0.000000,0.0,0.0
100005.0,4813.200,2406.600000,4813.200,0.000,44617.500,22308.750000,44617.5,0.000,40153.500,20076.750000,...,0.0,-466.0,-460.0,-230.000000,0.0,-460.0,0.0,0.000000,0.0,0.0
100006.0,141907.050,15767.450000,39954.510,0.000,2449829.340,272203.260000,688500.0,0.000,2625259.500,291695.500000,...,365243.0,-425.0,729927.0,81103.000000,365243.0,-416.0,0.0,0.000000,0.0,0.0
100007.0,73672.830,12278.805000,22678.785,1834.290,903181.500,150530.250000,247500.0,17176.500,999832.500,166638.750000,...,365243.0,-2056.0,360719.0,60119.833333,365243.0,-2041.0,3.0,0.500000,1.0,0.0
100008.0,63358.785,12671.757000,25309.575,0.000,778509.000,155701.800000,450000.0,0.000,813838.500,162767.700000,...,0.0,-2341.0,-3491.0,-698.200000,0.0,-2334.0,1.0,0.200000,1.0,0.0
100009.0,70359.885,10051.412143,17341.605,7435.845,537192.000,76741.714286,110160.0,40455.000,490963.500,70137.642857,...,365243.0,-1330.0,361710.0,51672.857143,365243.0,-1323.0,0.0,0.000000,0.0,0.0
100010.0,27463.410,27463.410000,27463.410,27463.410,247212.000,247212.000000,247212.0,247212.000,260811.000,260811.000000,...,-769.0,-769.0,-762.0,-762.000000,-762.0,-762.0,0.0,0.000000,0.0,0.0


In [139]:
tst = ColumnExtractor(typ  = 'cat', group ='SK_ID_CURR' ).fit_transform(df)
tst2 = dfSimpleImputer(missing_values=np.nan, strategy='constant').fit_transform(tst)
tst3 = dfOneHotEncoder().fit_transform(tst2.iloc[:,0:15])
tst3



#lst  = [True] * (len(tst2.columns)-1)+ [False]

#st
#OneHotEncoder(handle_unknown = 'ignore', categorical_features=np.array(lst)).fit_transform(tst2)
#tst3

#NumericAggeregator().fit_transform(tst3)


#CatAggeregator(group = 'SK_ID_CURR').fit_transform(tst2)

Unnamed: 0,x0_Cash loans,x0_Consumer loans,x0_Revolving loans,x0_XNA,x1_FRIDAY,x1_MONDAY,x1_SATURDAY,x1_SUNDAY,x1_THURSDAY,x1_TUESDAY,...,x13_Industry,x13_Jewelry,x13_MLM partners,x13_Tourism,x13_XNA,x14_XNA,x14_high,x14_low_action,x14_low_normal,x14_middle
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [137]:
tst2.select_dtypes(include = [object])
#OneHotEncoder(handle_unknown = 'ignore', categorical_features= np.array([1])).fit(tst2)

Unnamed: 0,NAME_CONTRACT_TYPE,WEEKDAY_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,NAME_SELLER_INDUSTRY,NAME_YIELD_GROUP,PRODUCT_COMBINATION,SK_ID_CURR
0,Consumer loans,SATURDAY,Y,XAP,Approved,Cash through the bank,XAP,missing_value,Repeater,Mobile,POS,XNA,Country-wide,Connectivity,middle,POS mobile with interest,271877
1,Cash loans,THURSDAY,Y,XNA,Approved,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,XNA,low_action,Cash X-Sell: low,108129
2,Cash loans,TUESDAY,Y,XNA,Approved,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,XNA,high,Cash X-Sell: high,122040
3,Cash loans,MONDAY,Y,XNA,Approved,Cash through the bank,XAP,missing_value,Repeater,XNA,Cash,x-sell,Credit and cash offices,XNA,middle,Cash X-Sell: middle,176158
4,Cash loans,THURSDAY,Y,Repairs,Refused,Cash through the bank,HC,missing_value,Repeater,XNA,Cash,walk-in,Credit and cash offices,XNA,high,Cash Street: high,202054
5,Cash loans,SATURDAY,Y,Everyday expenses,Approved,Cash through the bank,XAP,Family,Repeater,XNA,Cash,x-sell,Credit and cash offices,XNA,low_normal,Cash X-Sell: low,199383
6,Cash loans,TUESDAY,Y,XNA,Canceled,XNA,XAP,missing_value,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,175704
7,Cash loans,MONDAY,Y,XNA,Canceled,XNA,XAP,missing_value,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,296299
8,Cash loans,MONDAY,Y,XNA,Canceled,XNA,XAP,missing_value,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,342292
9,Cash loans,SATURDAY,Y,XNA,Canceled,XNA,XAP,missing_value,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,334349
