<a href="https://colab.research.google.com/github/nchaudh03/AML-HCDR/blob/master/DS%20PROJ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, _fit_transform_one, _transform_one
from sklearn.preprocessing import OneHotEncoder, Imputer
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
from sklearn.externals.joblib import Parallel, delayed
from scipy import sparse
import numpy as np

#os.chdir(r'C:\Users\naimesh.chaudhari\Downloads')
#df  = pd.read_csv('previous_application.csv')

In [0]:
# On Colab, mount Google Drive first so the path below resolves:
#from google.colab import drive
#drive.mount('/content/drive')

# Folder that holds the project CSV files; edit this one constant to run the
# notebook somewhere other than the Colab Drive mount.
DATA_DIR = r'/content/drive/My Drive/HCDR Project'

# NOTE(review): some saved outputs further down show previous_application
# columns (e.g. SK_ID_CURR), not bureau_balance ones -- confirm which CSV each
# later cell expects `df` to hold.
df = pd.read_csv(os.path.join(DATA_DIR, 'bureau_balance.csv'))

In [0]:
class ColumnExtractor(TransformerMixin):
    """Select either the numeric or the categorical columns of a DataFrame.

    A column whose dtype is ``object`` counts as categorical; every other
    column counts as numeric.  When ``group`` names a column of ``X``, that
    column is also added to the selection it does not naturally belong to, so
    the grouping key is present in both the numeric and the categorical
    output.

    Parameters
    ----------
    typ : str, default "num"
        ``"num"`` returns the numeric selection; any other value returns the
        categorical selection.
    group : str, default ""
        Name of the grouping-key column to carry along.  ``""`` disables this.
        A name that is not a column of ``X`` is ignored, so both selections
        behave the same way instead of only the numeric one raising
        ``KeyError``.
    """

    def __init__(self, typ = "num",group = "" ):
        self.typ = typ
        self.group = group

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform (self,X):
        """Return the columns of ``X`` chosen by ``typ`` (plus the group key)."""
        cat = []
        num = []
        for col in X.columns:
            if X[col].dtypes == 'object':
                cat.append(col)
            else:
                num.append(col)

        # Only carry the key along when it really exists in X; appending an
        # unknown name would make the column lookup below fail.
        if self.group != "" and self.group in X.columns:
            if self.group in num:
                cat.append(self.group)
            else:
                num.append(self.group)

        if self.typ == 'num':
            return X[num]
        return X[cat]
    

In [0]:
class NumericAggeregator(TransformerMixin):
    """Aggregate every column per group with sum, mean, max and min.

    Parameters
    ----------
    group : str, default ""
        Column passed to ``DataFrame.groupby``.
    drop : str, default ""
        Column removed before aggregating, if it is present in the input.

    ``transform`` returns a DataFrame indexed by the group key whose columns
    are named ``<column>_<statistic>``.
    """

    def __init__(self,group="",drop=""):
        self.group = group
        self.drop = drop

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """Group ``X`` by ``self.group`` and flatten the aggregated columns."""
        Xs = X
        if self.drop in Xs.columns:
            # Non-mutating drop: the caller's frame is left untouched.
            Xs = Xs.drop(self.drop,axis = 1)
        # String aliases select pandas' own groupby reductions and give the
        # same 'sum'/'mean'/'max'/'min' labels as the callables did, without
        # the deprecation warning recent pandas raises for np.sum/np.mean/
        # max/min passed as functions.
        Xs = Xs.groupby(self.group).agg(['sum', 'mean', 'max', 'min'])
        # Collapse the (column, statistic) MultiIndex into 'column_statistic'.
        Xs.columns = Xs.columns.map('_'.join)
        return Xs
   

In [0]:
# Might need an encoder class before the aggregator class: CatAggeregator takes the mean of every column, so string columns have to be encoded first.
class CatAggeregator(TransformerMixin):
    """Aggregate every column per group with the mean.

    Parameters
    ----------
    group : str, default ""
        Column passed to ``DataFrame.groupby``.
    drop : str, default ""
        Column removed before aggregating, if it is present in the input.

    ``transform`` returns a DataFrame indexed by the group key whose columns
    are named ``<column>_mean``.
    """

    def __init__(self, group="",drop=""):
        self.group = group
        self.drop = drop

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """Group ``X`` by ``self.group`` and average every remaining column."""
        Xs  = X
        if self.drop in Xs.columns:
            # Build a new frame rather than dropping in place, so the
            # DataFrame handed in by the caller is not modified.
            Xs = Xs.drop(self.drop,axis = 1)
        # The 'mean' alias gives the same label as np.mean did and avoids the
        # deprecation warning recent pandas raises for callables in agg().
        Xs = Xs.groupby(self.group).agg(['mean'])
        # Collapse the (column, statistic) MultiIndex into 'column_mean'.
        Xs.columns = Xs.columns.map('_'.join)
        return Xs

In [0]:

class PandasFeatureUnion(FeatureUnion):
    """FeatureUnion whose combined output is joined with pandas.

    Every transformer receives the same input.  If any of their outputs is a
    scipy sparse matrix, the outputs are stacked with ``scipy.sparse.hstack``
    and returned in CSR form; otherwise they are concatenated column-wise
    with ``pd.concat``.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """Fit every transformer on ``X`` and return the combined outputs."""
        self._validate_transformers()
        fitted = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(
                transformer=trans, X=X, y=y, weight=weight, **fit_params)
            for name, trans, weight in self._iter())

        if not fitted:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        outputs, transformers = zip(*fitted)
        self._update_transformer_list(transformers)
        return self._stack_outputs(outputs)

    def merge_dataframes_by_column(self, Xs):
        """Concatenate the given frames side by side."""
        return pd.concat(Xs, axis="columns", copy=False)

    def _stack_outputs(self, outputs):
        """Combine transformer outputs: sparse hstack if any is sparse, else pandas concat."""
        if any(sparse.issparse(part) for part in outputs):
            return sparse.hstack(outputs).tocsr()
        return self.merge_dataframes_by_column(outputs)

    def transform(self, X):
        """Apply every transformer to ``X`` and return the combined outputs."""
        outputs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(
                transformer=trans, X=X, y=None, weight=weight)
            for name, trans, weight in self._iter())

        if not outputs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        return self._stack_outputs(outputs)

In [0]:
class dfSimpleImputer(TransformerMixin):
    """``SimpleImputer`` wrapper that returns a DataFrame instead of an array.

    Parameters
    ----------
    missing_values : default ``np.nan``
        Forwarded to ``SimpleImputer`` as the placeholder to replace.
    strategy : str, default 'constant'
        Forwarded to ``SimpleImputer``.

    The output keeps the column labels and the row index of the input, so the
    input must be a DataFrame.

    NOTE(review): this assumes the imputer returns as many columns as it
    received; strategies that can drop columns would break the relabelling --
    confirm before using anything other than 'constant'.
    """

    def __init__(self, missing_values=np.nan,strategy='constant'):
        self.missing_values = missing_values
        self.strategy  = strategy

    def fit(self, X, y=None):
        """Fit the wrapped imputer on ``X``; ``y`` is accepted and ignored."""
        # Use the configured settings; they were previously stored but the
        # imputer was always built with np.nan / 'constant'.
        self.imp = SimpleImputer(missing_values=self.missing_values,
                                 strategy=self.strategy).fit(X)
        return self

    def transform(self , X):
        """Impute ``X`` and wrap the result in a DataFrame labelled like ``X``."""
        dat = self.imp.transform(X)
        # Keep the original row labels so the result still lines up with X.
        final = pd.DataFrame(dat, columns = X.columns, index = X.index)
        return final

    def fit_transform(self, X, y= None):
        """Fit on ``X`` and return the imputed DataFrame."""
        return self.fit(X, y).transform(X)
        

In [0]:
class dfOneHotEncoder(TransformerMixin):
    """``OneHotEncoder`` wrapper that returns a dense DataFrame.

    Every column of the input is one-hot encoded (unknown categories are
    ignored at transform time) and the result is labelled with the encoder's
    feature names.

    NOTE: a later cell in this notebook defines another class with the same
    name, which replaces this one once that cell has run.
    """

    def fit(self, X, y=None):
        """Fit the wrapped encoder on ``X``; ``y`` is accepted and ignored."""
        self.imp = OneHotEncoder(handle_unknown = 'ignore').fit(X)
        return self

    def transform(self , X):
        """Encode ``X`` and return the result as a dense DataFrame."""
        # Densify first, exactly as fit_transform always did; building the
        # DataFrame straight from the encoder output skipped this step.
        dat = self.imp.transform(X).toarray()
        # Keep the row labels when X is a DataFrame; arrays have none.
        final = pd.DataFrame(dat, columns = self.imp.get_feature_names(),
                             index = getattr(X, 'index', None))
        return final

    def fit_transform(self, X, y= None ):
        """Fit on ``X`` and return the encoded DataFrame."""
        return self.fit(X, y).transform(X)
        

In [0]:
class dfOneHotEncoder(TransformerMixin):
    """``OneHotEncoder`` wrapper that returns a dense DataFrame.

    Every column of the input is one-hot encoded (unknown categories are
    ignored at transform time) and the result is labelled with the encoder's
    feature names.  An earlier cell defines a class with the same name; this
    definition replaces it.

    ``fit`` followed by ``transform`` now produces the same result as
    ``fit_transform``.  Previously ``fit`` built a ColumnTransformer over the
    hard-coded column positions 0-9 with the remaining columns passed
    through, while ``fit_transform`` encoded every column with a plain
    OneHotEncoder, so the two paths disagreed.

    NOTE(review): if the id column is meant to pass through un-encoded, that
    has to be introduced in fit and fit_transform together.
    """

    def fit(self, X, y=None):
        """Fit the wrapped encoder on all columns of ``X``; ``y`` is ignored."""
        self.imp = OneHotEncoder(handle_unknown = 'ignore').fit(X)
        return self

    def transform(self , X):
        """Encode ``X`` and return the result as a dense DataFrame."""
        dat = self.imp.transform(X).toarray()
        # Keep the row labels when X is a DataFrame; arrays have none.
        final = pd.DataFrame(dat, columns = self.imp.get_feature_names(),
                             index = getattr(X, 'index', None))
        return final

    def fit_transform(self, X, y= None ):
        """Fit on ``X`` and return the encoded DataFrame."""
        return self.fit(X, y).transform(X)
        

In [0]:
# Pipelines for the previous-application table.
# NOTE(review): the first step is named 'cl,' (with a trailing comma), which
# looks like a typo for 'cl'; it is kept because step names are part of the
# pipeline's parameter keys.

# Numeric branch: numeric columns plus the SK_ID_CURR key -> impute ->
# aggregate per SK_ID_CURR (SK_ID_PREV is dropped before aggregating).
Npipe_prevapp = Pipeline(steps=[
    ('cl,', ColumnExtractor(typ='num', group='SK_ID_CURR')),
    ('imp', dfSimpleImputer(missing_values=np.nan, strategy='constant')),
    ('agg', NumericAggeregator(group='SK_ID_CURR', drop='SK_ID_PREV')),
])

# Categorical branch: object columns plus the SK_ID_CURR key -> aggregate per
# SK_ID_CURR.  The imputation step is currently switched off.
Cpipe_prevapp = Pipeline(steps=[
    ('cl,', ColumnExtractor(typ='cat', group='SK_ID_CURR')),
    #('imp',dfSimpleImputer(missing_values=np.nan, strategy='constant')),
    ('agg', CatAggeregator(group='SK_ID_CURR')),
])

# Both branches side by side in one DataFrame.
combined_features_prevapp = PandasFeatureUnion(transformer_list=[
    ("num", Npipe_prevapp),
    ("cat", Cpipe_prevapp),
])

#Npipe_prevapp.fit_transform(df)

#agg_prev_app = combined_features_prevapp.fit_transform(df)
#agg_p

In [25]:
# Run the numeric branch by hand, one step at a time, keyed by SK_ID_BUREAU.
tst = (
    ColumnExtractor(typ='num', group='SK_ID_BUREAU')
    .fit_transform(df)
)
tst2 = (
    dfSimpleImputer(missing_values=np.nan, strategy='constant')
    .fit_transform(tst)
)
# Last expression of the cell, so the aggregated frame is displayed.
(
    NumericAggeregator(group='SK_ID_BUREAU', drop='')
    .fit_transform(tst2)
)

Unnamed: 0_level_0,MONTHS_BALANCE_sum,MONTHS_BALANCE_mean,MONTHS_BALANCE_max,MONTHS_BALANCE_min
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5001709,-4656,-48.0,0,-96
5001710,-3403,-41.0,0,-82
5001711,-6,-1.5,0,-3
5001712,-171,-9.0,0,-18
5001713,-231,-10.5,0,-21
5001714,-105,-7.0,0,-14
5001715,-1770,-29.5,0,-59
5001716,-3655,-42.5,0,-85
5001717,-231,-10.5,0,-21
5001718,-741,-19.0,0,-38


In [23]:
# Run the categorical branch by hand on the first 13 rows: select the object
# columns plus the SK_ID_CURR key, impute, then one-hot encode.
tst = (
    ColumnExtractor(typ='cat', group='SK_ID_CURR')
    .fit_transform(df)
)
tst2 = (
    dfSimpleImputer(missing_values=np.nan, strategy='constant')
    .fit_transform(tst.iloc[:13])
)
tst3 = dfOneHotEncoder().fit_transform(tst2)
# Last expression of the cell, so the encoded frame is displayed.
tst3



#lst  = [True] * (len(tst2.columns)-1)+ [False]

#st
#OneHotEncoder(handle_unknown = 'ignore', categorical_features=np.array(lst)).fit_transform(tst2)
#tst3

#NumericAggeregator().fit_transform(tst3)


#CatAggeregator(group = 'SK_ID_CURR').fit_transform(tst2)

Unnamed: 0,x0_Cash loans,x0_Consumer loans,x1_FRIDAY,x1_MONDAY,x1_SATURDAY,x1_THURSDAY,x1_TUESDAY,x2_Y,x3_Everyday expenses,x3_Repairs,...,x16_175704,x16_176158,x16_199383,x16_202054,x16_258628,x16_271877,x16_296299,x16_334349,x16_342292,x16_447712
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [0]:
# Show only the object-dtype columns of the imputed frame.
tst2.select_dtypes(include=object)
#OneHotEncoder(handle_unknown = 'ignore', categorical_features= np.array([1])).fit(tst2)

Unnamed: 0,NAME_CONTRACT_TYPE,WEEKDAY_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,NAME_SELLER_INDUSTRY,NAME_YIELD_GROUP,PRODUCT_COMBINATION,SK_ID_CURR
0,Consumer loans,SATURDAY,Y,XAP,Approved,Cash through the bank,XAP,missing_value,Repeater,Mobile,POS,XNA,Country-wide,Connectivity,middle,POS mobile with interest,271877
1,Cash loans,THURSDAY,Y,XNA,Approved,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,XNA,low_action,Cash X-Sell: low,108129
2,Cash loans,TUESDAY,Y,XNA,Approved,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,XNA,high,Cash X-Sell: high,122040
3,Cash loans,MONDAY,Y,XNA,Approved,Cash through the bank,XAP,missing_value,Repeater,XNA,Cash,x-sell,Credit and cash offices,XNA,middle,Cash X-Sell: middle,176158
4,Cash loans,THURSDAY,Y,Repairs,Refused,Cash through the bank,HC,missing_value,Repeater,XNA,Cash,walk-in,Credit and cash offices,XNA,high,Cash Street: high,202054
5,Cash loans,SATURDAY,Y,Everyday expenses,Approved,Cash through the bank,XAP,Family,Repeater,XNA,Cash,x-sell,Credit and cash offices,XNA,low_normal,Cash X-Sell: low,199383
6,Cash loans,TUESDAY,Y,XNA,Canceled,XNA,XAP,missing_value,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,175704
7,Cash loans,MONDAY,Y,XNA,Canceled,XNA,XAP,missing_value,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,296299
8,Cash loans,MONDAY,Y,XNA,Canceled,XNA,XAP,missing_value,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,342292
9,Cash loans,SATURDAY,Y,XNA,Canceled,XNA,XAP,missing_value,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,334349
