<a href="https://colab.research.google.com/github/nchaudh03/AML-HCDR/blob/master/DS%20PROJ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, _fit_transform_one, _transform_one
from sklearn.preprocessing import OneHotEncoder, Imputer,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
from sklearn.externals.joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse
import numpy as np

#os.chdir(r'C:\Users\naimesh.chaudhari\Downloads')
#df  = pd.read_csv('previous_application.csv')

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
class ColumnExtractor(TransformerMixin):
    def __init__(self, typ = "num",group = "" ):
        self.typ = typ
        self.group = group
    def fit(self, X, y=None):
        return self
    def transform (self,X):
        cat  = []
        num = []
        Xs  = X
        for col in X.columns:
            if Xs[col].dtypes == 'object':
                cat.append(col)
            else:
                num.append(col)
        if self.group != "":
            if self.group in num:
                cat.append(self.group)
            else:
                num.append(self.group)
        if self.typ == 'num':
            final = Xs[num]
        else:
            final = Xs[cat]
        return final
    

In [0]:
class NumericAggeregator(TransformerMixin):
    def __init__(self,group="",drop=""):
        self.group = group
        self.drop = drop
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        Xs = X
        if self.drop in Xs.columns:
            Xs = Xs.drop(self.drop,axis = 1)
        Xs = Xs.groupby(self.group).agg([np.sum, np.mean,max,min])  
        Xs.columns = Xs.columns.map('_'.join)
        return Xs
   

In [0]:
#might need an encoder class before an aggeregator class
class CatAggeregator(TransformerMixin):
    def __init__(self, group="",drop=""):
        self.group = group
        self.drop = drop
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        Xs  = X
        if self.drop in Xs.columns:
            Xs.drop(self.drop,axis = 1, inplace= True)
        Xs = Xs.groupby(self.group).agg([np.mean])  
        Xs.columns = Xs.columns.map('_'.join)
        return Xs

In [0]:

class PandasFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(
                transformer=trans,
                X=X,
                y=y,
                weight=weight,
                **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(
                transformer=trans,
                X=X,
                y=None,
                weight=weight)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [0]:
class dfSimpleImputer(TransformerMixin):
    def __init__(self, missing_values=np.nan,strategy='constant'):
        self.missing_values = missing_values
        self.strategy  = strategy
    def fit(self, X):
        self.imp = SimpleImputer(missing_values=np.nan, strategy='constant').fit(X)
        return self
    def transform(self , X):
        dat = self.imp.transform(X)
        final = pd.DataFrame(dat, columns = X.columns)
        return final
    def fit_transform(self, X, y= None):
        self.imp = SimpleImputer(missing_values=np.nan, strategy='constant').fit(X)
        dat = self.imp.transform(X)
        final = pd.DataFrame(dat, columns = X.columns)
        return final
        

In [0]:
class dfOneHotEncoder(TransformerMixin):
    def __init__(self, key = "", ind = ""):
        self.key = key
        self.ind = ind
    def fit(self, X):
        if self.key != "":
          self.ind = X[self.key]
          X.drop(self.key, inplace = True, axis = 1)
        self.imp = OneHotEncoder(handle_unknown = 'ignore').fit(X)
        return self
    
    def transform(self , X):
        if self.key != "":
          self.ind = X[self.key]
          X.drop(self.key, inplace = True, axis = 1)
        dat = self.imp.transform(X).toarray()
        final = pd.DataFrame(dat, columns = self.imp.get_feature_names())
        if self.key != "":
          final[self.key] = self.ind
        return final
    
    def fit_transform(self, X, y= None ):
        if self.key != "":
          self.ind = X[self.key]
          X.drop(self.key, inplace = True, axis = 1)
        self.imp = OneHotEncoder(handle_unknown = 'ignore').fit(X)
        dat = self.imp.transform(X).toarray()
        final = pd.DataFrame(dat, columns = self.imp.get_feature_names())
        if self.key != "":
          final[self.key] = self.ind
        return final
        

In [0]:
class dfStandardScaler(TransformerMixin):
    def fit(self, X, y=None):
        self.imp = StandardScaler().fit(X)
        return self
    def transform(self, X):
        dat = self.imp.transform(X)
        final = pd.DataFrame(dat, columns = X.columns)
        return final
    def fit_transform(self, X, y=None):
        self.imp = StandardScaler().fit(X)
        dat = self.imp.transform(X)
        final = pd.DataFrame(dat, columns = X.columns, index = X.index)
        return final

In [11]:
#Pipeline for previous Application
prevapp = pd.read_csv(r'/content/drive/My Drive/HCDR Project/previous_application.csv')
Npipe_prevapp = Pipeline([('cl,', ColumnExtractor(typ  = 'num',group ='SK_ID_CURR')),
                          ('imp',dfSimpleImputer(missing_values=np.nan, strategy='constant')),
                          ('agg', NumericAggeregator(group = 'SK_ID_CURR',drop = 'SK_ID_PREV')),
                          ('scl',dfStandardScaler())])

#Npipe_prevapp.fit_transform(prevapp)

Cpipe_prevapp = Pipeline([('cl,', ColumnExtractor(typ  = 'cat', group ='SK_ID_CURR' )),
                          ('imp',dfSimpleImputer(missing_values=np.nan, strategy='constant')),
                          ('encode', dfOneHotEncoder(key ='SK_ID_CURR' )),
                          ('agg', CatAggeregator(group = 'SK_ID_CURR', drop = 'SK_ID_PREV'))])


#Cpipe_prevapp.fit_transform(prevapp)

combined_features_prevapp = PandasFeatureUnion([("num", Npipe_prevapp),
                                               ("cat", Cpipe_prevapp)])
agg_prev_app = combined_features_prevapp.fit_transform(prevapp)
agg_prev_app['SK_ID_CURR'] = agg_prev_app.index
agg_prev_app.head()

Unnamed: 0_level_0,AMT_ANNUITY_sum,AMT_ANNUITY_mean,AMT_ANNUITY_max,AMT_ANNUITY_min,AMT_APPLICATION_sum,AMT_APPLICATION_mean,AMT_APPLICATION_max,AMT_APPLICATION_min,AMT_CREDIT_sum,AMT_CREDIT_mean,...,x15_POS household with interest_mean,x15_POS household without interest_mean,x15_POS industry with interest_mean,x15_POS industry without interest_mean,x15_POS mobile with interest_mean,x15_POS mobile without interest_mean,x15_POS other with interest_mean,x15_POS others without interest_mean,x15_missing_value_mean,SK_ID_CURR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001.0,-0.774842,-0.931944,-1.035698,-0.09655,-0.619605,-0.849415,-0.835933,-0.205543,-0.630504,-0.888254,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,100001.0
100002.0,-0.702992,-0.326793,-0.760639,0.621209,-0.505698,0.161104,-0.47043,1.443758,-0.526673,0.052874,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,100002.0
100003.0,1.471304,5.073358,3.86306,0.280735,0.326898,1.841035,1.238222,0.264737,0.324954,1.902399,...,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,100003.0
100004.0,-0.755781,-0.771403,-0.962727,0.093866,-0.620014,-0.853041,-0.837245,-0.211463,-0.632965,-0.910566,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,100004.0
100005.0,-0.763155,-1.108257,-0.990958,-0.631541,-0.604994,-0.865971,-0.78905,-0.471147,-0.619559,-0.910743,...,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,100005.0


In [0]:

train = pd.read_csv(r'/content/drive/My Drive/HCDR Project/application_train.csv')
test = pd.read_csv(r'/content/drive/My Drive/HCDR Project/application_test.csv')




Y = train['TARGET']
train_ids = train['SK_ID_CURR']
test_ids = test['SK_ID_CURR']
test.drop(['SK_ID_CURR'], inplace = True, axis = 1)


train.drop(['TARGET','SK_ID_CURR'], inplace = True, axis = 1)
Npipe_prevapp = Pipeline([('cl,', ColumnExtractor(typ  = 'num',group ='')),
                          ('imp',dfSimpleImputer(missing_values=np.nan, strategy='constant')),
                          ('scl',dfStandardScaler())])



Cpipe_prevapp = Pipeline([('cl,', ColumnExtractor(typ  = 'cat', group ='' )),
                          ('imp',dfSimpleImputer(missing_values=np.nan, strategy='constant')),
                          ('encode', dfOneHotEncoder())])

#tst = Npipe_prevapp.fit_transform(train)
#Cpipe_prevapp.fit_transform(train)

combined_features = PandasFeatureUnion([("num", Npipe_prevapp),
                                                ("cat", Cpipe_prevapp)])


trn = combined_features.fit_transform(train)
trn['SK_ID_CURR'] = train_ids
trn = trn.merge(agg_prev_app, how  = 'left', on = 'SK_ID_CURR')


tst = combined_features.fit_transform(test)
tst['SK_ID_CURR'] = test_ids
tst = tst.merge(agg_prev_app, how  = 'left', on = 'SK_ID_CURR')


trn.fillna(0, inplace = True)
tst.fillna(0, inplace = True)

tst.drop(['SK_ID_CURR'], axis = 1, inplace = True)
trn.drop(['SK_ID_CURR'], axis = 1, inplace = True)


In [24]:
print(len(trn.columns))
print(len(tst.columns))



471
468


In [15]:
model  = LogisticRegression()
model.fit(trn,Y)



pred = model.predict_proba(tst)[:,1]
sub = pd.DataFrame(test_ids)
sub['TARGET'] = pred
sub.columns = ['SK_ID_CURR','TARGET']

sub
#sub.to_csv('submission.csv', index = False)





ValueError: ignored