In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
import pandas as pd
import numpy as np

In [5]:
df_p = pd.read_csv("../data/data.csv") 

In [6]:
## Train test split
Xp = df_p.drop(columns = ['Loan_Status', 'Loan_ID'])
yp = df_p['Loan_Status']
Xp_train, Xp_test, yp_train, yp_test = train_test_split(Xp, yp, test_size=0.3, random_state=84)

In [7]:
cat_features = ['Married', 'Dependents', 'Education', 'Credit_History', 'Property_Area']
num_features = ['Loan_Amount_Term', 'LoanAmount_log', 'Total_income_log']

In [8]:
model_rfc = RandomForestClassifier(max_depth = 10, max_features = 5, min_samples_leaf = 5, n_estimators = 100)

In [19]:
## Define class to add 'Total_income' column and drop applicant income and coapplicant income
class colTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['Total_income'] = X['ApplicantIncome'] + X['CoapplicantIncome']
        return X

In [26]:
_catfeats(df_p)

Unnamed: 0,Married,Dependents,Education,Credit_History,Property_Area
0,No,0,Graduate,1.0,Urban
1,Yes,1,Graduate,1.0,Rural
2,Yes,0,Graduate,1.0,Urban
3,Yes,0,Not Graduate,1.0,Urban
4,No,0,Graduate,1.0,Urban
...,...,...,...,...,...
609,No,0,Graduate,1.0,Rural
610,Yes,3+,Graduate,1.0,Rural
611,Yes,1,Graduate,1.0,Urban
612,Yes,2,Graduate,1.0,Urban


In [32]:
## First Let us prepare our features by defining our own functions
## Keep categorical features
def _catfeats(dataframe):
    df_cat = dataframe[cat_features]
    return dataframe[cat_features]
## Keep numerical features
def _numfeats(dataframe):
    df_num = dataframe[['LoanAmount', 'Loan_Amount_Term']]
    return df_num

In [11]:
# ## Define log transform function
def _logobject(X):
    X_log = np.log(X)
    return X_log

In [29]:
## Column transformer
logtransform = ColumnTransformer([('logtransform', _logobject, ['LoanAmount'])], 
                              remainder='passthrough')

In [35]:
## Apply FunctionTransformer to our custom functions
catfeats = FunctionTransformer(_catfeats)
numfeats = FunctionTransformer(_numfeats)


In [15]:
## Median and mode imputers
imp_median = SimpleImputer(missing_values = np.nan, strategy = 'median')
imp_mode = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

In [16]:
## One Hot Encoder
enc = OneHotEncoder(handle_unknown='ignore', sparse = False)

In [36]:
## Build our pipeline
numerical_features = Pipeline(steps = [
                                    
                                    ('keep_numeric_feats', numfeats), ## select numeric features
                                     
                                    ('impute_median', imp_median), ## Impute median
                                    ('log_transform', logtransform) ## Log transform the LoanAmount and Total Income columns
])

categorical_features = Pipeline(steps = [
                                        ('keep_categorical_feats', catfeats),
                                        ('impute_mode', imp_mode),
                                        ('one_hot_encode', enc)
])

## Merge features 
union = FeatureUnion([
                    ('numerical_features', numerical_features),
                    ('categorical_features', categorical_features)
])

## Final pipeline
pipeline_rfc = Pipeline(steps = [
                                ('feature_union', union),
                                ('random_forest_classifier', model_rfc)
])


In [37]:
pipeline_rfc.fit(Xp_train, yp_train)

TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '<function _logobject at 0x000001EF22D3EF70>' (type <class 'function'>) doesn't.