In [470]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

In [471]:
df = pd.read_csv("/Users/karina/GitHub//mini-project-iv/data/data.csv") 
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [472]:
df['Loan_Status'] = df['Loan_Status'].replace({'Y' :1, 'N':0})

y = df.pop('Loan_Status')
X = df.drop('Loan_ID',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=32)

In [473]:
class DataframeFunctionTransformer:
    def __init__(self, func):
        self.func = func

    def transform(self, input_df, **transform_params):
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        return self

def create_total_income_feature(input_df):
    input_df['TotalIncome'] = input_df['ApplicantIncome'] + input_df['CoapplicantIncome']
    return input_df

def to_dataframe(array):
    columns= ['Gender','Dependents','Married','Self_Employed', 'LoanAmount',
               'Loan_Amount_Term','Credit_History','Education','ApplicantIncome',
               'CoapplicantIncome','Property_Area', 'TotalIncome']
    
    return pd.DataFrame(array, columns = columns)
    
    return pd.DataFrame(array, columns = columns)

def log_object(input_df):
    input_df['LoanAmount'] = np.log(input_df['LoanAmount'])
    input_df['TotalIncome'] = np.log(input_df['TotalIncome'])
    return input_df


In [474]:
fillna_transformer = ColumnTransformer([
     ('fillna_mode', SimpleImputer(strategy="most_frequent"), ['Gender','Dependents']),
     ('fillna_no', SimpleImputer(strategy='constant', fill_value='No'), ['Married', 'Self_Employed']),
     ('fillna_mean', SimpleImputer(strategy='mean'), ['LoanAmount', 'Loan_Amount_Term']),
     ('fillna_zero', SimpleImputer(strategy='constant',fill_value=0), ['Credit_History'])],
     remainder='passthrough')
     
categorical_preprocessing = Pipeline([
    ('ohe', OneHotEncoder()),
    ])
numerical_preprocessing = Pipeline([
    ('scaling', StandardScaler())
    ])

In [475]:
preprocessing_1 = Pipeline([
    ('total_income', DataframeFunctionTransformer(create_total_income_feature)),
    ('log_transformer', DataframeFunctionTransformer(log_object)),
    ('fillna_trans', fillna_transformer),
    ('to_dataframe', DataframeFunctionTransformer(to_dataframe)),
    ])

In [476]:
categorical_preprocessing = Pipeline([
    ('ohe', OneHotEncoder()),
    ])
numerical_preprocessing = Pipeline([
    ('scaler', StandardScaler())
    ])

preprocessing_2 = ColumnTransformer([
    ('categorical_preprocessing', categorical_preprocessing, ['Gender', 'Dependents', 'Married', 'Self_Employed', 'Education', 'Property_Area']),
    ('numerical_preprocessing', numerical_preprocessing,['LoanAmount', 'Credit_History', 'Loan_Amount_Term', 'TotalIncome'])
])

In [477]:
feature_union = FeatureUnion([('pca', PCA()), 
                              ('select_best', SelectKBest())])

In [478]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC

pipeline = Pipeline([
    ('log_transformer', preprocessing_1),
    ('preprocessing', preprocessing_2),
    ('features', feature_union),
    ('clf', LogisticRegression())
])

pipeline.fit(X_train, y_train)

In [479]:
pipeline.score(X_test, y_test)

0.7513513513513513

In [480]:
param_grid = {"features__pca__n_components": [1, 2, 3, 4, 5],
                "features__select_best__k": [3, 4, 5, 6],
                'clf__penalty': ['l1','l2'],
                'clf__C': [0.001,0.01,0.1,1,10,100,1000]}

grid_search = GridSearchCV(pipeline, param_grid, verbose=5, n_jobs=-1, refit=True)    

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 280 candidates, totalling 1400 fits
[CV 2/5] END clf__C=0.001, clf__penalty=l1, features__pca__n_components=1, features__select_best__k=3;, score=nan total time=   0.0s
[CV 1/5] END clf__C=0.001, clf__penalty=l1, features__pca__n_components=1, features__select_best__k=3;, score=nan total time=   0.0s
[CV 3/5] END clf__C=0.001, clf__penalty=l1, features__pca__n_components=1, features__select_best__k=3;, score=nan total time=   0.0s
[CV 4/5] END clf__C=0.001, clf__penalty=l1, features__pca__n_components=1, features__select_best__k=4;, score=nan total time=   0.0s
[CV 1/5] END clf__C=0.001, clf__penalty=l1, features__pca__n_components=1, features__select_best__k=4;, score=nan total time=   0.0s
[CV 5/5] END clf__C=0.001, clf__penalty=l1, features__pca__n_components=1, features__select_best__k=4;, score=nan total time=   0.0s
[CV 2/5] END clf__C=0.001, clf__penalty=l1, features__pca__n_components=1, features__select_best__k=5;, score=nan total time=   0.0s
[CV 3

700 fits failed out of a total of 1400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
700 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/base_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/base_env/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/base_env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.

In [481]:
grid_search.score(X_test, y_test)

0.7675675675675676

In [482]:
grid_search.best_params_

{'clf__C': 0.1,
 'clf__penalty': 'l2',
 'features__pca__n_components': 3,
 'features__select_best__k': 3}