In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

In [2]:
data_all = pd.read_csv('loanprediction.csv', header=0, parse_dates=[0], index_col=0, squeeze=True)
data_all.head(5)

Unnamed: 0_level_0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [3]:
data_all.columns = ['income', 'age', 'experience', 'marital_status', 'house_ownership',
       'car_ownership', 'profession', 'city', 'state', 'current_job_yrs',
       'current_house_yrs', 'risk_flag']
numerical = ["income", "age", "experience", "current_job_yrs", "current_house_yrs"]
categorical = ["marital_status", "house_ownership", "car_ownership", "profession", "city", "state"] 

In [4]:
for col in categorical:
    data_all = pd.concat([data_all, pd.get_dummies(data_all[col], prefix = col+'_=')], axis = 1).drop(col, axis = 1)

In [5]:
from sklearn.preprocessing import StandardScaler

for col in numerical:
    data_all[col] = StandardScaler().fit_transform(data_all[col].values.reshape(-1, 1))
data_all.head()

Unnamed: 0_level_0,income,age,experience,current_job_yrs,current_house_yrs,risk_flag,marital_status_=_married,marital_status_=_single,house_ownership_=_norent_noown,house_ownership_=_owned,...,state_=_Punjab,state_=_Rajasthan,state_=_Sikkim,state_=_Tamil_Nadu,state_=_Telangana,state_=_Tripura,state_=_Uttar_Pradesh,state_=_Uttar_Pradesh[5],state_=_Uttarakhand,state_=_West_Bengal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.283145,-1.579604,-1.180232,-0.914131,0.716356,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.895457,-0.583344,-0.014067,0.731036,0.716356,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.349269,0.940348,-1.013637,-0.639936,-1.427981,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.437526,-0.52474,-1.346827,-1.188325,0.001577,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0.268128,-0.173119,0.152528,-0.914131,1.431135,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [6]:
X = data_all.drop("risk_flag", axis=1)
y = data_all["risk_flag"]
print('Shape of x: {}'.format(x.shape))
print('Shape of y: {}'.format(y.shape))

NameError: name 'x' is not defined

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=11)

In [None]:
smote = SMOTE(random_state = 11)
X_train, y_train = smote.fit_resample(X_train, y_train)
pipeline = Pipeline(steps = [['scaler', MinMaxScaler()],
                             ['classifier', LogisticRegression(random_state=11,
                                                               max_iter=1000)]])

stratified_kfold = StratifiedKFold(n_splits=3,
                                       shuffle=True,
                                       random_state=11)
    
param_grid = {'classifier__C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')