In [159]:
import numpy as np
import pandas as pd

In [160]:
features = pd.read_csv('features.csv')
features.head()

Unnamed: 0,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not_Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,TotalIncomeLog,LoanAmountLog,Credit_History,Loan_Status
0,0,1,1,0,1,0,1,0,0,0,1,8.674026,5.062595,1.0,1
1,0,1,0,1,1,0,1,0,1,0,0,8.714568,4.85203,1.0,0
2,0,1,0,1,1,0,0,1,0,0,1,8.006368,4.189655,1.0,1
3,0,1,0,1,0,1,1,0,0,0,1,8.505323,4.787492,1.0,1
4,0,1,1,0,1,0,1,0,0,0,1,8.699515,4.94876,1.0,1


In [161]:
X = features.drop('Loan_Status', axis=1)
y = features['Loan_Status']

In [162]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 15, stratify=y)

### Random Forest Classifier

In [163]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

In [164]:
y_pred = rfc.predict(X_test)

In [165]:
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, y_pred)
print(auc)

0.683223864258347


### Naive Bayes

In [166]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.8097826086956522


### Logistic Regression

In [167]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
logreg = LogisticRegression()

scaler = StandardScaler()
scaler.fit(X_train)

X_train= scaler.transform(X_train)
X_test = scaler.transform(X_test)

logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_pred))

Accuracy: 0.8097826086956522


In [175]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

RMSE: 0.494535


In [168]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

formula = 'Loan_Status ~ Gender_Female + Gender_Male + Married_No + Married_Yes + Education_Graduate + Education_Not_Graduate + Self_Employed_No + Self_Employed_Yes + TotalIncomeLog + LoanAmountLog + Credit_History'

model = smf.glm(formula = formula, data=features, family=sm.families.Binomial())
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:            Loan_Status   No. Observations:                  611
Model:                            GLM   Df Residuals:                      603
Model Family:                Binomial   Df Model:                            7
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -288.04
Date:                Thu, 14 Jul 2022   Deviance:                       576.08
Time:                        19:38:35   Pearson chi2:                     619.
No. Iterations:                     5   Pseudo R-squ. (CS):             0.2607
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0

### SVM

In [169]:
from sklearn import svm
clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_pred))

Accuracy: 0.7989130434782609


### PCA And LinReg

In [170]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                           ('pca', PCA(n_components=4)),
                           ('classifier', LogisticRegression())])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {acc}')

Test set accuracy: 0.6902173913043478


In [171]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest

feature_union = FeatureUnion([('pca', PCA()), 
                              ('select_best', SelectKBest())])

pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                           ('features', feature_union),
                           ('classifier', RidgeClassifier())])

# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'classifier__alpha': [0.001, 0.01, 0.1], 
              'features__pca__n_components': [3, 5],
              'features__select_best__k': [1, 3, 6]}
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy: {best_acc}\nAchieved with hyperparameters: {best_hyperparams}')

Best test set accuracy: 0.8097826086956522
Achieved with hyperparameters: {'classifier__alpha': 0.001, 'features__pca__n_components': 3, 'features__select_best__k': 1}


### Decision Trees

In [172]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_pred))

Accuracy: 0.6902173913043478


### XGBoost

In [218]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)
y_pred  = xg_reg.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.459382


In [220]:
importance =xg_reg.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.00000
Feature: 1, Score: 0.00000
Feature: 2, Score: 0.00030
Feature: 3, Score: 0.01222
Feature: 4, Score: 0.00792
Feature: 5, Score: 0.00000
Feature: 6, Score: 0.00000
Feature: 7, Score: 0.00000
Feature: 8, Score: 0.00419
Feature: 9, Score: 0.00000
Feature: 10, Score: 0.00000
Feature: 11, Score: 0.01604
Feature: 12, Score: 0.01449
Feature: 13, Score: 0.94484


In [217]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

print((cv_results["test-rmse-mean"]).tail(1))


49    0.40413
Name: test-rmse-mean, dtype: float64


In [204]:
# import XGBClassifier
from xgboost import XGBClassifier


# declare parameters
params = {
            'objective':'binary:logistic',
            'max_depth': 4,
            'alpha': 10,
            'learning_rate': 1.0,
            'n_estimators':100
        }         
           
          
# instantiate the classifier 
xgb_clf = XGBClassifier(**params)


# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)

In [188]:
y_pred = xgb_clf.predict(X_test)
print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

XGBoost model accuracy score: 0.8043


In [205]:
from sklearn.model_selection import KFold, GridSearchCV
search_space = [
  {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': range(3, 10),
    'colsample_bytree': [i/10.0 for i in range(1, 3)],
    'gamma': [i/10.0 for i in range(3)],
    'alpha': [0, 2, 4, 6, 8, 10, 12]
  }
]

kfold = KFold(n_splits=10)

In [206]:
clf = GridSearchCV(xgb_clf, scoring='accuracy', param_grid=search_space, verbose=1, cv=kfold, n_jobs=5)
clf.fit(X_train, y_train)

Fitting 10 folds for each of 4704 candidates, totalling 47040 fits


In [208]:
print(clf.best_params_)

{'alpha': 0, 'colsample_bytree': 0.1, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}


In [212]:
# import XGBClassifier
from xgboost import XGBClassifier


# declare parameters
params = {
            'objective':'binary:logistic',
            'max_depth': 3,
            'alpha': 0,
            'colsample_bytree': 0.1,
            'gamma': 0.0,
            'learning_rate': 0.1,
            'n_estimators':150
        }         
           
          
# instantiate the classifier 
xgb_clf = XGBClassifier(**params)


# fit the classifier to the training data
model = xgb_clf.fit(X_train, y_train)

In [215]:
y_pred = model.predict(X_test)
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))

Accuracy: 0.8043478260869565


In [222]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                           ('pca', PCA(n_components=3)),
                           ('classifier', LogisticRegression())])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {acc}')

Test set accuracy: 0.6847826086956522
