In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from scipy.stats import randint
from sklearn.metrics import confusion_matrix, accuracy_score
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

In [21]:
df = pd.read_csv("df_imp.csv")

In [98]:
df.head()

Unnamed: 0,Credit_History,LoanAmount,ApplicantIncome,CoapplicantIncome,Loan_Status_cat
0,1,143000,5849,0,1
1,1,128000,4583,1508,0
2,1,66000,3000,0,1
3,1,120000,2583,2358,1
4,1,141000,6000,0,1


In [22]:
X = df.iloc[:,0:4].values
y = df.iloc[:,-1].values

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [13]:
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

In [23]:
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 50)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm, rfc.score(X_test,y_test))

[[20 13]
 [14 76]] 0.7804878048780488


In [24]:
est = RandomForestClassifier(n_jobs=-1)
rf_p_dist={'max_depth':[3,5,10,None],
          'n_estimators':[100,200,300,400,500],
           'max_features':randint(1,11),
           'criterion':['gini','entropy'],
           'bootstrap':[True,False],
           'min_samples_leaf':randint(1,4),
          }

In [25]:
def hypertuning_rscv(est, p_distr, nbr_iter,X,y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,
                              n_jobs=-1, n_iter=nbr_iter, cv=9)
    rdmsearch.fit(X,y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score

In [26]:
rf_parameters, rf_ht_score = hypertuning_rscv(est, rf_p_dist, 40, X, y)

In [106]:
rf_parameters

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 2,
 'min_samples_leaf': 3,
 'n_estimators': 100}

In [107]:
rfc2 = RandomForestClassifier(bootstrap = True, n_estimators = 100, criterion = 'entropy', max_depth = 3, max_features = 2, min_samples_leaf = 3, random_state = 50)
rfc2.fit(X_train, y_train)
y_pred = rfc2.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm, rfc2.score(X_test,y_test))

[[14 19]
 [ 2 88]] 0.8292682926829268


In [109]:
ada = AdaBoostClassifier(n_estimators=300, random_state=42, learning_rate=.1)
ada.fit(X_train, y_train)
y_pred2 = ada.predict(X_test)
cm = confusion_matrix(y_test,y_pred2)
print(cm, ada.score(X_test,y_test))

[[14 19]
 [ 3 87]] 0.8211382113821138


In [110]:
gbc = GradientBoostingClassifier(n_estimators=300, random_state=42, learning_rate=.1)
gbc.fit(X_train,y_train)
y_pred3 = gbc.predict(X_test)
cm = confusion_matrix(y_test,y_pred3)
print(cm, gbc.score(X_test,y_test))

[[18 15]
 [13 77]] 0.7723577235772358


In [113]:
xgc = xgb.XGBClassifier(n_estimators=300, random_state=1, learning_rate=.01)
xgc.fit(X_train,y_train)
y_pred4 = xgc.predict(X_test)
cm = confusion_matrix(y_test,y_pred4)
print(cm, xgc.score(X_test,y_test))



[[14 19]
 [ 6 84]] 0.7967479674796748


In [6]:
lg = LogisticRegression(solver='liblinear')
lg.fit(X_train,y_train)
y_pred = lg.predict(X_test)
lg_accuracy = accuracy_score(y_test,y_pred)
print(lg_accuracy)

0.8292682926829268
