In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from scipy.stats import randint
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [3]:
df = pd.read_csv("train_categorical_cleaned.csv")

In [4]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_cat,Married_cat,Dependents_cat,Education_cat,Self_Employed_cat,Property_Area_cat,Loan_Status_cat
0,5849,0,143000,360,1,2,1,2,0,1,2,1
1,4583,1508,128000,360,1,2,2,3,0,1,0,0
2,3000,0,66000,360,1,2,2,2,0,2,2,1
3,2583,2358,120000,360,1,2,2,2,1,1,2,1
4,6000,0,141000,360,1,2,1,2,0,1,2,1


In [4]:
X = df.iloc[:,0:11].values
y = df.iloc[:,-1].values

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [6]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [8]:
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 50)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm, rfc.score(X_test,y_test))

[[18 15]
 [13 77]] 0.7723577235772358


In [13]:
est = RandomForestClassifier(n_jobs=-1)
rf_p_dist={'max_depth':[3,5,10,None],
          'n_estimators':[100,200,300,400,500],
           'max_features':randint(1,11),
           'criterion':['gini','entropy'],
           'bootstrap':[True,False],
           'min_samples_leaf':randint(1,4),
          }

In [14]:
def hypertuning_rscv(est, p_distr, nbr_iter,X,y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,
                              n_jobs=-1, n_iter=nbr_iter, cv=9)
    rdmsearch.fit(X,y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score

In [15]:
rf_parameters, rf_ht_score = hypertuning_rscv(est, rf_p_dist, 40, X, y)

In [16]:
rf_parameters

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 3,
 'min_samples_leaf': 2,
 'n_estimators': 400}

In [17]:
rfc3 = RandomForestClassifier(bootstrap = True, n_estimators =400, criterion = 'gini', max_depth = 3, max_features = 4, min_samples_leaf = 2, random_state = 50)
rfc3.fit(X_train, y_train)
y_pred = rfc3.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm, rfc3.score(X_test,y_test))

[[14 19]
 [ 2 88]] 0.8292682926829268


In [18]:
ada = AdaBoostClassifier(n_estimators=300, random_state=42, learning_rate=.1)
ada.fit(X_train, y_train)
y_pred2 = ada.predict(X_test)
cm = confusion_matrix(y_test,y_pred2)
print(cm, ada.score(X_test,y_test))

[[15 18]
 [ 5 85]] 0.8130081300813008


In [19]:
gbc = GradientBoostingClassifier(n_estimators=300, random_state=42, learning_rate=.1)
gbc.fit(X_train,y_train)
y_pred3 = gbc.predict(X_test)
cm = confusion_matrix(y_test,y_pred3)
print(cm, gbc.score(X_test,y_test))

[[19 14]
 [10 80]] 0.8048780487804879


In [20]:
xgc = xgb.XGBClassifier(n_estimators=300, random_state=1, learning_rate=.01)
xgc.fit(X_train,y_train)
y_pred4 = xgc.predict(X_test)
cm = confusion_matrix(y_test,y_pred4)
print(cm, xgc.score(X_test,y_test))



[[16 17]
 [ 2 88]] 0.8455284552845529


In [21]:
lg = LogisticRegression(solver='liblinear')
lg.fit(X_train,y_train)
y_pred = lg.predict(X_test)
lg_accuracy = accuracy_score(y_test,y_pred)
print(lg_accuracy)

0.8373983739837398
