In [1]:
#importing the necessary library
import pandas as pd

In [2]:
# loading the train and test dataset
path = 'https://raw.githubusercontent.com/subashgandyer/datasets/main/loan_train.csv'

In [3]:
# Now reading the data
data = pd.read_csv(path)

In [4]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# NOW CLEANING UP THE DATASET

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
#Taking care of missing values
# since there are nan values we have to handle each accordingly
data['Gender'].mode() #since the mode is male, we fill the nan values with male
data['Gender'].fillna('male',inplace=True)

# FOR Married
data['Married'].mode()
data['Married'].fillna('Yes',inplace=True)

# FOR Self_Employed
data['Self_Employed'].mode()
data['Self_Employed'].fillna('Yes',inplace=True)

data['Dependents'] = data['Dependents'].replace(['3+'],3)
data['Dependents'].fillna(0,inplace=True)
data['Dependents'].fillna(0,inplace=True)

# noow filling all missing numerical values 
data.fillna(data.mean(),inplace=True)

#The above approach is choosen because the missing values are less than 5%

In [8]:
# comfirming all missing values are taken care of
data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# NOW CONVERTING ALL CATEGORICAL DATA TO NUMERICAL FOR THE DATASET

In [9]:
# COVERTING CATEGORICAL DATA IN DATASET
# GENDER, MARIED, EDUCATION,SELF-EMPLOYED - I will employ label encoder because it has only two categories
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()

In [10]:
#FOR GENDER
data.Gender = le.fit_transform(data.Gender)

#FOR Married
data.Married = le.fit_transform(data.Married)

#FOR Self_Employed
data.Self_Employed = le.fit_transform(data.Self_Employed)

#FOR Education
data.Education = le.fit_transform(data.Education)
                                  
#FOR Loan_Status
data.Loan_Status = le.fit_transform(data.Loan_Status)

In [11]:
#NOW SINCE PROPERTTY_AREA FEATURE HAS HAS MORE THAN TWO CATEGORIES, I EMPLOY ONEHOTENCODER
ohe = OneHotEncoder()
area = ohe.fit_transform(data[['Property_Area']]).toarray()
a_frame = pd.DataFrame(area)


# creating a new dataframe to include the transformed data
data_new = pd.concat([data,a_frame],axis=1)

In [12]:
# CONFIRMING THAT TEST DATASET IS 100% PROCESSED
data_new

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,0,1,2
0,LP001002,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,Urban,1,0.0,0.0,1.0
1,LP001003,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,Rural,0,1.0,0.0,0.0
2,LP001005,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
3,LP001006,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
4,LP001008,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,Rural,1,1.0,0.0,0.0
610,LP002979,1,1,3,0,0,4106,0.0,40.000000,180.0,1.0,Rural,1,1.0,0.0,0.0
611,LP002983,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
612,LP002984,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,Urban,1,0.0,0.0,1.0


# NOW SPLITTING DATA INTO DEPENDENT AND INDEPENDENT VARIABLES

In [13]:
X = data_new.loc[:,['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History',0,1,2]]
y = data['Loan_Status']

In [14]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,0,1,2
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,0.0,0.0,1.0
1,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,1.0,0.0,0.0
2,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,0.0,0.0,1.0
3,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,0.0,0.0,1.0
4,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,1.0,0.0,0.0
610,1,1,3,0,0,4106,0.0,40.000000,180.0,1.0,1.0,0.0,0.0
611,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,0.0,0.0,1.0
612,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,0.0,0.0,1.0


In [15]:
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64

### Split the dataset into train and test data

In [16]:
# importing model selection from skit learn
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(X, y,train_size=0.5,test_size=0.5,random_state=123) 

# NOW USING THE DECISION TREE ML ALGORITHM 

In [17]:
#importing the decision tree classification model
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='gini',max_depth=None)

In [18]:
tree.fit(train_X,train_y)

DecisionTreeClassifier()

In [19]:
prediction = tree.predict(test_X)

In [20]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_y,prediction))

[[ 53  50]
 [ 41 163]]


# Now Using GridSearchCV for finding the best model with the best hyperparameters

In [21]:
param_dist = {'criterion': ['gini','entropy'], 'max_depth': [1,2,3,4,5,6,7,None]} 

In [22]:
from sklearn.model_selection import GridSearchCV 

In [23]:
#Creating Parameter Grid
grid = GridSearchCV(tree, param_grid=param_dist, cv=10, n_jobs=-1) 

In [24]:
#Running gridSearchVC
grid.fit(train_X,train_y)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, None]})

In [25]:
grid.best_params_  # Parameter setting that gave the best results

{'criterion': 'gini', 'max_depth': 1}

In [26]:
grid.best_estimator_  # Best hyperparameter accuracy for the algorithm algorithm

DecisionTreeClassifier(max_depth=1)

In [27]:
grid_predictions = grid.predict(test_X)

In [28]:
#best algorithm accuracy
print(confusion_matrix(test_y,grid_predictions),'\n with accuracy ',grid.best_score_)

[[ 43  60]
 [  4 200]] 
 with accuracy  0.8273118279569893


# NOW USING THE KNN ML ALGORITHM 

In [29]:
# Now looping through the preprocessing techniques with KNN
from sklearn.neighbors import KNeighborsClassifier

Knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [30]:
Knn.fit(train_X,train_y)

KNeighborsClassifier()

In [31]:
K_prediction = Knn.predict(test_X)

In [32]:
print(confusion_matrix(test_y,K_prediction))

[[ 12  91]
 [ 29 175]]


# Now Using GridSearchCV for finding the best model with the best hyperparameters

In [33]:
param_distK = {'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']} 

In [34]:
#Creating Parameter Grid
K_grid = GridSearchCV(KNeighborsClassifier(), param_distK, verbose = 1, cv=3, n_jobs = -1)

In [35]:
#Running gridSearchVC
K_grid.fit(train_X,train_y)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:    0.5s finished


GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['minkowski', 'euclidean', 'manhattan'],
                         'n_neighbors': [5, 7, 9, 11, 13, 15],
                         'weights': ['uniform', 'distance']},
             verbose=1)

In [36]:
K_grid.best_params_  # Parameter setting that gave the best results

{'metric': 'minkowski', 'n_neighbors': 11, 'weights': 'uniform'}

In [37]:
K_grid.best_estimator_  # Best hyperparameter accuracy for the algorithm algorithm

KNeighborsClassifier(n_neighbors=11)

In [38]:
grid_predictionsK = K_grid.predict(test_X)

In [39]:
print(confusion_matrix(test_y,grid_predictionsK),'\n with accuracy ',K_grid.best_score_)

[[ 10  93]
 [ 18 186]] 
 with accuracy  0.7426867187004252


# NOW USING THE LOGISTIC REGRESSION ML ALGORITHM 

In [40]:
# Now looping through the preprocessing techniques with KNN
import numpy as np
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression()

In [41]:
logReg.fit(train_X,train_y)

LogisticRegression()

In [42]:
Reg_prediction = logReg.predict(test_X)

In [43]:
print(confusion_matrix(test_y,Reg_prediction))

[[ 44  59]
 [  4 200]]


# Now Using GridSearchCV for finding the best model with the best hyperparameters

In [44]:
param_dist_reg = {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }

In [45]:
#Creating Parameter Grid
reg_grid = GridSearchCV(logReg, param_grid = param_dist_reg, verbose = 1, cv=3, n_jobs = -1)

In [46]:
#Running gridSearchVC
reg_grid.fit(train_X,train_y)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 2680 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 3808 tasks      | elapsed:   51.3s
[Parallel(n_jobs=-1)]: Done 4793 out of 4800 | elapsed:  1.1min remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 4800 out of 4800 | elapsed:  1.1min finished


GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                         'max_iter': [100, 1000, 2500, 5000],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag',
                                    'saga']},
             verbose=1)

In [47]:
reg_grid.best_params_  # Parameter setting that gave the best results

{'C': 0.0001, 'max_iter': 1000, 'penalty': 'none', 'solver': 'lbfgs'}

In [48]:
reg_grid.best_estimator_  # Best hyperparameter accuracy for the algorithm algorithm

LogisticRegression(C=0.0001, max_iter=1000, penalty='none')

In [49]:
grid_predictionsReg = reg_grid.predict(test_X)

In [50]:
print(confusion_matrix(test_y,grid_predictionsReg),'\n with accuracy ',reg_grid.best_score_)

[[ 45  58]
 [  4 200]] 
 with accuracy  0.824068786090488


# NOW USING THE SVM ML ALGORITHM 

In [51]:
from sklearn.svm import SVC
svm_model = SVC(C= 1, gamma= 0.01, kernel= 'rbf', random_state=42)

In [52]:
svm_model.fit(train_X,train_y)

SVC(C=1, gamma=0.01, random_state=42)

In [53]:
prediction_svm = svm_model.predict(test_X)

In [54]:
print(confusion_matrix(test_y,prediction_svm))

[[  0 103]
 [  0 204]]


# Now Using GridSearchCV for finding the best model with the best hyperparameters

In [57]:
param_grid_svm = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [58]:
#Creating Parameter Grid
grid_svm = GridSearchCV(SVC(),param_grid_svm,refit=True,verbose=3)

In [59]:
#Running gridSearchVC
grid_svm.fit(train_X,train_y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.710, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.710, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.721, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.705, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.705, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.710, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.705, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.705, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.710, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.710, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.721, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.705, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.705, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] .

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:    1.0s finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [60]:
grid_svm.best_params_  # Parameter setting that gave the best results

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

In [61]:
grid_svm.best_estimator_  # Best hyperparameter accuracy for the algorithm algorithm

SVC(C=100, gamma=0.001)

In [62]:
grid_prediction_svm = grid.predict(test_X)

In [63]:
print(confusion_matrix(test_y,grid_prediction_svm),'\n with accuracy ',grid_svm.best_score_)

[[ 43  60]
 [  4 200]] 
 with accuracy  0.713379164463247


# NOW USING THE RANDOM FOREST ML ALGORITHM 

In [64]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state =0, n_estimators=100)

In [65]:
rf_model.fit(train_X,train_y)

RandomForestClassifier(random_state=0)

In [66]:
prediction_rf = rf_model.predict(test_X)

In [67]:
print(confusion_matrix(test_y,prediction_rf))

[[ 43  60]
 [ 11 193]]


# Now Using GridSearchCV for finding the best model with the best hyperparameters

In [74]:
param_grid_rf = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],

 'n_estimators': [200, 400, 600, 800, 1000]}

In [75]:
#Creating Parameter Grid
grid_rf = GridSearchCV(estimator = rf_model, param_grid = param_grid_rf,  cv = 3, verbose=2)

In [76]:
#Running gridSearchVC
grid_rf.fit(train_X,train_y)

Fitting 3 folds for each of 110 candidates, totalling 330 fits
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... bootstrap=True, max_depth=10, n_estimators=200, total=   0.3s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ... bootstrap=True, max_depth=10, n_estimators=200, total=   0.2s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=200, total=   0.2s
[CV] bootstrap=True, max_depth=10, n_estimators=400 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=400, total=   0.4s
[CV] bootstrap=True, max_depth=10, n_estimators=400 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=400, total=   0.4s
[CV] bootstrap=True, max_depth=10, n_estimators=400 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=400, total=   0.5s
[CV] bootstrap=True, max_depth=10, n_estimators=600 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=600, total=   0.7s
[CV] bootstrap=True, max_depth=10, n_estimators=600 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=600, total=   0.6s
[CV] bootstrap=True, max_depth=10, n_estimators=600 ..................
[CV] .

[Parallel(n_jobs=1)]: Done 330 out of 330 | elapsed:  3.6min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0),
             param_grid={'bootstrap': [True, False],
                         'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                                       None],
                         'n_estimators': [200, 400, 600, 800, 1000]},
             verbose=2)

In [77]:
grid_rf.best_params_  # Parameter setting that gave the best results

{'bootstrap': True, 'max_depth': 20, 'n_estimators': 200}

In [78]:
grid_rf.best_estimator_  # Best hyperparameter accuracy for the algorithm algorithm

RandomForestClassifier(max_depth=20, n_estimators=200, random_state=0)

In [97]:
grid_prediction_rf = grid_rf.predict(test_X)

In [98]:
print(confusion_matrix(test_y,grid_prediction_rf),'\n with accuracy ',grid_rf.best_score_)

[[ 43  60]
 [  8 196]] 
 with accuracy  0.820864268037312


# NOW USING My OWN CHOICE ALGORITHM (THE SDGCLASSIFIER) ML ALGORITHM 

In [99]:
from sklearn.linear_model import SGDClassifier

SGD = SGDClassifier(random_state=0, class_weight='balanced')

In [100]:
SGD.fit(train_X,train_y)

SGDClassifier(class_weight='balanced', random_state=0)

In [101]:
prediction_SDG = SGD.predict(test_X)

In [102]:
print(confusion_matrix(test_y,prediction_SDG))

[[100   3]
 [201   3]]


# Now Using GridSearchCV for finding the best model with the best hyperparameters

In [103]:
param_grid_SDG = {'alpha': [10 ** x for x in range(-6, 1)],
    'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 1],}

In [104]:
#Creating Parameter Grid
grid_SDG = GridSearchCV(estimator=SGD, param_grid=param_grid_SDG,
                                    n_jobs=-1, scoring='roc_auc')

In [105]:
#Running gridSearchVC
grid_SDG.fit(train_X,train_y)

GridSearchCV(estimator=SGDClassifier(class_weight='balanced', random_state=0),
             n_jobs=-1,
             param_grid={'alpha': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                         'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95,
                                      1]},
             scoring='roc_auc')

In [106]:
grid_SDG.best_params_  # Parameter setting that gave the best results

{'alpha': 1e-06, 'l1_ratio': 0}

In [107]:
grid_SDG.best_estimator_  # Best hyperparameter accuracy for the algorithm algorithm

SGDClassifier(alpha=1e-06, class_weight='balanced', l1_ratio=0, random_state=0)

In [108]:
grid_prediction_SDG = grid_SDG.predict(test_X)

In [109]:
print(confusion_matrix(test_y,grid_prediction_SDG),'\n with accuracy ',grid_SDG.best_score_)

[[ 91  12]
 [171  33]] 
 with accuracy  0.5631803672843345
