In [104]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder

In [92]:
# Load the data from dataset.csv (path is relative to the working directory)
# into a DataFrame that the following cells inspect and transform.
dataset = pd.read_csv('./dataset.csv')

In [93]:
# Preview the first five rows to check that the file was parsed as expected.
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [94]:
# Print each column's dtype and non-null count; columns with fewer non-null
# entries than the number of rows contain missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [95]:
# Remove the Loan_ID column before modelling.
# errors='ignore' keeps the cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
dataset = dataset.drop(columns='Loan_ID', errors='ignore')

In [96]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,381.0,381.0,381.0,370.0,351.0
mean,3579.845144,1277.275381,104.986877,340.864865,0.837607
std,1419.813818,2340.818114,28.358464,68.549257,0.369338
min,150.0,0.0,9.0,12.0,0.0
25%,2600.0,0.0,90.0,360.0,1.0
50%,3333.0,983.0,110.0,360.0,1.0
75%,4288.0,2016.0,127.0,360.0,1.0
max,9703.0,33837.0,150.0,480.0,1.0


In [97]:
# Count the missing values in every column.
dataset.isna().sum()

Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [98]:
# Drop every row that has at least one missing value; the frame is modified in
# place, so all later cells only see complete rows.
dataset.dropna(axis=0, inplace=True)

In [99]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, every other column as numerical. Column order is preserved.
object_flags = [dataset[column].dtype == 'object' for column in dataset.columns]
categorical_columns = [
    column for column, is_object in zip(dataset.columns, object_flags) if is_object
]
numerical_columns = [
    column for column, is_object in zip(dataset.columns, object_flags) if not is_object
]

In [100]:
# List the distinct values of each categorical column to decide how to encode it.
for column in categorical_columns:
    distinct_values = dataset[column].unique()
    print(f'{column} : {distinct_values}')

Gender : ['Male' 'Female']
Married : ['Yes' 'No']
Dependents : ['1' '0' '2' '3+']
Education : ['Graduate' 'Not Graduate']
Self_Employed : ['No' 'Yes']
Property_Area : ['Rural' 'Urban' 'Semiurban']
Loan_Status : ['N' 'Y']


In [101]:
# Convert the Dependents labels to integers ('3+' is collapsed to 3).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on dataset['Dependents'] acts on a temporary Series, which recent pandas
# versions warn about and which leaves the frame unchanged under Copy-on-Write.
dependents_codes = {'0': 0, '1': 1, '2': 2, '3+': 3}
# Values that are not keys of the mapping (e.g. integers produced by an earlier
# run of this cell) pass through unchanged, so the cell can be re-run safely;
# to_numeric then guarantees a numeric dtype instead of object.
dataset['Dependents'] = pd.to_numeric(
    dataset['Dependents'].map(lambda label: dependents_codes.get(label, label))
)

In [102]:
# Label-encode every categorical column except Dependents (handled separately)
# and keep each fitted encoder, keyed by column name, in encoder_map.
encoder_map = {}
columns_to_encode = [name for name in categorical_columns if name != 'Dependents']
for column in columns_to_encode:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(dataset[column])
    dataset[column] = encoded_values
    encoder_map[column] = encoder

In [103]:
# Separate the frame positionally: every column but the last is a feature,
# the last column is the target. Both are converted to NumPy arrays.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

In [None]:
# Hold out 25% of the rows for testing (the train_test_split default, now explicit).
# random_state makes the split reproducible across kernel restarts, and
# stratify=y keeps the class proportions of the target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Candidate classifiers, keyed by the name that also indexes param_grids.
# The tree-based models are seeded so repeated runs give the same scores.
models = {
    'LogisticRegression' : LogisticRegression(max_iter=1000),
    'KNeighborsClassifier' : KNeighborsClassifier(),
    'SVC' : SVC(),
    'GaussianNB' : GaussianNB(),
    'DecisionTreeClassifier' : DecisionTreeClassifier(random_state=42),
    'RandomForestClassifier' : RandomForestClassifier(random_state=42)
}
# Hyper-parameter grid per model; an empty grid means "evaluate the defaults only".
param_grids = {
    'RandomForestClassifier': {
        'n_estimators': [100, 300, 500, 800, 1000],
        'max_depth': [10, 20, 30, None]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto']
    },
    'LogisticRegression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    'DecisionTreeClassifier': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GaussianNB': {}
}

# Run a 10-fold cross-validated grid search for every model on the training
# split and keep each fitted search, so results are not lost when the loop
# moves on to the next model.
search_results = {}
for model_name, estimator in models.items():
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grids[model_name],
        cv=10,
        n_jobs=-1,
        scoring='accuracy',
    )
    grid_search.fit(X=X_train, y=y_train)
    search_results[model_name] = grid_search

# One row per model: best cross-validation accuracy, accuracy of the refitted
# best estimator on the held-out split, and the winning parameters.
# Compare models on best_cv_accuracy; test_accuracy is reported for reference only.
tuning_summary = (
    pd.DataFrame(
        [
            {
                'model': model_name,
                'best_cv_accuracy': search.best_score_,
                'test_accuracy': search.score(X_test, y_test),
                'best_params': search.best_params_,
            }
            for model_name, search in search_results.items()
        ]
    )
    .sort_values('best_cv_accuracy', ascending=False)
    .reset_index(drop=True)
)
tuning_summary