In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw loan applications; the path is relative to the working directory.
df = pd.read_csv("loan_application.csv")

In [3]:
df.head()

Unnamed: 0,Application_ID,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
0,LP001002,Male,No,0,Graduate,No,1,Urban,medium,Y
1,LP001003,Male,Yes,1,Graduate,No,1,Rural,medium,N
2,LP001005,Male,Yes,0,Graduate,Yes,1,Urban,low,Y
3,LP001006,Male,Yes,0,Not Graduate,No,1,Urban,low,Y
4,LP001008,Male,No,0,Graduate,No,1,Urban,medium,Y


In [4]:
def relevant_data(data_df):
    """
    Strip the identifier column so that only modelling columns remain.

    :param data_df: raw pandas data frame containing an 'Application_ID' column
    :return: new pandas data frame without 'Application_ID'; the input frame
        is left unmodified
    """
    trimmed_df = data_df.drop(columns=['Application_ID'])
    return trimmed_df
def cat2int(data_df):
    """
    Converts categorical values into discrete numeric values.

    Encodings applied (any value not listed falls into the last bucket):
      - Dependents:    '3+' -> 4, otherwise int(value)
      - Gender:        'Female' -> 0, otherwise 1
      - Education:     'Not Graduate' -> 0, otherwise 1
      - Married:       'No' -> 0, otherwise 1
      - Property_Area: 'Urban' -> 0, 'Semiurban' -> 1, otherwise 2
      - Income:        'low' -> 0, 'medium' -> 1, otherwise 2
      - Self_Employed: 'No' -> 0, otherwise 1

    :param data_df: raw data frame containing the columns listed above
    :return: new data frame with the categorical columns converted to
        numerics; the input frame is not modified
    :raises ValueError: if a 'Dependents' value other than '3+' cannot be
        converted with int()
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    encoded_df = data_df.copy()
    encoded_df['Dependents'] = encoded_df['Dependents'].map(
        lambda x: 4 if x == '3+' else int(x))
    # Gender holds 'Male'/'Female'; comparing against 'No' never matched and
    # encoded every row as 1, making the feature constant.
    encoded_df['Gender'] = encoded_df['Gender'].map(
        lambda x: 0 if x == 'Female' else 1)
    encoded_df['Education'] = encoded_df['Education'].map(
        lambda x: 0 if x == 'Not Graduate' else 1)
    encoded_df['Married'] = encoded_df['Married'].map(
        lambda x: 0 if x == 'No' else 1)
    encoded_df['Property_Area'] = encoded_df['Property_Area'].map(
        lambda x: 0 if x == 'Urban' else 1 if x == 'Semiurban' else 2)
    encoded_df['Income'] = encoded_df['Income'].map(
        lambda x: 0 if x == 'low' else 1 if x == 'medium' else 2)
    encoded_df['Self_Employed'] = encoded_df['Self_Employed'].map(
        lambda x: 0 if x == 'No' else 1)
    return encoded_df
def get_x_y(data_df):
    """
    Separate the predictor columns from the target column.

    :param data_df: data frame that includes an 'Application_Status' column
    :return: tuple (X, y) -- X is a data frame holding every column except
        'Application_Status', y is the 'Application_Status' column as a
        pandas Series
    """
    target_name = 'Application_Status'
    y = data_df[target_name]
    X = data_df.drop(columns=[target_name])
    return X, y

In [5]:
# Drop the identifier column, then encode the categorical columns as integers.
df = cat2int(relevant_data(df))

In [6]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
0,1,0,0,1,0,1,0,1,Y
1,1,1,1,1,0,1,2,1,N
2,1,1,0,1,1,1,0,0,Y
3,1,1,0,0,0,1,0,0,Y
4,1,0,0,1,0,1,0,1,Y


In [8]:
# Keep only the features used for modelling. Reassigning the result instead of
# passing inplace=True avoids mutating the frame in place and keeps the
# operation chainable.
df = df.drop(columns=["Gender", "Dependents", "Self_Employed", "Property_Area"])

In [9]:
df.head()

Unnamed: 0,Married,Education,Credit_History,Income,Application_Status
0,0,1,1,1,Y
1,1,1,1,1,N
2,1,1,1,0,Y
3,1,0,1,0,Y
4,0,1,1,1,Y


In [10]:
# Encode the target: "Y" -> 1, everything else -> 0.
# Values that are already 1 stay 1, so re-running this cell does not silently
# turn every label into 0.
df["Application_Status"] = df["Application_Status"].map(
    lambda x: 1 if x in ("Y", 1) else 0)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   Married             511 non-null    int64
 1   Education           511 non-null    int64
 2   Credit_History      511 non-null    int64
 3   Income              511 non-null    int64
 4   Application_Status  511 non-null    int64
dtypes: int64(5)
memory usage: 20.1 KB


In [12]:
# Split predictors from the target. get_x_y looks up the 'Application_Status'
# column by name, so this must run before the cell that renames that column
# to 'target_col'.
X, y = get_x_y(df)

In [13]:
X.head()

Unnamed: 0,Married,Education,Credit_History,Income
0,0,1,1,1
1,1,1,1,1
2,1,1,1,0
3,1,0,1,0
4,0,1,1,1


In [18]:
df.columns

Index(['Married', 'Education', 'Credit_History', 'Income',
       'Application_Status'],
      dtype='object')

In [19]:
# Rename only the target column, by name. Assigning a full positional list to
# df.columns would mislabel data if the column order ever changed, and would
# raise if the column count changed.
df = df.rename(columns={"Application_Status": "target_col"})

In [20]:
df.head()

Unnamed: 0,Married,Education,Credit_History,Income,target_col
0,0,1,1,1,1
1,1,1,1,1,0
2,1,1,1,0,1
3,1,0,1,0,1
4,0,1,1,1,1


In [21]:
import sys
# NOTE(review): hard-coded absolute path to a machine-specific directory; the
# notebook will not run elsewhere unless `clas` is importable some other way.
sys.path.append("/home/harshal/greyatom/functions/")
# `clas` is a helper module that is not shown in this notebook.
from clas import clas

In [22]:
# Run the external `clas` helper on the prepared frame. Judging by its printed
# output it reports metrics for several classifiers, but its implementation is
# not visible here -- TODO confirm how it splits the data and which column it
# treats as the target.
clas(df)

 
LogisticRegression
 
              precision    recall  f1-score   support

           0       0.91      0.49      0.64        43
           1       0.83      0.98      0.90       111

    accuracy                           0.84       154
   macro avg       0.87      0.74      0.77       154
weighted avg       0.85      0.84      0.83       154

--------------------------------------------------
 
RandomForestClassifier
 
              precision    recall  f1-score   support

           0       0.88      0.51      0.65        43
           1       0.84      0.97      0.90       111

    accuracy                           0.84       154
   macro avg       0.86      0.74      0.77       154
weighted avg       0.85      0.84      0.83       154

--------------------------------------------------
 
DecisionTreeClassifier
 
              precision    recall  f1-score   support

           0       0.88      0.51      0.65        43
           1       0.84      0.97      0.90       111

   

Unnamed: 0,LogisticRegression,RandomForestClassifier,DecisionTreeClassifier,ExtraTreeClassifier,ExtraTreesClassifier,GradientBoostingClassifier
precision,0.83,0.84,0.84,0.84,0.84,0.84
recall,0.98,0.97,0.97,0.97,0.97,0.97


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [24]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is
# reproducible. No `stratify` argument is passed, so class proportions in the
# two splits are not forced to match.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [25]:
# Re-attach the target to the training predictors (column-wise) so the split
# can be exported as one table.
train = pd.concat([X_train,y_train], axis = 1)

In [26]:
train.head()

Unnamed: 0,Married,Education,Credit_History,Income,Application_Status
485,1,1,1,0,1
310,1,1,1,1,1
297,1,1,1,0,0
71,0,1,1,0,1
170,0,1,0,1,1


In [27]:
# Re-attach the target to the test predictors (column-wise) so the split can
# be exported as one table.
test = pd.concat([X_test, y_test],axis = 1)

In [28]:
# Persist both splits to the working directory. With the default index=True
# the row index is also written, as an unnamed first column.
# NOTE(review): confirm downstream readers expect that extra column.
train.to_csv("train.csv")
test.to_csv("test.csv")

In [29]:
# Fit a logistic-regression model on the training split (fit returns the
# estimator itself), then predict labels for the held-out rows.
model = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [30]:
from sklearn.metrics import classification_report

In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.41      0.56        37
           1       0.74      0.97      0.84        66

    accuracy                           0.77       103
   macro avg       0.81      0.69      0.70       103
weighted avg       0.79      0.77      0.74       103



In [33]:
import pickle 
# Serialize the fitted model to disk. The context manager closes the file
# even if pickle.dump raises, so the handle cannot leak.
with open("model.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)