In [45]:
# Import librabries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator,TransformerMixin
import joblib

In [2]:
# Load dataset
dataset = pd.read_csv('loan_approval_dataset.csv')
dataset.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
# check shape of dataset
dataset.shape

(4269, 13)

In [5]:
# check number of unique from each column
dataset.nunique()

Unnamed: 0,0
loan_id,4269
no_of_dependents,6
education,2
self_employed,2
income_annum,98
loan_amount,378
loan_term,10
cibil_score,601
residential_assets_value,278
commercial_assets_value,188


In [16]:
# check missing value
dataset.isnull().sum()

Unnamed: 0,0
no_of_dependents,0
education,0
self_employed,0
income_annum,0
loan_amount,0
loan_term,0
cibil_score,0
residential_assets_value,0
commercial_assets_value,0
luxury_assets_value,0


In [7]:
# check data duplicated
dataset[dataset.duplicated()]

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status


In [8]:
# check dataset info
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


we can see there are withe space at front of column name

In [9]:
# drop column loan_id
dataset.drop(columns=['loan_id'], axis=1, inplace=True)

In [10]:
dataset.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [11]:
# Delete white space from front of each column
dataset.columns = [col.strip() for col in dataset.columns]
dataset.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [13]:
# split feature and target
X = dataset.drop(columns=['loan_status'], axis=1)
y = dataset['loan_status']

In [14]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000


In [15]:
y.head()

Unnamed: 0,loan_status
0,Approved
1,Rejected
2,Rejected
3,Rejected
4,Rejected


In [17]:
# preprocessing as per the domain knowledge
X['total_assets_value'] =  X['residential_assets_value'] + X['commercial_assets_value'] + X['luxury_assets_value'] + X['bank_asset_value']

# drop the Columns
X.drop(columns=['residential_assets_value','commercial_assets_value', 'luxury_assets_value','bank_asset_value' ],inplace=True)

In [18]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,Graduate,No,9600000,29900000,12,778,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,17000000
2,3,Graduate,No,9100000,29700000,20,506,57700000
3,3,Graduate,No,8200000,30700000,8,467,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,55000000


In [19]:
X['education'].unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [20]:
X['self_employed'].unique()

array([' No', ' Yes'], dtype=object)

In [21]:
y.unique()

array([' Approved', ' Rejected'], dtype=object)

There are white space at front of category in column category

In [22]:
# Delete white space
X['education'] = X['education'].str.strip()
X['self_employed'] = X['self_employed'].str.strip()
y = y.str.strip()

In [23]:
# Create function for separate column numeric and categoric
def separate_column_type(df):
  numerical_columns = []
  categorical_columns = []

  for col in df.columns:
    if df[col].dtype == "object":
      categorical_columns.append(col)

    else:
      numerical_columns.append(col)

  return numerical_columns, categorical_columns

In [24]:
num_cols, cat_cols = separate_column_type(X)
print(f'Numeric Columns : {num_cols}')
print(f'Categorical Columns : {cat_cols}')

Numeric Columns : ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'total_assets_value']
Categorical Columns : ['education', 'self_employed']


In [25]:
# Make funtion for Transformation to Binary
def transform_categorical_to_binary(df, columns_to_transform):
  """
  Transforms specified categorical columns in a DataFrame to binary (0 and 1).

  Args:
    df: The DataFrame to modify.
    columns_to_transform: A dictionary where keys are column names and values are
                          lists of the original categorical values to map to 1.

  Returns:
    The modified DataFrame with transformed columns.
  """

  for column_name, positive_values in columns_to_transform.items():
    df[column_name] = df[column_name].apply(lambda x: 1 if x in positive_values else 0)

  return df

In [26]:
columns_to_transform = {
  'education': ['Graduate'],
  'self_employed': ['Yes']
}

In [27]:
transformed_X = transform_categorical_to_binary(X, columns_to_transform)
transformed_X

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,9600000,29900000,12,778,50700000
1,0,0,1,4100000,12200000,8,417,17000000
2,3,1,0,9100000,29700000,20,506,57700000
3,3,1,0,8200000,30700000,8,467,52700000
4,5,0,1,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,7400000
4265,0,0,1,3300000,11300000,20,559,20000000
4266,2,0,0,6500000,23900000,18,457,39000000
4267,1,0,0,4100000,12800000,8,780,28800000


In [30]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,9600000,29900000,12,778,50700000
1,0,0,1,4100000,12200000,8,417,17000000
2,3,1,0,9100000,29700000,20,506,57700000
3,3,1,0,8200000,30700000,8,467,52700000
4,5,0,1,9800000,24200000,20,382,55000000


In [31]:
# log transformation for range each numeric colomn not far
# this is scailing with manual
log_cols = ['income_annum','loan_amount','total_assets_value']
X[log_cols] = np.log(X[log_cols])
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,16.077274,17.213369,12,778,17.741436
1,0,0,1,15.226498,16.316947,8,417,16.648724
2,3,1,0,16.023785,17.206658,20,506,17.870768
3,3,1,0,15.919645,17.239773,8,467,17.780126
4,5,0,1,16.097893,17.001863,20,382,17.822844


In [32]:
y = y.map({'Approved':1, 'Rejected':0})
y.head()

Unnamed: 0,loan_status
0,1
1,0
2,0
3,0
4,0


In [34]:
# Split X and y become X and y train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [36]:
# Build model Logistic Regression
model_log = LogisticRegression()
model_log.fit(X_train, y_train)
y_pred_test = model_log.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
acc = accuracy_score(y_test, y_pred_test)
print(f'Accuracy : {acc}')

Accuracy : 0.9047619047619048


**Reminder : this is just a simple model because in this project just focus in MLOps not in create model experiment**

# Serialization and Deserialization

- Serialization is the process to save model in form file ready to use
- Deserialization is the process that model has saved to load

In [41]:
# serialization
joblib.dump(model_log, "classification.pkl")

['classification.pkl']

In [42]:
# deserialization
final_model = joblib.load('classification.pkl')

In [43]:
final_model.intercept_, final_model.coef_

(array([-3.69980297]),
 array([[-0.02818568, -0.007182  ,  0.05058583, -3.27083322,  2.36322445,
         -0.15518387,  0.02366706,  0.25166542]]))

In [44]:
model_log.intercept_, model_log.coef_

(array([-3.69980297]),
 array([[-0.02818568, -0.007182  ,  0.05058583, -3.27083322,  2.36322445,
         -0.15518387,  0.02366706,  0.25166542]]))

# Create Custom Data Transformers

In [46]:
class AddColumns(BaseEstimator, TransformerMixin):
  def __init__(self, column_transform):
    self.column_transform = column_transform
    self.new_column = "new_column"

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    X[self.new_column] = X[self.column_transform].sum(axis=1)
    return X