In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
dataset=pd.read_csv("loan_approval_dataset.csv")
dataset.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
dataset.shape

(4269, 13)

In [4]:
# check for the unique value (`nunique` gives the number of unique values for each column)
dataset.nunique()

loan_id                      4269
 no_of_dependents               6
 education                      2
 self_employed                  2
 income_annum                  98
 loan_amount                  378
 loan_term                     10
 cibil_score                  601
 residential_assets_value     278
 commercial_assets_value      188
 luxury_assets_value          379
 bank_asset_value             146
 loan_status                    2
dtype: int64

In [5]:
# Check for duplicates
dataset.duplicated().sum()

np.int64(0)

In [6]:
dataset.drop('loan_id', axis=1, inplace=True)

In [7]:
dataset.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

> As we can see there are white spaces to all the column names except for the `loan_id` column. So Let's go ahead and remove those white spaces

In [8]:
# Remove the white spaces
dataset.columns= [c.strip() for c in dataset.columns]
dataset.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [9]:
# Split the data into features and labels
X=dataset.drop('loan_status', axis=1)
y=dataset['loan_status']

In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   no_of_dependents          4269 non-null   int64 
 1   education                 4269 non-null   object
 2   self_employed             4269 non-null   object
 3   income_annum              4269 non-null   int64 
 4   loan_amount               4269 non-null   int64 
 5   loan_term                 4269 non-null   int64 
 6   cibil_score               4269 non-null   int64 
 7   residential_assets_value  4269 non-null   int64 
 8   commercial_assets_value   4269 non-null   int64 
 9   luxury_assets_value       4269 non-null   int64 
 10  bank_asset_value          4269 non-null   int64 
dtypes: int64(9), object(2)
memory usage: 367.0+ KB


In [11]:
# Apply the domain knowledge as part of preprocessing
# Combine the resendential_asset_value, commercial_asset_value, luxury_asset_value and bank_asset_value into a single column called total_assets_value as part of the business requirement

X['total_assets_value']=X['residential_assets_value']+ X['commercial_assets_value']+X['luxury_assets_value']+X['bank_asset_value']



In [12]:
X['total_assets_value']

0       50700000
1       17000000
2       57700000
3       52700000
4       55000000
          ...   
4264     7400000
4265    20000000
4266    39000000
4267    28800000
4268    77300000
Name: total_assets_value, Length: 4269, dtype: int64

In [13]:
X.drop(columns=['residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value'], inplace=True)

In [14]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,Graduate,No,9600000,29900000,12,778,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,17000000
2,3,Graduate,No,9100000,29700000,20,506,57700000
3,3,Graduate,No,8200000,30700000,8,467,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,55000000


In [15]:
# Convert non-numerical values into numerical values
X['education'].unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [16]:
X['self_employed'].unique()

array([' No', ' Yes'], dtype=object)

In [17]:
y.unique()

array([' Approved', ' Rejected'], dtype=object)

> As we can see there are white spaces present on the values also so let's go ahead and remove those white spaces

In [18]:
X['education']= X['education'].str.strip()
X['self_employed']= X['self_employed'].str.strip()
y=y.str.strip()

In [19]:
X['self_employed'].unique()

array(['No', 'Yes'], dtype=object)

In [20]:
# Convert non-numerical columns into numerical columns
def transform_categorical_to_binary(df, columns_to_transform):
    """
    Transform specified categorical columns in a DataFrame to binary (0 & 1)

    Args:
       df: The DataFrame to modify
       columns_to_transform: A dictionary where keys are column names and values are
                             list of the original categorical values to map to 1.
    Returns:
        The modified DataFrame with transformed columns.
    """
    for column_name, positive_values in columns_to_transform.items():
        df[column_name]=df[column_name].apply(lambda x: 1 if x in positive_values else 0)
    return df

columns_to_transform={
    'education': ['Graduate'],
    'self_employed': ['Yes']
}

transformed_X = transform_categorical_to_binary(X, columns_to_transform)
transformed_X



Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,9600000,29900000,12,778,50700000
1,0,0,1,4100000,12200000,8,417,17000000
2,3,1,0,9100000,29700000,20,506,57700000
3,3,1,0,8200000,30700000,8,467,52700000
4,5,0,1,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,7400000
4265,0,0,1,3300000,11300000,20,559,20000000
4266,2,0,0,6500000,23900000,18,457,39000000
4267,1,0,0,4100000,12800000,8,780,28800000


In [21]:
# Apply log transformation to scale values
log_cols= ['income_annum', 'loan_amount', 'total_assets_value']
X[log_cols]=np.log(X[log_cols])

In [22]:
y=y.map({"Approved":1, "Rejected":0})

In [23]:
y[:5]

0    1
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [24]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,16.077274,17.213369,12,778,17.741436
1,0,0,1,15.226498,16.316947,8,417,16.648724
2,3,1,0,16.023785,17.206658,20,506,17.870768
3,3,1,0,15.919645,17.239773,8,467,17.780126
4,5,0,1,16.097893,17.001863,20,382,17.822844


In [25]:
y.head()

0    1
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [26]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.3, random_state=0)



In [27]:
# Build the model
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
log.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# make predictions
y_pred_test= log.predict(X_test)


In [29]:
# Check the results
from sklearn.metrics import accuracy_score
acc= accuracy_score(y_test, y_pred_test)
print(f"Accuracy is {acc}")

Accuracy is 0.9047619047619048


## Serialization & Deserialization

In [30]:
import joblib
joblib.dump(log, "loan_trained_model_v1.pkl")

['loan_trained_model_v1.pkl']

In [31]:
# Deserialization
final_model=joblib.load("loan_trained_model_v1.pkl")

In [32]:
final_model.intercept_, final_model.coef_

(array([-3.82022286]),
 array([[-0.04530185, -0.05854611,  0.03226989, -3.3298224 ,  2.41439262,
         -0.15778844,  0.02388047,  0.26190246]]))

In [33]:
log.intercept_, log.coef_

(array([-3.82022286]),
 array([[-0.04530185, -0.05854611,  0.03226989, -3.3298224 ,  2.41439262,
         -0.15778844,  0.02388047,  0.26190246]]))

## Packages and Modules

In [34]:
import PackageA

In [35]:
from PackageA import f1

In [36]:
from PackageA.SubPackageA import f3

In [37]:
f3.print_something()

'output from subPackage A f3'

In [38]:
from PackageA.f1 import print_something as f1p

In [39]:
f1p()

'output from f1'

In [40]:
import sys

In [41]:
sys.path

['/Users/sumi/miniforge3/envs/MLOps/lib/python313.zip',
 '/Users/sumi/miniforge3/envs/MLOps/lib/python3.13',
 '/Users/sumi/miniforge3/envs/MLOps/lib/python3.13/lib-dynload',
 '',
 '/Users/sumi/miniforge3/envs/MLOps/lib/python3.13/site-packages']

In [None]:
# If not in same directory
sys.path.append('path to the directory') # mac or linux for wind c:\\path to dir

In [1]:
import pandas

In [2]:
pandas.__version__

'2.2.3'

In [3]:
import numpy
numpy.__version__

'2.1.2'

In [4]:
import joblib
joblib.__version__

'1.4.2'

In [5]:
import sklearn
sklearn.__version__

'1.5.2'

In [6]:
import scipy
scipy.__version__

'1.14.1'

In [7]:
import setuptools
setuptools.__version__

'75.3.0'

In [8]:
import wheel
wheel.__version__

'0.43.0'