# Basic ML Model Deployment

## Import libraries

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

## Fetch Data

In [29]:
# Earlier remote source, kept for reference (presumably the same dataset - verify before switching):
# data=pd.read_csv('https://raw.githubusercontent.com/tkseneee/Dataset/master/Loan_data_ver2.csv')
# Load the local CSV with explicit dtypes. The nullable 'Int64' dtype lets the
# integer columns hold missing values without being upcast to float.
data = pd.read_csv(
    'loan_dataset.csv',
    dtype={
        'Married': str,
        'Education': str,
        'ApplicantIncome': 'Int64',
        'LoanAmount': 'Int64',
        'Credit_History': 'Int64',
    },
)


## Explore Data

In [30]:
# Dataset dimensions as (rows, columns)
data.shape

(614, 6)

In [31]:
# Column names available in the loaded frame (features plus the target)
data.columns

Index(['Married', 'Education', 'ApplicantIncome', 'LoanAmount',
       'Credit_History', 'Loan_Status'],
      dtype='object')

In [32]:
# Confirm the dtypes requested in read_csv were applied
data.dtypes

Married             object
Education           object
ApplicantIncome      Int64
LoanAmount           Int64
Credit_History       Int64
Loan_Status        float64
dtype: object

In [33]:
# Preview the first two rows
data.head(2)

Unnamed: 0,Married,Education,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
0,No,Graduate,5849,,1,0.1
1,Yes,Graduate,4583,128.0,1,0.32


In [34]:
# count the missing values in each column
data.isnull().sum()

Married             3
Education           0
ApplicantIncome     0
LoanAmount         22
Credit_History     50
Loan_Status         0
dtype: int64

Three features — Married, LoanAmount, and Credit_History — have missing values.

In [36]:
# Frequency of each marital status; value_counts() leaves out missing values by default
data['Married'].value_counts()


Married
Yes    398
No     213
Name: count, dtype: int64

In [37]:
# Frequency of each education level
data['Education'].value_counts()

Education
Graduate        449
Not Graduate    127
HSC              38
Name: count, dtype: int64

In [38]:
# Segregate the target vector (y) from the feature matrix (X).
y = data['Loan_Status']
X = data.drop(columns='Loan_Status')

In [39]:
# Split into training and validation sets: 30% of the rows are held out,
# and the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=48,
)

In [40]:
# Names of the numeric feature columns.
feat_num = X.select_dtypes(include=np.number).columns.tolist()
feat_num

['ApplicantIncome', 'LoanAmount', 'Credit_History']

In [41]:
# Names of the categorical (non-numeric) feature columns.
feat_cat = X.select_dtypes(exclude=np.number).columns.tolist()
feat_cat

['Married', 'Education']

## Defining the Data Processing & Modeling Pipeline

In [21]:
# Pipeline for numeric features: missing values are replaced by the column
# mean (SimpleImputer), then the features are standardized with StandardScaler().
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scale', StandardScaler()),
    ]
)

num_pipe

In [22]:
# Pipeline for categorical features: missing values become an explicit
# 'Missing' category, which is then one-hot encoded.
# handle_unknown='ignore' makes the encoder emit all zeros for a category it
# did not see during fit instead of raising. Without it, prediction fails
# whenever a rare level (e.g. 'Missing', which only 3 rows can produce) is
# absent from the training split, or a new level shows up at serving time.
feat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

feat_pipe

In [23]:
# Combine the preprocessing branches: numeric columns go through num_pipe and
# categorical columns through feat_pipe. feat_num and feat_cat together cover
# every column of X, so the 'passthrough' remainder is currently empty.
data_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', num_pipe, feat_num),
        ('categorical', feat_pipe, feat_cat),
    ],
    remainder='passthrough',
)

data_pipeline

In [24]:
# Chain preprocessing and the estimator into a single pipeline, so the object
# that is fitted and saved takes the raw feature columns as input.
# random_state is fixed (same seed as the train/validation split) because the
# forest is randomized; without it every re-run trains a different model.
full_pipe = Pipeline([
    ('pre_process', data_pipeline),
    ('model', RandomForestRegressor(random_state=48)),
])

In [25]:
# training: fits the preprocessing steps and the model on the training split only
full_pipe.fit(X_train,y_train)

In [26]:
# prediction on the held-out validation split; the pipeline applies the fitted
# preprocessing to the raw features before the model predicts
full_pipe.predict(X_test)

array([0.1257, 0.1853, 0.4655, 0.4361, 0.1535, 0.0936, 0.9567, 0.2322,
       0.2354, 0.2739, 0.2113, 0.9756, 0.2947, 0.1332, 0.0326, 0.1121,
       0.1228, 0.0562, 0.0551, 0.207 , 0.3152, 0.0971, 0.3992, 0.4539,
       0.5517, 0.1536, 0.2641, 0.2286, 0.2719, 0.0112, 0.1041, 0.1758,
       0.4132, 0.9479, 0.1342, 0.2726, 0.2115, 0.5717, 0.95  , 0.2039,
       0.2662, 0.1289, 0.2867, 0.3567, 0.3605, 0.3949, 0.122 , 0.1417,
       0.2588, 0.1988, 0.1233, 0.3819, 0.0743, 0.091 , 0.961 , 0.0644,
       0.4197, 0.2416, 0.2031, 0.2048, 0.3168, 0.7693, 0.272 , 0.1593,
       0.4611, 0.2409, 0.0756, 0.2807, 0.1957, 0.2059, 0.3584, 0.1077,
       0.2989, 0.2705, 0.0832, 0.8328, 0.1246, 0.4007, 0.5363, 0.0512,
       0.2469, 0.1077, 0.2608, 0.7847, 0.4507, 0.1626, 0.0913, 0.1118,
       0.0512, 0.4908, 0.0621, 0.2052, 0.2167, 0.5644, 0.428 , 0.1638,
       0.9664, 0.1054, 0.0779, 0.3627, 0.5405, 0.0733, 0.0093, 0.0466,
       0.0851, 0.1317, 0.3114, 0.2342, 0.0177, 0.1425, 0.1604, 0.1041,
      

In [None]:
# Optional: the numeric and categorical feature-name lists can also be stored as pickle files
# pickle.dump(feat_num,open('feat_numv1','wb'))
# pickle.dump(feat_cat,open('feat_catv1','wb'))

 

## Store the fitted pipeline as a pickle file (via joblib)

In [27]:
# Persist the fitted pipeline (preprocessing + model) to disk with joblib.
# NOTE: joblib files are pickle-based - only load this file from a trusted source.
# Plain-pickle alternative, kept for reference:
#pickle.dump(full_pipe,open('full_pipeline.pkl','wb'))
joblib.dump(full_pipe, 'full_pipeline.pkl')

['full_pipeline.pkl']