# Basic ML Model Deployment

## Import libraries

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

## Fetch Data

In [53]:
# Alternative remote source that was previously read here (left disabled; it may
# differ from the local file -- verify before switching back):
#   https://raw.githubusercontent.com/tkseneee/Dataset/master/Loan_data_ver2.csv
#
# Column types are declared up front. The nullable 'Int64' dtype lets the integer
# columns hold missing entries.
column_dtypes = {
    'Married': str,
    'Education': str,
    'ApplicantIncome': 'Int64',
    'LoanAmount': 'Int64',
    'Credit_History': 'Int64',
}
data = pd.read_csv('loan_dataset.csv', dtype=column_dtypes)

data

Unnamed: 0,Married,Education,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
0,No,Graduate,5849,,1,0.10
1,Yes,Graduate,4583,128,1,0.32
2,Yes,Graduate,3000,66,1,0.09
3,Yes,Not Graduate,2583,120,1,0.06
4,No,Graduate,6000,141,1,0.01
...,...,...,...,...,...,...
609,No,Graduate,2900,71,1,0.13
610,Yes,Graduate,4106,40,1,0.12
611,Yes,Graduate,8072,253,1,0.02
612,Yes,Graduate,7583,187,1,0.01


## Explore Data

In [54]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(614, 6)

In [55]:
# Column labels of the loaded frame.
data.columns

Index(['Married', 'Education', 'ApplicantIncome', 'LoanAmount',
       'Credit_History', 'Loan_Status'],
      dtype='object')

In [56]:
# Data type of each column, as set by the dtype mapping passed to read_csv.
data.dtypes

Married             object
Education           object
ApplicantIncome      Int64
LoanAmount           Int64
Credit_History       Int64
Loan_Status        float64
dtype: object

In [57]:
# Preview the first two rows.
data.head(2)

Unnamed: 0,Married,Education,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
0,No,Graduate,5849,,1,0.1
1,Yes,Graduate,4583,128.0,1,0.32


In [58]:
# Count the missing (null) entries in each column.
data.isnull().sum()

Married             3
Education           0
ApplicantIncome     0
LoanAmount         22
Credit_History     50
Loan_Status         0
dtype: int64

Three features (Married, LoanAmount, and Credit_History) have missing values.

In [59]:
# Frequency of each distinct value in 'Married' (missing entries are not counted).
data['Married'].value_counts()


Yes    398
No     213
Name: Married, dtype: int64

In [60]:
# Frequency of each distinct value in 'Education'.
data['Education'].value_counts()

Graduate        449
Not Graduate    127
HSC              38
Name: Education, dtype: int64

In [61]:
# Separate the prediction target from the feature columns.
y = data['Loan_Status']
X = data.drop(columns=['Loan_Status'])

In [62]:
# Hold out 30% of the rows as a validation set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=48,
)

In [63]:
# Names of the feature columns that have a numeric dtype.
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
numeric_features

['ApplicantIncome', 'LoanAmount', 'Credit_History']

In [64]:
# Names of the feature columns that do not have a numeric dtype.
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()
categorical_features

['Married', 'Education']

## Defining Data processing & Modeling  Pipeline

In [73]:
# Numeric preprocessing: replace missing values with the column mean,
# then standardize with StandardScaler.
numeric_steps = [
    ('mean_imputer', SimpleImputer(strategy='mean')),
    ('std_scale', StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)

numeric_pipe

In [74]:
# Categorical preprocessing: replace missing entries with an explicit 'Missing'
# category, then one-hot encode.
#
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# that was not present during fit instead of raising at transform time. Without
# it, prediction fails whenever the input holds a value unseen in training, e.g.
# a rare level that lands only in the validation split, or a new value arriving
# after deployment.
categorical_pipe = Pipeline([
    ('const_imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

categorical_pipe

In [76]:
# Combine both preprocessing branches, routing each column list to its own pipeline.
# remainder='passthrough' keeps any column not named in either list unchanged.
column_branches = [
    ('numeric', numeric_pipe, numeric_features),
    ('categorical', categorical_pipe, categorical_features),
]
data_pipeline = ColumnTransformer(transformers=column_branches, remainder='passthrough')

data_pipeline

In [68]:
# Chain the preprocessing step and the regressor into a single estimator.
#
# random_state is fixed so the fitted forest, and any file saved from it, is
# reproducible across kernel restarts; 48 matches the seed used for the
# train/validation split.
full_pipe = Pipeline([
    ('pre_process', data_pipeline),
    ('model', RandomForestRegressor(random_state=48)),
])

In [69]:
# Fit the preprocessing steps and the model on the training split.
full_pipe.fit(X_train,y_train)

In [70]:
# Predict on the held-out split; the resulting array is shown as the cell output.
full_pipe.predict(X_test)

array([0.1189, 0.2244, 0.3998, 0.4958, 0.1882, 0.1059, 0.9581, 0.2778,
       0.236 , 0.2722, 0.1971, 0.9781, 0.2596, 0.1375, 0.0476, 0.1205,
       0.1441, 0.0374, 0.067 , 0.1895, 0.3189, 0.0834, 0.3627, 0.4292,
       0.5905, 0.1805, 0.2919, 0.2052, 0.2703, 0.0248, 0.107 , 0.2329,
       0.3968, 0.9465, 0.132 , 0.2666, 0.2411, 0.6149, 0.95  , 0.1819,
       0.2854, 0.1584, 0.2453, 0.3864, 0.3993, 0.382 , 0.1252, 0.1513,
       0.2235, 0.1856, 0.1074, 0.3833, 0.0865, 0.0999, 0.9619, 0.0672,
       0.425 , 0.2688, 0.265 , 0.219 , 0.3185, 0.7169, 0.2761, 0.1445,
       0.4507, 0.2284, 0.0837, 0.2888, 0.203 , 0.2156, 0.4156, 0.0889,
       0.2745, 0.3108, 0.0652, 0.8375, 0.0759, 0.3722, 0.6065, 0.0479,
       0.2651, 0.1182, 0.298 , 0.797 , 0.4125, 0.1325, 0.0851, 0.1257,
       0.0483, 0.4865, 0.0457, 0.2048, 0.2338, 0.5294, 0.4518, 0.1738,
       0.9696, 0.0878, 0.078 , 0.3347, 0.5104, 0.0669, 0.0073, 0.0519,
       0.0712, 0.1258, 0.3078, 0.2171, 0.0289, 0.1577, 0.167 , 0.0899,
      

In [71]:
## The numeric and categorical feature lists can also be stored as pickle files
# pickle.dump(numeric_features,open('feat_numv1','wb'))
# pickle.dump(categorical_features,open('feat_catv1','wb'))

 

## Store the model as pickle file 

In [72]:
# Persist the fitted pipeline to disk with joblib; the call returns the list of
# filenames it wrote, which is shown as the cell output.
# Alternative using the pickle module (left disabled; it targets a different filename):
#pickle.dump(full_pipe,open('full_pipeline.pkl','wb'))
# NOTE(review): the '.pkl2' extension is unusual -- confirm that whatever loads the
# model expects this exact filename. Only load such files from trusted locations.
joblib.dump(full_pipe, 'full_pipeline1.pkl2')

['full_pipeline1.pkl2']