## 1) Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os

In [None]:
#changing the working directory
os.chdir("C://Users//HP//OneDrive//Desktop//Machine learning")

In [None]:
# importing the dataset
df=pd.read_csv("train_loan_data.csv")

# Data Preparation
 1.1)Understanding the data <br>
 1.2)Descriptive Statistics <br>
 1.3)Exploratory Data Analysis <br>
 1.4)Outlier Treatment <br>
 1.5)Missing value Treatment <br>
 1.6)Encoding <br>
 1.7)Splitting the data into Train and Test

## 1.1) Understanding the Data

In [None]:
# first five rows of the dataset
df.head()

In [None]:
#length of the dataframe
len(df)

In [None]:
# shape of the dataframe
df.shape

In [None]:
# Column names of the dataframe
df.columns

In [None]:
# Count of total no. of columns
len(df.columns)

In [None]:
# Number of duplicated rows
df.duplicated().sum()

## 1.2) Descriptive Statistics

In [None]:
# Missing values and dtypes of columns
df.info()

##### The following columns have missing values:
emp_length, emp_title, num_actv_bc_tl, mort_acc, tot_cur_bal, pub_rec_bankruptcies, revol_util, title

In [None]:
# Statistic analysis
df.describe()

##### Outliers are present in the dataset

In [None]:
# object columns
df.describe(include="O")

## 1.3) Exploratory Data Analysis

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="annual_inc",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="fico_range_high",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="fico_range_low",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="int_rate",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="loan_amnt",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="num_actv_bc_tl",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="mort_acc",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="tot_cur_bal",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="open_acc",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="pub_rec",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="pub_rec_bankruptcies",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="revol_bal",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="revol_util",y="loan_status")
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.barplot(data=df,x="total_acc",y="loan_status")
plt.show()

## 1.4) Outlier Treatment

In [None]:
def outlier_treatment(dataframe):
    """Drop every row that holds an IQR outlier in any numeric column.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its own column. Only numeric columns are examined;
    the remaining columns are carried through untouched, because quantiles are
    not defined for them (pandas >= 2.0 raises when they are included).
    Missing values are never flagged, so rows containing NaN are kept.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows without outliers, with the original
        index labels preserved.
    """
    numeric = dataframe.select_dtypes(include="number")
    if numeric.empty:
        # No numeric columns (or no rows at all): nothing can be an outlier.
        return dataframe.copy()

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # Comparisons against NaN evaluate to False, so missing values pass.
    is_outlier = ((numeric < lower) | (numeric > upper)).any(axis=1)

    # Return a real copy so later column assignments on the result do not
    # trigger pandas' SettingWithCopy warnings.
    return dataframe.loc[~is_outlier].copy()
 
df=outlier_treatment(df)
                           

In [None]:
len(df)

## 1.6) Encoding

In [None]:
# Separating the numerical and categorical columns
def data_type(df):
    """Split the column names of ``df`` into numerical and categorical lists.

    A column is numerical when pandas reports a numeric dtype for it, whatever
    its width (``int32``, ``float32``, unsigned integers, ...), not only
    ``int64``/``float64``. Every other column, including boolean and object
    columns, is reported as categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified. It is not modified.

    Returns
    -------
    tuple[list, list]
        ``(numerical, categorical)`` column names, each in the column order
        of ``df``.
    """
    numeric_names = set(df.select_dtypes(include="number").columns)
    numerical = [name for name in df.columns if name in numeric_names]
    categorical = [name for name in df.columns if name not in numeric_names]
    return numerical, categorical


numerical, categorical = data_type(df)

# Identifying the binary columns and ignoring them from scaling
def binary_columns(df):
    """Return the names of the numeric columns whose only values are 0 and 1.

    All numeric dtypes are inspected, so 0/1 indicator columns stored as
    ``uint8``, ``int32`` or ``float32`` are found as well. A column that
    contains a missing value is not reported, because NaN is neither 0 nor 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Names of the binary columns, in the column order of ``df``.
    """
    numeric_columns = df.select_dtypes(include="number").columns
    # np.isin is the supported replacement for np.in1d (deprecated in NumPy 2.0).
    return [
        col
        for col in numeric_columns
        if np.isin(df[col].unique(), [0, 1]).all()
    ]

binary_cols = binary_columns(df)

# Remove the binary columns from the numerical columns
numerical = [i for i in numerical if i not in binary_cols]

def encoding(df, categorical):
    """Label-encode the listed columns of ``df`` in place and return ``df``.

    Each listed column is converted to the pandas ``category`` dtype and then
    replaced by its integer category codes. pandas gives missing values the
    code -1, so NaNs in these columns become -1 instead of staying missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; the listed columns are overwritten.
    categorical : iterable
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in categorical:
        as_category = df[column].astype('category')
        df[column] = as_category.cat.codes
    return df

df = encoding(df, categorical)

## 1.5) Missing value Treatment

In [None]:
df.isnull().sum()/len(df)*100

In [None]:
def missing_value_imputation(df, numerical, categorical):
    """Fill missing values in place: mean for numerical, mode for categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; the listed columns are overwritten.
    numerical : iterable
        Columns whose NaNs are replaced by the column mean.
    categorical : iterable
        Columns whose NaNs are replaced by the column's most frequent value.
        When several values tie, the first one returned by ``Series.mode()``
        is used. A column with no observed value at all is left unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in numerical:
        df[column] = df[column].fillna(df[column].mean())

    for column in categorical:
        modes = df[column].mode()
        if modes.empty:
            # Entirely missing column: there is no value to impute with.
            continue
        # mode() returns a Series. Passing that Series to fillna would align
        # on index labels and fill almost nothing, so pass a scalar instead.
        df[column] = df[column].fillna(modes.iloc[0])

    return df

df = missing_value_imputation(df, numerical, categorical)

In [None]:
# checking the missing values
df.isnull().sum()

### Feature scaling

In [None]:
## feature scaling 
from sklearn.preprocessing import StandardScaler

def feature_scaling(df, numerical):
    """Standardise the listed columns of ``df`` in place and return ``df``.

    The columns are passed through ``StandardScaler.fit_transform``. The
    scaler is fitted on every row that ``df`` holds at call time.
    NOTE(review): this notebook calls the function before the train/test
    split, so the held-out rows contribute to the scaling statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; the listed columns are overwritten.
    numerical : list
        Names of the columns to scale. An empty list is a no-op.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if len(numerical) == 0:
        # StandardScaler rejects an input with zero features, so skip it.
        return df

    scaler = StandardScaler()
    df[numerical] = scaler.fit_transform(df[numerical])
    return df

df = feature_scaling(df, numerical)

## 1.7)Splitting the data

In [None]:
# Feature matrix: the columns at positions 0-25, converted to a NumPy array.
# NOTE(review): the target is read from position 27 further down, so the
# column at position 26 ends up in neither X nor y -- confirm this is intended.
X=df.iloc[:,range(0,26)].values

In [None]:
X

In [None]:
# Target vector: the column at position 27, converted to a NumPy array.
# NOTE(review): presumably this is loan_status -- the position is hard-coded,
# so confirm it against df.columns whenever the column layout changes.
y=df.iloc[:,27].values
y

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

# 3)Model Building

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logistic=LogisticRegression()

In [None]:
logistic.fit(X_train,y_train)

In [None]:
prediction=logistic.predict(X_test)

In [None]:
prediction

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

# confusion matrix
# The result gets its own name so that the imported confusion_matrix function
# is not overwritten and the cell can be run again.
logistic_confusion_matrix=confusion_matrix(y_test,prediction)
logistic_confusion_matrix

In [None]:
# calculating the accuracy
# The result gets its own name so that the imported accuracy_score function
# is not replaced by a float and stays callable in later cells.
logistic_accuracy=accuracy_score(y_test,prediction)
logistic_accuracy

## Conclusion 
###### Accuracy of model is 80.01%

## Xgboost Model

In [None]:
%pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
 xgb_cal=xgb.XGBClassifier( n_estimators = 10)

In [None]:
xgb_cal

In [None]:
# Fit and predict from the model
xgb_cal.fit(X_train,y_train)

preds = xgb_cal.predict(X_test)

In [None]:
from sklearn import metrics
# Parallel lists: modelList[i] names the model whose accuracy is accuracyList[i]
modelList=[]
accuracyList=[]
# Score the XGBoost predictions once, then report and record that value
xgb_accuracy=metrics.accuracy_score(y_test, preds)
print("Accuracy:",xgb_accuracy)
modelList.append("XGBoost")
accuracyList.append(xgb_accuracy)

## Conclusion
###### Accuracy of the Model is 80.23%

## Support Vector Classification(SVC)

In [None]:
from sklearn.svm import SVC
model=SVC()


In [None]:
# fitting the model
model.fit(X_train,y_train)

In [None]:
#prediction
predict=model.predict(X_test)
predict

In [None]:
#calculating the accuracy
# Use the module-qualified function so this cell does not depend on the bare
# name accuracy_score still referring to the scikit-learn function.
svc_accuracy_score=metrics.accuracy_score(y_test,predict)
svc_accuracy_score


## Conclusion
###### Accuracy of Model is 76.20%