In [None]:
# Mount Google Drive into the Colab filesystem so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Read the loan-approval CSV from the mounted Drive folder and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/DataSet/loan_approved.csv"
)
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved)
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

(614, 13)

In [None]:
# Drop the columns this walkthrough does not use.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['Loan_ID', 'Dependents'], errors='ignore')
df.shape

(614, 11)

In [None]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0,0
Gender,13
Married,3
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14
Credit_History,50
Property_Area,0


In [None]:
# Impute missing values column by column:
#   * numeric columns     -> median of the column
#   * every other column  -> most frequent value (mode)
# The branch tests "is numeric" rather than "dtype == 'object'" so that text
# columns stored with a non-object dtype (e.g. the dedicated string dtype) are
# still filled with the mode instead of failing inside median().
for col in df.columns:
  if pd.api.types.is_numeric_dtype(df[col]):
    df[col] = df[col].fillna(df[col].median())
  else:
    modes = df[col].mode()
    # mode() is empty when the column holds only missing values; leave it as is.
    if not modes.empty:
      df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

Unnamed: 0,0
Gender,0
Married,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0
Property_Area,0


# Encoding (For Categorical Columns)
* Label Encoder
* OneHot Encoder
* Ordinal Encoder
* Frequency / Count Encoder

In [None]:
# Label Encoding
# LabelEncoder assigns integer codes in sorted (alphabetical) order of the labels,
# so 'Female' becomes 0 and 'Male' becomes 1.
# LabelEncoder is already imported in the imports cell at the top of the notebook.
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved)
0,1,No,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,1,Yes,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,1,Yes,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,1,Yes,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,1,No,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...
609,0,No,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,1,Yes,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,1,Yes,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,1,Yes,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [None]:
# OneHot Encoding
# By default OneHotEncoder orders its output columns by the sorted category
# values (Rural, Semiurban, Urban). Labelling that output by hand as
# Rural/Urban/Semiurban therefore swaps the last two columns. Pin the category
# order explicitly and take the column labels from the fitted encoder so the
# labels always match the data.
area_order = ["Rural", "Urban", "Semiurban"]  # desired column order
oh = OneHotEncoder(categories=[area_order], sparse_output=False)
en_ed = oh.fit_transform(df[["Property_Area"]])
# Reuse df's index so a later column-wise concat aligns row for row.
data = pd.DataFrame(en_ed, columns=oh.categories_[0], index=df.index)
data

Unnamed: 0,Rural,Urban,Semiurban
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
609,1.0,0.0,0.0
610,1.0,0.0,0.0
611,0.0,0.0,1.0
612,0.0,0.0,1.0


In [None]:
# Append the one-hot columns to the original frame (column-wise concatenation,
# rows matched on the index).
df = pd.concat(objs=[df, data], axis="columns")
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved),Rural,Urban,Semiurban
0,1,No,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y,0.0,0.0,1.0
1,1,Yes,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,1.0,0.0,0.0
2,1,Yes,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,0.0,0.0,1.0
3,1,Yes,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,0.0,0.0,1.0
4,1,No,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,No,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,1.0,0.0,0.0
610,1,Yes,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,1.0,0.0,0.0
611,1,Yes,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,0.0,0.0,1.0
612,1,Yes,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,0.0,0.0,1.0


In [None]:
# Ordinal Encoding of Property_Area.
# The position of each value in `cat` is the integer it is encoded as:
# Rural -> 0, Urban -> 1, Semiurban -> 2.
# NOTE(review): if the intent is a rural-to-urban scale, Semiurban would normally
# sit between Rural and Urban - confirm the intended order.
cat = [["Rural", "Urban", "Semiurban"]]
od = OrdinalEncoder(categories=cat)
a = od.fit(df[['Property_Area']]).transform(df[['Property_Area']])
data = pd.DataFrame(data=a, columns=["Property_Area_order"])

In [None]:
# Attach the ordinal column to the main frame, then store the codes as integers
# (the encoder returns floats).
df = pd.concat(objs=[df, data], axis="columns")
df["Property_Area_order"] = df["Property_Area_order"].astype(int)
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved),Rural,Urban,Semiurban,Property_Area_order
0,1,No,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y,0.0,0.0,1.0,1
1,1,Yes,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,1.0,0.0,0.0,0
2,1,Yes,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,0.0,0.0,1.0,1
3,1,Yes,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,0.0,0.0,1.0,1
4,1,No,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,No,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,1.0,0.0,0.0,0
610,1,Yes,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,1.0,0.0,0.0,0
611,1,Yes,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,0.0,0.0,1.0,1
612,1,Yes,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,0.0,0.0,1.0,1


# ColumnTransformer

In [None]:
# Load the second CSV used for the ColumnTransformer example and preview it.
# Note the capital "D": `Data` is a different variable from the lowercase `data`
# frames created in the encoding cells.
Data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/DataSet/loan_approved1.csv"
)
Data.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved)
0,Male,No,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [None]:
# Count the missing values in every column of `Data`.
Data.isna().sum()

Unnamed: 0,0
Gender,0
Married,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0
Property_Area,0


In [None]:
# Split the column names of `Data` by dtype: numeric columns will be scaled,
# object (string) columns will be one-hot encoded.
# NOTE(review): cat_co is built from every object column, which presumably
# includes the target 'Loan_Status (Approved)' - confirm that is intended.
numeric_co = Data.select_dtypes(include="number").columns
cat_co = Data.select_dtypes(include="object").columns

In [None]:
# Build one preprocessing step that standardises the numeric columns and
# one-hot encodes the categorical columns selected from `Data`.
# `transformers` is passed as a list of (name, transformer, columns) tuples,
# which is the form the ColumnTransformer API documents.
pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_co),
        ("cat", OneHotEncoder(sparse_output=False), cat_co),
    ]
)
# Fit on `Data`, the frame the column lists were derived from. The earlier `df`
# was modified by the encoding cells above (Gender is already label-encoded and
# extra columns were appended), so it is not the intended input here.
prep = pre.fit_transform(Data)
# Columns keep their default integer labels; pre.get_feature_names_out() gives
# the matching feature names if readable headers are needed.
Data_1 = pd.DataFrame(prep)
Data_1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.072991,-0.554487,-0.211241,0.273231,0.411733,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-0.134412,-0.038732,-0.211241,0.273231,0.411733,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.393747,-0.554487,-0.948996,0.273231,0.411733,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,-0.462062,0.251980,-0.306435,0.273231,0.411733,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.097728,-0.554487,-0.056551,0.273231,0.411733,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.410130,-0.554487,-0.889500,0.273231,0.411733,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
610,-0.212557,-0.554487,-1.258378,-2.522836,0.411733,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
611,0.437174,-0.472404,1.276168,0.273231,0.411733,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
612,0.357064,-0.554487,0.490816,0.273231,0.411733,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
