In [42]:
# Import the required libraries: Pandas, Numpy, Matplotlib and   Seaborn

# modules we'll use
import pandas as pd
import numpy as np

# for Box-Cox Transformation
from scipy import stats

# for min_max scaling
from sklearn import preprocessing

# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt


import warnings                   # To ignore the warnings
warnings.filterwarnings("ignore")

# Today, we're going to be looking at how to scale and normalize data 

In [5]:
# set seed for reproducibility
np.random.seed(0)

In [48]:
# Pick Subset of the first 10  rows in the data set


subset_a=df.iloc[:10:]
subset_a

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,1,No,5849,0.0,,360.0,2,Urban,Y
1,LP001003,Male,Yes,1,1,No,4583,1508.0,3.444284,360.0,2,Rural,N
2,LP001005,Male,Yes,0,1,Yes,3000,0.0,1.649783,360.0,2,Urban,Y
3,LP001006,Male,Yes,0,0,No,2583,2358.0,3.212735,360.0,2,Urban,Y
4,LP001008,Male,No,0,1,No,6000,0.0,3.82055,360.0,2,Urban,Y
5,LP001011,Male,Yes,2,1,Yes,5417,4196.0,7.467438,360.0,2,Urban,Y
6,LP001013,Male,Yes,0,0,No,2333,1516.0,2.489146,360.0,2,Urban,Y
7,LP001014,Male,Yes,3+,1,No,3036,2504.0,4.31259,360.0,1,Semiurban,N
8,LP001018,Male,Yes,2,1,No,4006,1526.0,4.602026,360.0,2,Urban,Y
9,LP001020,Male,Yes,1,1,No,12841,10968.0,9.84081,360.0,2,Semiurban,N


Scaling
This means that you're transforming your data so that it fits within a specific scale, like 0-100 or 0-1.

In [6]:
# Let usload and read the data from the csv file 

df=pd.read_csv("train.csv",encoding = "ISO-8859-1")
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [15]:
df.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,2,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,2,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,2,Urban,Y


In [16]:

# First map the string values of Education to integer.

def mapping(df,feature):
    featureMap=dict()
    count=0
    for i in sorted(df[feature].unique(),reverse=True):
        featureMap[i]=count
        count=count+1
    df[feature]=df[feature].map(featureMap)
    return df

In [17]:

x=mapping(df,"Education")
x.sample(4)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
433,LP002387,Male,Yes,0,1,No,2425,2340.0,143.0,360.0,2,Semiurban,Y
371,LP002197,Male,Yes,2,1,No,5185,0.0,155.0,360.0,2,Semiurban,Y
321,LP002053,Male,Yes,3+,1,No,4342,189.0,124.0,360.0,2,Semiurban,Y
74,LP001253,Male,Yes,3+,1,Yes,5266,1774.0,187.0,360.0,2,Semiurban,Y


In [14]:
# Drop columns Education and  Credit History

y=df.drop(["Education","Credit_History"],axis=1)
y.sample(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Property_Area,Loan_Status
233,LP001776,Female,No,0,No,8333,0.0,280.0,360.0,Semiurban,Y
379,LP002225,Male,Yes,2,No,5391,0.0,130.0,360.0,Urban,Y
320,LP002051,Male,Yes,0,No,2400,2167.0,115.0,360.0,Semiurban,Y
83,LP001273,Male,Yes,0,No,6000,2250.0,265.0,360.0,Semiurban,N
449,LP002444,Male,No,1,Yes,2769,1542.0,190.0,360.0,Semiurban,N


In [21]:
# Normalizing just a single column

# Let's normalize Loan Amount in the range 0 to 20



a=df["LoanAmount"]=((df["LoanAmount"]-df["LoanAmount"].min())/(df["LoanAmount"].max()-df["LoanAmount"].min()))*20

In [24]:
#  Normalizing  Data Frame

# dataf=((df-df.min())/(df.max()-df.min()))*20

df.sample()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
15,LP001032,Male,No,0,1,No,4950,0.0,3.357453,360.0,2,Urban,Y


In [31]:
def normalize(dataset):
    dataNorm=((dataset-dataset.min())/(dataset.max()-dataset.min()))*20
    dataNorm["ApplicantIncome"]=dataset["ApplicantIncome"]
    return dataNorm

In [35]:
# Normalizing full dataframe except ApplicantIncome Columns

# We don't want to normalize "ApplicantIncome"

 xz=normalize(df)
 xz.sample(5)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null int64
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       614 non-null int64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(3), int64(3), object(7)
memory usage: 62.4+ KB


In [44]:
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df)
df_normalized = pd.DataFrame(np_scaled)
df_normalized

ValueError: could not convert string to float: 'N'