# Data Preprocessing

# Example 1: StandardScaler

In [1]:
#Import Preprocessing from sklearn

from sklearn import preprocessing
import numpy as np


In [2]:
#Create object (X_train) to hold Numpy array

X_train = np.array([[1,-1,2],
                    [2,0,0],
                    [0,1,-1]])

In [3]:
#Use StandardScaler (from sklearn preprocessing) to fit
# scaling model on X_train

scaler = preprocessing.StandardScaler().fit(X_train)

In [4]:
#Use model to transform X_train

X_scaled = scaler.transform(X_train)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [5]:
#Obtain mean of scaled array
X_scaled.mean(axis=0)

array([0., 0., 0.])

In [6]:
#Obtain standard deviation of scaled array

X_scaled.std(axis = 0)

array([1., 1., 1.])

In [7]:
X_scaled = scaler.transform(X_train)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

# Example 2 - Min_Max_Scaler

In [8]:
#Create array data. Store in array object X_train

X_train = np.array([[1,-1,2],
                   [2,0,0],
                   [0,1,-1]])

In [9]:
#Intitalize MinMaxScaler()

min_max_scaler = preprocessing.MinMaxScaler()

In [10]:
#Apply MinMaxScaler to fit model on X_train

X_train_minmax = min_max_scaler.fit_transform(X_train)

In [11]:
#Display scaled array data - scaled to range [0,1]

X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [12]:
#Use instance of transformer to attempt transformation on 
# test data not seen during the training on X_train
X_test = np.array([[-3,-1,4]])

In [13]:
#Apply transformer to test data

X_test_minmax = min_max_scaler.transform(X_test)

In [14]:
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [15]:
#Notice that you have a negative value - showing that 
#The min_max_scaler scaling is not 100% perfect

# Example 2 - MaxAbsScaler

In [16]:
#MaxAbsScaler scales data in a way that the training data 
# lies within the range [-1,1] by dividing through the largest
# maximum value in each feature.


In [17]:
#Obtain array
X_train = np.array([[1,-1,2],
                   [2,0,0],
                   [0,1,-1]])

In [18]:
#Initialing MaxAbsScaler

max_abs_scaler = preprocessing.MaxAbsScaler()

In [19]:
#Apply transformer to X_train data
#Train MaxAbsScaler on X_train

X_train_maxabs = max_abs_scaler.fit_transform(X_train)

In [20]:
X_train_maxabs

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [21]:
#Create test (new) data

X_test = np.array([[-3,-1,4]])

In [22]:
#Apply transformer to test data

X_test_maxabs = max_abs_scaler.transform(X_test)

In [23]:
X_test_maxabs

array([[-1.5, -1. ,  2. ]])

In [24]:
#Notice that the scaling here is also not 100% perfect

# Example - Discretization of quantitative variable (NumPy array)

In [25]:
# load numpy
import numpy as np
# load pandas
import pandas as pd

In [26]:
#Case 1: Categorizing values in a NumPy 1-d array

#Create a numpy array with 10 integers
#We use Numpy's random module to generate random numbers
# between 25 and 200

# set a random seed to reproduce
np.random.seed(123)
# create 10 random integers  
x = np.random.randint(low=25, high=200, size=10)

In [27]:
#Display contents of x
x

array([134, 151,  91, 123,  42, 108, 131, 148,  82, 121])

In [28]:
#Sort the numers for convenience

x = np.sort(x)

In [29]:
x

array([ 42,  82,  91, 108, 121, 123, 131, 134, 148, 151])

In [30]:
#We can see above the numbers we generated 10 numbers for height ranging from 42 to 151

In [31]:
#We can use Numpy's digitize() function to discretize
#the quantitative variable x

#Lets consider simple binning/discretization where we use
# 50 as a threshold to bin/discretize/categorise our data into two categories

#In this case values less than 50 are in the 0 category
# values above 50 (the threshold) are in the 1 category

In [32]:
# digitize examples
np.digitize(x,bins=[50])

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [33]:
#Since the "bins" argument is a list,we can also specify multiple
# binning or discretizing conditions

#Below, we discretize/bin into three categories

In [34]:
#We obtain three categories with values less than 50 (category 0)
#values greater than 50 but less than 100 (category 1)
#values greater than 100 (category 2)

np.digitize(x,[50,100])

array([0, 1, 1, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [35]:
#Bin/categorize data into 4 bins

np.digitize(x,[25,50,100])


array([1, 2, 2, 3, 3, 3, 3, 3, 3, 3], dtype=int64)

In [36]:
#Categorizing with Pandas dataframe - using Pandas cut()
# function

In [37]:
#We can use Pandas cut() function to discretize/categorize a 
#quantitative variable and produce the same results as NumPy’s digitize function

In [38]:
#Case 2 - Working with a Pandas dataframe

#Create a Pandas dataframe with the data we stored in object 
# x above

df = pd.DataFrame({"height":x})
df.head() # lists the first few observations in dataframe df

Unnamed: 0,height
0,42
1,82
2,91
3,108
4,121


In [39]:
#Show entire dataframe
df

Unnamed: 0,height
0,42
1,82
2,91
3,108
4,121
5,123
6,131
7,134
8,148
9,151


In [40]:
#Lets categorize the height feature into four categories
#using Pandas cut() function

#cut() takes the feature we want to categorize as input

#We need to specify bins/categories such that:
#height values between 0 and 25 are in one category
#height values between 25 and 50 are in a second category
#height values between 50 and 100 are in a third category
#height values between 100 and 200 are in a fourth category
#Note we need to specify both the lower and upper end of the bins
# for categorizing


df['binned']=pd.cut(x=df['height'], bins=[0,25,50,100,200])

In [41]:
#Show height values and their corresponsing bins

df.head()

Unnamed: 0,height,binned
0,42,"(25, 50]"
1,82,"(50, 100]"
2,91,"(50, 100]"
3,108,"(100, 200]"
4,121,"(100, 200]"


In [42]:
df

Unnamed: 0,height,binned
0,42,"(25, 50]"
1,82,"(50, 100]"
2,91,"(50, 100]"
3,108,"(100, 200]"
4,121,"(100, 200]"
5,123,"(100, 200]"
6,131,"(100, 200]"
7,134,"(100, 200]"
8,148,"(100, 200]"
9,151,"(100, 200]"


In [43]:
#Another example

#If we want to have 4 categories, we can provide labels as integers


df['height_bin']=pd.cut(x = df['height'],
                        bins = [0,25,50,100,200], 
                        labels = [0, 1, 2,3])
df


Unnamed: 0,height,binned,height_bin
0,42,"(25, 50]",1
1,82,"(50, 100]",2
2,91,"(50, 100]",2
3,108,"(100, 200]",3
4,121,"(100, 200]",3
5,123,"(100, 200]",3
6,131,"(100, 200]",3
7,134,"(100, 200]",3
8,148,"(100, 200]",3
9,151,"(100, 200]",3


In [44]:
#We can be more decriptive

df['height_bin']=pd.cut(x=df['height'], bins=[0,25,50,100,200], 
                        labels=["very short", " short", "medium","tall"])

df.head()

Unnamed: 0,height,binned,height_bin
0,42,"(25, 50]",short
1,82,"(50, 100]",medium
2,91,"(50, 100]",medium
3,108,"(100, 200]",tall
4,121,"(100, 200]",tall


In [45]:
#Show entire dataframe

df

Unnamed: 0,height,binned,height_bin
0,42,"(25, 50]",short
1,82,"(50, 100]",medium
2,91,"(50, 100]",medium
3,108,"(100, 200]",tall
4,121,"(100, 200]",tall
5,123,"(100, 200]",tall
6,131,"(100, 200]",tall
7,134,"(100, 200]",tall
8,148,"(100, 200]",tall
9,151,"(100, 200]",tall


In [46]:
#Having categorized our Height feature, we can use the updated data
# to build a k-means clustering model

# Example: Dealing with Missing Values

In [47]:
#Case 1: Checking for missing values in a Pandas DataFrame

#Use isnull() and notnull() to check whether a value is NaN or not

In [48]:
# importing pandas as pd
import pandas as pd
 
# importing numpy as np
import numpy as np
 
# creat dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
        'Second Score': [30, 45, 56, np.nan],
        'Third Score':[np.nan, 40, 80, 98]}
 
# creating a dataframe from list
df = pd.DataFrame(dict)
 


In [49]:
#Display dataframe

df

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [50]:
# using isnull() function on df - returns False or True
df.isnull()

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [51]:
#Case 2: Filling missing values using fillna()


In [52]:
# importing pandas as pd
import pandas as pd
 
# importing numpy as np
import numpy as np
 
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
        'Second Score': [30, 45, 56, np.nan],
        'Third Score':[np.nan, 40, 80, 98]}
 
# creating a dataframe from dictionary
df = pd.DataFrame(dict)
 


In [53]:
# filling missing value using fillna()  - replace/hard code missing 
# values with a 0
df.fillna(0)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


In [54]:
# filling missing value using fillna()  - replace/hard code missing 
#values with the arithmetic mean of the values in each feature

df.fillna(df.mean())

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,72.666667
1,90.0,45.0,40.0
2,95.0,56.0,80.0
3,95.0,43.666667,98.0


In [55]:
#Using interpolate() to fill missing values


In [56]:
#Interpolating can work with None values in a dataset but uses various interpolation techniques to fill 
#the missing values rather than hard-coding

In [57]:
# importing pandas as pd
import pandas as pd
   
# Creating the dataframe 
df = pd.DataFrame({"A":[12, 4, 5, None, 1],
                   "B":[None, 2, 54, 3, None],
                   "C":[20, 16, None, 3, 8],
                   "D":[14, 3, None, None, 6]})
   


In [58]:
# Print the dataframe
df

Unnamed: 0,A,B,C,D
0,12.0,,20.0,14.0
1,4.0,2.0,16.0,3.0
2,5.0,54.0,,
3,,3.0,3.0,
4,1.0,,8.0,6.0


In [59]:
#Use linear interpolate method to replace missing values

# to interpolate the missing values
df.interpolate(method ='linear', limit_direction ='forward')

Unnamed: 0,A,B,C,D
0,12.0,,20.0,14.0
1,4.0,2.0,16.0,3.0
2,5.0,54.0,9.5,4.0
3,3.0,3.0,3.0,5.0
4,1.0,3.0,8.0,6.0


In [60]:
#The NaN in the first row is not replaced
# as the direction of filling values is forward
# and there is no previous value which could have been used in interpolation
# i.e. linear interpolation technique uses a previous observation value to generate the next value

#You can drop that NaN (see below) or replace with 0
# or some other number - say mean of the values in the feature

In [61]:
#Case 3: Dropping NaNs using dropna()

#Used to exclude rows and columns with NaN values

In [62]:
#Drop rows with at least 1 null value

# importing pandas as pd
import pandas as pd
 
# importing numpy as np
import numpy as np
 
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
        'Second Score': [30, np.nan, 45, 56],
        'Third Score':[52, 40, 80, 98],
        'Fourth Score':[np.nan, np.nan, np.nan, 65]}
 
# creating a dataframe from dictionary
df = pd.DataFrame(dict)
   
df

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,65.0


In [63]:
#Drop rows with at least one NaN value

# using dropna() function 
df.dropna()

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65.0


In [64]:
#We see that the data frame is reduced to
# just one row that has no NaN values

In [65]:
# using dropna() function and assign to a new name df2

df2 = df.dropna()

In [66]:
#Print df2
df2

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65.0


In [67]:
#Drop rows if all values in that row are missing

# importing pandas as pd
import pandas as pd
 
# importing numpy as np
import numpy as np
 
# dictionary of lists
dict = {'First Score':[100, np.nan, np.nan, 95],
        'Second Score': [30, np.nan, 45, 56],
        'Third Score':[52, np.nan, 80, 98],
        'Fourth Score':[np.nan, np.nan, np.nan, 65]}
 
# creating a dataframe from dictionary of lists
df = pd.DataFrame(dict)
   
df

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52.0,
1,,,,
2,,45.0,80.0,
3,95.0,56.0,98.0,65.0


In [68]:
# use dropna() function to drop rows with all NaN values
df.dropna(how = 'all')

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52.0,
2,,45.0,80.0,
3,95.0,56.0,98.0,65.0


In [69]:
#Dropping columns with at least 1 null value

# importing pandas as pd
import pandas as pd
  
# importing numpy as np
import numpy as np
  
# dictionary of lists
dict = {'First Score':[100, np.nan, np.nan, 95],
        'Second Score': [30, np.nan, 45, 56],
        'Third Score':[52, np.nan, 80, 98],
        'Fourth Score':[60, 67, 68, 65]}
 
# creating a dataframe from dictionary
df = pd.DataFrame(dict)
    
df

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52.0,60
1,,,,67
2,,45.0,80.0,68
3,95.0,56.0,98.0,65


In [70]:
# using dropna() function - displays column with no NaN values
df.dropna(axis = 1)# axis = 1 references columns

Unnamed: 0,Fourth Score
0,60
1,67
2,68
3,65


In [71]:
# using dropna() function - displays rows with no NaN values
df.dropna(axis = 0)# axis = 0 references rows

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52.0,60
3,95.0,56.0,98.0,65


# Example: Normalization using Min Max Values

In [72]:
# import necessary packages
import numpy as np
  
# create an array
data = np.array([[10, 20], [30, 40], 
                 [5, 15], [0, 10]])
  
  
data

array([[10, 20],
       [30, 40],
       [ 5, 15],
       [ 0, 10]])

In [73]:
#Normalize NumPy array data using Min-Max Values

normalizedData = (data-np.min(data))/(np.max(data)-np.min(data))

In [74]:
# normalized data using min max value
print(normalizedData)

[[0.25  0.5  ]
 [0.75  1.   ]
 [0.125 0.375]
 [0.    0.25 ]]


# Example: Normalization using sklearn Min Max Scaler

In [75]:
# import necessary packages
import numpy as np
from sklearn import preprocessing as p
  
# create an array
data = np.array([[10, 20], [30, 40],
                 [5, 15], [0, 10]])
  
min_max_scaler = p.MinMaxScaler() #setup an instance of MinMaxScaler
normalizedData = min_max_scaler.fit_transform(data)#fit transform the data
  
# normalized data using MinMaxScaler
print(normalizedData)

[[0.33333333 0.33333333]
 [1.         1.        ]
 [0.16666667 0.16666667]
 [0.         0.        ]]


In [76]:
#Example: Normalization using Min Max Scaler on a Pandas Dataframe

In [77]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [78]:
#Import dataset from URL

cols = ['loan_amount', 'interest_rate', 'total_credit_utilized']
data = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/openintro/loan50.csv", usecols=cols)

In [79]:
#Obtain summary info on dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_credit_utilized  50 non-null     int64  
 1   loan_amount            50 non-null     int64  
 2   interest_rate          50 non-null     float64
dtypes: float64(1), int64(2)
memory usage: 1.3 KB


In [80]:
#Display summary statistics for each feature

data.describe()

Unnamed: 0,total_credit_utilized,loan_amount,interest_rate
count,50.0,50.0,50.0
mean,61546.54,17083.0,11.5672
std,63778.074324,10455.456343,5.052115
min,2872.0,3000.0,5.31
25%,25693.5,7125.0,7.96
50%,48005.5,15500.0,9.93
75%,76796.25,24000.0,13.715
max,373361.0,40000.0,26.3


In [81]:
#Display summary statistics for each feature (transposed)

data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_credit_utilized,50.0,61546.54,63778.074324,2872.0,25693.5,48005.5,76796.25,373361.0
loan_amount,50.0,17083.0,10455.456343,3000.0,7125.0,15500.0,24000.0,40000.0
interest_rate,50.0,11.5672,5.052115,5.31,7.96,9.93,13.715,26.3


In [82]:
#Normalization (Min-Max Scalar) :
#In this approach, the data is scaled to a fixed range — usually 0 to 1.
#In contrast to standardization, the cost of having this bounded range is that we will end up with smaller standard deviations, 
#which can suppress the effect of outliers. Thus MinMax Scalar is sensitive to outliers

In [83]:
# import MinMaxScalar from Scikit-learn and apply it to our dataset

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() 
data_scaled = scaler.fit_transform(data)

In [84]:
#Check means and standard deviations
print('means (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.mean(axis=0))
print('std (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.std(axis=0))

means (total_credit_utilized, loan_amount and interest_rate):  [0.15837053 0.38062162 0.29810386]
std (total_credit_utilized, loan_amount and interest_rate):  [0.17041551 0.27973983 0.23827245]


In [85]:
#After MinMaxScaling, the distributions are not centered at zero and the standard deviation is not 1

In [86]:
#Check minimum and maximum values
print('Min (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.min(axis=0))
print('Max (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.max(axis=0))

Min (total_credit_utilized, loan_amount and interest_rate):  [0. 0. 0.]
Max (total_credit_utilized, loan_amount and interest_rate):  [1. 1. 1.]


In [87]:
#But the minimum and maximum values are standardized across variables, different from what occurs with standardization

# Example: Normalization using numpy.linalg.norm

In [88]:
# import necessary packages
import numpy as np
  
# create an array
data = np.array([[10, 20], [30, 40],
                 [5, 15], [0, 10]])
  
normalizedData = data/np.linalg.norm(data)
  
# normalized data using linalg.norm
print(normalizedData)

[[0.17277369 0.34554737]
 [0.51832106 0.69109474]
 [0.08638684 0.25916053]
 [0.         0.17277369]]


# Example: Normalization using Maths Formula

In [89]:
# import necessary packages
import numpy as np
  
# create an array
data = np.array([[10, 20], [30, 40], 
                 [5, 15], [0, 10]])
  
normalizedData = data/np.sqrt(np.sum(data**2))
  
# normalized data using sum of squares
print(normalizedData)

[[0.17277369 0.34554737]
 [0.51832106 0.69109474]
 [0.08638684 0.25916053]
 [0.         0.17277369]]


# Example: Normalizing a 2 dimensional NumPy array

In [90]:
# import NumPy module
import numpy as np

In [91]:
# explicit function to normalize array
def normalize_2d(matrix):
    norm = np.linalg.norm(matrix)
    matrix = matrix/norm  # normalized matrix
    return matrix

In [92]:
# create 1d array starting from -2 and ending at 13
array = np.arange(16) - 2

In [93]:
#Display 1d NumPy array
array

array([-2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])

In [94]:
# Convert 1d array to a matrix (2d array)
matrix = array.reshape(4, 4)

In [95]:
#Display 2d array

matrix

array([[-2, -1,  0,  1],
       [ 2,  3,  4,  5],
       [ 6,  7,  8,  9],
       [10, 11, 12, 13]])

In [96]:
print("Simple Matrix \n", matrix)
normalized_matrix = normalize_2d(matrix)

Simple Matrix 
 [[-2 -1  0  1]
 [ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]


In [97]:
print("\nNormalized Matrix \n", normalized_matrix)


Normalized Matrix 
 [[-0.0696733  -0.03483665  0.          0.03483665]
 [ 0.0696733   0.10450995  0.1393466   0.17418325]
 [ 0.2090199   0.24385656  0.27869321  0.31352986]
 [ 0.34836651  0.38320316  0.41803981  0.45287646]]


In [98]:
#The entire code in one cell is shown below

In [99]:
# import module
import numpy as np
 
# explicit function to normalize array
def normalize_2d(matrix):
    norm = np.linalg.norm(matrix)
    matrix = matrix/norm  # normalized matrix
    return matrix
 
# gives and array starting from -2
# and ending at 13
array = np.arange(16) - 2
 
# converts 1d array to a matrix
matrix = array.reshape(4, 4)
print("Simple Matrix \n", matrix)
normalized_matrix = normalize_2d(matrix)
print("\nNormalized Matrix \n", normalized_matrix)

Simple Matrix 
 [[-2 -1  0  1]
 [ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]

Normalized Matrix 
 [[-0.0696733  -0.03483665  0.          0.03483665]
 [ 0.0696733   0.10450995  0.1393466   0.17418325]
 [ 0.2090199   0.24385656  0.27869321  0.31352986]
 [ 0.34836651  0.38320316  0.41803981  0.45287646]]


# Standardizing Variables in a Pandas Dataframe

In [100]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [101]:
#Import dataset from URL

cols = ['loan_amount', 'interest_rate', 'total_credit_utilized']
data = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/openintro/loan50.csv", usecols=cols)

In [102]:
#Obtain summary info on dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_credit_utilized  50 non-null     int64  
 1   loan_amount            50 non-null     int64  
 2   interest_rate          50 non-null     float64
dtypes: float64(1), int64(2)
memory usage: 1.3 KB


In [103]:
#Display summary statistics for each feature

data.describe()

Unnamed: 0,total_credit_utilized,loan_amount,interest_rate
count,50.0,50.0,50.0
mean,61546.54,17083.0,11.5672
std,63778.074324,10455.456343,5.052115
min,2872.0,3000.0,5.31
25%,25693.5,7125.0,7.96
50%,48005.5,15500.0,9.93
75%,76796.25,24000.0,13.715
max,373361.0,40000.0,26.3


In [104]:
#Display summary statistics for each feature (transposed)

data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_credit_utilized,50.0,61546.54,63778.074324,2872.0,25693.5,48005.5,76796.25,373361.0
loan_amount,50.0,17083.0,10455.456343,3000.0,7125.0,15500.0,24000.0,40000.0
interest_rate,50.0,11.5672,5.052115,5.31,7.96,9.93,13.715,26.3


# Example - Standardization: Standard Scaler

In [105]:
#Standardization (Standard Scalar) :

#Standardization means centering the variable at zero and standardizing the variance at 1.
#The procedure involves subtracting the mean (measure of location) from each observation 
# and then dividing by the standard deviation (measure of scale):

#The result of standardization is that the features will be rescaled 
#so that they have the properties of a standard normal distribution with mean = 0 and standard deviation = 1

In [106]:
#StandardScaler from sci-kit-learn removes the mean and scales the data to unit variance. 
#We can import the StandardScalar method from sci-kit learn and apply it to our dataset.


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() #initiate instance of StandardScaler
data_scaled = scaler.fit_transform(data) #fit transform to our data

In [107]:
#Display standardised data

data_scaled

array([[-0.45381483,  0.47505531, -0.13340428],
       [ 0.26599998, -1.07078259, -0.32935182],
       [ 0.27993791,  0.76489992,  2.94577134],
       [-0.29254667, -1.07078259, -0.32935182],
       [-0.01673407,  0.76489992, -0.42732559],
       [ 0.16813355, -1.03213665, -0.32935182],
       [-0.92931993, -1.3606272 ,  1.10426436],
       [-0.53017251, -0.24955621, -1.09714626],
       [-0.59919693, -0.68432312, -0.71924744],
       [-0.46740431,  0.13690327,  0.21050364],
       [-0.49575533, -0.00801903,  1.10426436],
       [ 0.54361817, -0.49109338, -1.25110504],
       [-0.53435389, -0.1046339 , -0.84321425],
       [-0.52879455, -0.05632647, -1.25110504],
       [-0.63592654, -1.3606272 , -0.7212469 ],
       [ 0.74361161,  1.19000534,  2.65584896],
       [ 0.65686384, -0.34617108,  1.29821243],
       [-0.62208364,  0.10791881, -0.22937859],
       [-0.75587192, -1.07078259, -0.7212469 ],
       [-0.26482919,  0.08859583,  1.57013963],
       [ 1.00491609, -1.16739746,  0.502

In [108]:
#Print mean and standard deviation of standardized features:

print(data_scaled.mean(axis=0))
print(data_scaled.std(axis=0))


[-2.77555756e-18 -1.44328993e-17  5.32907052e-17]
[1. 1. 1.]


In [109]:
#Note that the mean is now effectively 0 and variance is 1

In [110]:
#Print minimum and maximum values

print('Min values (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.min(axis=0))
print('Max values (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.max(axis=0))

Min values (total_credit_utilized, loan_amount and interest_rate):  [-0.92931993 -1.3606272  -1.25110504]
Max values (total_credit_utilized, loan_amount and interest_rate):  [4.93869051 2.21412295 2.94577134]


# Example - Standardization using Robust Scaler

In [111]:
#Robust Scalar (Scaling to median and quantiles) :
#Scaling using median and quantiles consists of subtracting the median to all the observations 
#and then dividing by the interquartile difference. It Scales features using statistics that are robust to outliers.

In [112]:
#Import RobustScaler

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler() 
data_scaled = scaler.fit_transform(data)

In [113]:
#Print means and standard deviation

print('means (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.mean(axis=0))
print('std (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.std(axis=0))

means (total_credit_utilized, loan_amount and interest_rate):  [0.26497674 0.09380741 0.28448306]
std (total_credit_utilized, loan_amount and interest_rate):  [1.23549266 0.61335548 0.86904233]


In [114]:
#As you can see, the distributions are not centered in zero and the standard deviation is not 1

In [115]:
#Print minimum and maximum values

print('Min (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.min(axis=0))
print('Max (total_credit_utilized, loan_amount and interest_rate): ', data_scaled.max(axis=0)

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_11712/2185913808.py, line 4)

In [None]:
#See that Neither are the minimum and maximum values set to a certain upper and lower boundaries like in the MinMaxScaler

# Example: One Hot Encoding

In [None]:
import pandas as pd

df = pd.DataFrame({'country': ['russia', 'germany', 'australia','korea','germany']})

In [None]:
df

In [None]:


df = pd.DataFrame({'country': ['russia', 'germany', 'australia','korea','germany']})
pd.get_dummies(df['country'], prefix='country')

In [None]:
import pandas as pd

# df now has two columns: name and country
df = pd.DataFrame({
        'name': ['josef','michael','john','bawool','klaus'],
        'country': ['russia', 'germany', 'australia','korea','germany']
    })

In [None]:
#Create one hot encoding for country column

dummies = pd.get_dummies(df['country'], prefix='country')

In [None]:
#Display encoding

dummies

In [None]:
# use pd.concat to join the new columns with your original dataframe
df = pd.concat([df,dummies],axis=1)

In [None]:
# use pd.concat to join the new columns with your original dataframe
df = pd.concat([df,pd.get_dummies(df['country'], prefix='country')],axis=1)

In [None]:
# now drop the original 'country' column (you don't need it anymore)
df.drop(['country'],axis=1, inplace=True)

In [None]:
#Display final dataframe with one hot encoding

df

In [None]:
#Entire code is as below

In [None]:
import pandas as pd

# df now has two columns: name and country
df = pd.DataFrame({
        'name': ['josef','michael','john','bawool','klaus'],
        'country': ['russia', 'germany', 'australia','korea','germany']
    })

# use pd.concat to join the new columns with your original dataframe
df = pd.concat([df,pd.get_dummies(df['country'], prefix='country')],axis=1)

# now drop the original 'country' column (you don't need it anymore)
df.drop(['country'],axis=1, inplace=True)

In [None]:
#Case: when you have NaN as a category in the original dataset
#Create NaN as a separate category

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'country': ['germany',np.nan,'germany','united kingdom','america','united kingdom']
})

pd.get_dummies(df['country'], dummy_na=True)

In [None]:
import pandas as pd
import numpy as np

# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file (from UCI ML Repository) and convert "?" to NaN
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df.head()