In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the example dataset into a DataFrame
data = pd.read_csv('preprocessExample.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    9 non-null      object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [4]:
data.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [None]:
# Data Preprocessing Phase
# Assumption: the final goal for this dataset is to perform inferential statistics

In [None]:
#Expectations from Inferential Stats:
# 1. Data to be COMPLETE
# 2. Data must be strictly NUMERIC

In [None]:
#Preprocessing  Task
# 1. Check and handle Missing Data on all columns
# 2. Check and Handle Categorical Columns
# 3. Check and Handle Ordinal Columns
# 4. Perform Data Standardization

In [None]:
# 1. Check and handle Missing Data on all columns
# The process of dealing with missing values such that the number of records is not impacted is called IMPUTATION
#
# ==============================================================================
# 1. Statistical Perspective
# ==============================================================================
# a. Numerical Columns
#     i.  Continuous numerical data: Replace missing values with the MEAN value of the column
#     ii. Discrete numerical data: Replace missing values with the MEDIAN value of the column
#
# b. Non-numerical columns
#     Replace missing values with Mode value of the column
#
#
# ==============================================================================
# 2. Domain Perspective
# ==============================================================================
#
# Replace missing data with the default value of the column
#
# e.g. Real Estate in Mumbai, India --- MMRDA --- Guidelines and Rules when it comes to building a tower
#
# Car parking ---- NaN
# 1. 2BHK flat ------ 1 Car Parking
# 2. 3BHK flat ------ 2 Car Parking
# 3. 4BHK and Above - 3 Car Parking
#
# ==============================================================================
# 3. Hybrid approach
# ==============================================================================
#
# Some columns follow the statistical approach while others follow the domain approach

In [5]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [6]:
# Another way to determine missing values:
# isna() flags each missing cell and sum() counts the flags per column
data.isna().sum()

Country      1
Age          1
Salary       1
Purchased    0
dtype: int64

In [None]:
# Stats Perspective
#
#Country      - Categorical ----------------> mode
#Age          - Numerical (Discrete) -------> median
#Salary       - Numerical (Continuous) -----> mean

In [7]:
#Country      - Categorical ----------------> mode

# mode() returns a Series of the most frequent value(s); index 0 holds the first one
data['Country'].mode()

0    France
Name: Country, dtype: object

In [8]:
#Replace NaN with mode value
# Assign the filled column back to the frame. Calling fillna(inplace=True) on the
# data['Country'] selection is chained assignment: recent pandas versions warn about
# it, and with Copy-on-Write enabled it no longer updates `data` at all.
data['Country'] = data['Country'].fillna(data['Country'].mode()[0])

In [9]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [10]:
# Median of the Age column (missing entries are skipped by default)
data['Age'].median()

38.0

In [11]:
#Age          - Numerical (Discrete) -------> median

# Assign the filled column back to the frame. Calling fillna(inplace=True) on the
# data['Age'] selection is chained assignment: recent pandas versions warn about
# it, and with Copy-on-Write enabled it no longer updates `data` at all.
data['Age'] = data['Age'].fillna(data['Age'].median())

In [None]:
#Convert datatype of a column (astype returns a new Series; assign the result back to keep it)
#data['Age'].astype('int')

In [12]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [13]:
#Salary       - Numerical (Continuous) -----> mean

# Assign the filled column back to the frame. Calling fillna(inplace=True) on the
# data['Salary'] selection is chained assignment: recent pandas versions warn about
# it, and with Copy-on-Write enabled it no longer updates `data` at all.
data['Salary'] = data['Salary'].fillna(data['Salary'].mean())

In [14]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [15]:
#Domain Approach --- Real Estate Example

In [16]:
# Toy frame for the domain approach: bedroom counts, with some parking values missing
dataExample = pd.DataFrame({
    'n_bedrooms': [2, 3, 1, 4],
    'parking': [1, np.nan, 1, np.nan],
})
dataExample

Unnamed: 0,n_bedrooms,parking
0,2,1.0
1,3,
2,1,1.0
3,4,


In [17]:
# replace NaN with the default value
#
# e.g. Real Estate in Mumbai, India --- MMRDA --- Guidelines and Rules when it comes to building a tower
#
# Car parking ---- NaN
# 1. 2BHK flat ------ 1 Car Parking
# 2. 3BHK flat ------ 2 Car Parking
# 3. 4BHK and Above - 3 Car Parking
#
# Create a function that accepts n_bedrooms and returns the parking count
#


In [18]:
def getParking(n_bedrooms):
    """Return the default number of car parkings for a flat.

    Mapping applied:
        2 bedrooms          -> 1 parking
        3 bedrooms          -> 2 parkings
        4 or more bedrooms  -> 3 parkings

    Any other bedroom count has no rule and yields None.
    """
    if n_bedrooms == 2:
        return 1
    if n_bedrooms == 3:
        return 2
    if n_bedrooms >= 4:
        return 3
    # No default is defined for this bedroom count
    return None

In [19]:
#Replace NaN with values generated by the function
#

# Row-wise pass: keep an existing parking value, otherwise derive it from n_bedrooms
dataExample['parking'] = dataExample.apply(
    lambda row: row['parking'] if not pd.isna(row['parking']) else getParking(row['n_bedrooms']),
    axis=1,
)

dataExample

Unnamed: 0,n_bedrooms,parking
0,2,1.0
1,3,2.0
2,1,1.0
3,4,3.0


In [20]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [21]:
#To create Dummy variables in Pandas
#
# dtype=int keeps the indicator columns as 0/1 integers; without it, recent pandas
# versions return boolean (True/False) columns instead.

pd.get_dummies(data['Country'], dtype=int)

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,1,0,0
9,1,0,0


In [22]:
# Build the final frame: 0/1 dummy columns for Country followed by the remaining columns.
# dtype=int keeps the dummies numeric (recent pandas versions default to bool), and the
# remaining columns are selected by name rather than by position so that a change in
# column order cannot silently pick the wrong ones.
finalData = pd.concat(
    [
        pd.get_dummies(data['Country'], dtype=int),
        data[['Age', 'Salary', 'Purchased']],
    ],
    axis=1,
)
finalData

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,No
1,0,0,1,27.0,48000.0,Yes
2,0,1,0,30.0,54000.0,No
3,0,0,1,38.0,61000.0,No
4,0,1,0,40.0,63777.777778,Yes
5,1,0,0,35.0,58000.0,Yes
6,0,0,1,38.0,52000.0,No
7,1,0,0,48.0,79000.0,Yes
8,1,0,0,50.0,83000.0,No
9,1,0,0,37.0,67000.0,Yes
