In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('preprocessExample.csv')

In [3]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    9 non-null      object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [5]:
data.isna().sum()

Country      1
Age          1
Salary       1
Purchased    0
dtype: int64

In [6]:
# Seperate my data as features and label
# Ensure data is in numpy array format ------ Faster Execution

features = data.iloc[:,[0,1,2]].values
label = data.iloc[:,[3]].values

# **Step1: Ensure data is complete**

In [7]:
# Perform Imputation --- Stat Approach
# Country Column ----- Mode
# Age Column --------- Median
# Salary Column ------ Mean

# SimpleImputer ---- scikit learn package (impute package)

In [8]:
# Create Individual Objects for each strategy
#Code to Impute Country Column


# Step1: Import relevant package

from sklearn.impute import SimpleImputer

# Step2: Instantiate an object for SimpleImputer

siForCountryColumn = SimpleImputer(strategy='most_frequent' , missing_values=np.nan)

# Step3: Fit the object with the data column (Country column)\
# Fit function internally will calc mode value

siForCountryColumn.fit(features[:,[0]])

# Step4: Transform the columns
# Search for np.nan and replace it with calc mode value

features[:,[0]] = siForCountryColumn.transform(features[:,[0]])


In [9]:
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['France', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [11]:
siForCountryColumn.statistics_

array(['France'], dtype=object)

In [12]:
# Create Individual Objects for each strategy
# Code to Impute Age Column


# Step1: Import relevant package

from sklearn.impute import SimpleImputer

# Step2: Instantiate an object for SimpleImputer

siForAgeColumn = SimpleImputer(strategy='median' , missing_values=np.nan)

# Step3: Fit the object with the data column (Country column)\
# Fit function internally will calc mode value

siForAgeColumn.fit(features[:,[1]])

# Step4: Transform the columns
# Search for np.nan and replace it with calc mode value

features[:,[1]] = siForAgeColumn.transform(features[:,[1]])

In [13]:
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', 38.0, 52000.0],
       ['France', 48.0, 79000.0],
       ['France', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [14]:
# Create Individual Objects for each strategy
# Code to Impute Salary Column


# Step1: Import relevant package

from sklearn.impute import SimpleImputer

# Step2: Instantiate an object for SimpleImputer

siForSalaryColumn = SimpleImputer(strategy='mean' , missing_values=np.nan)

# Step3: Fit the object with the data column (Country column)\
# Fit function internally will calc mode value

siForSalaryColumn.fit(features[:,[2]])

# Step4: Transform the columns
# Search for np.nan and replace it with calc mode value

features[:,[2]] = siForSalaryColumn.transform(features[:,[2]])

In [15]:
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.0, 52000.0],
       ['France', 48.0, 79000.0],
       ['France', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# **Dealing with Categorical Data**

In [16]:
# Dealing with Categorical data
# Convert the column into Dummy Variables
# The same is achievable in pandas using get_dummies()

# For sci-kit learn, OneHotEncoder which is the part of preprocessing package of Sci-kit learn

from sklearn.preprocessing import OneHotEncoder
oheForCountry = OneHotEncoder(sparse=False)
fCountry = oheForCountry.fit_transform(features[:,[0]])

In [17]:
fCountry

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [18]:
finalFeatureSet = np.concatenate((fCountry,features[:,[1,2]]) , axis = 1)
finalFeatureSet

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.0, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [19]:
oheForCountry.categories_

[array(['France', 'Germany', 'Spain'], dtype=object)]