# Dealing With Categorical Values

## Importing Libraries

In [20]:
import pandas as pd
import numpy as np


## Importing Dataset

In [21]:
# Read the salary CSV (expected next to this notebook) into the raw frame.
DATA_PATH = "Salary_Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [22]:
# Preview up to the first 40 rows of the raw frame.
df.head(n=40)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No
5,Dubai,56642.0,2.9,No
6,Canada,60150.0,3.0,Yes
7,Australia,54445.0,3.2,No
8,Dubai,64445.0,3.2,Yes
9,Dubai,57189.0,3.7,No


# Let's Perform Encoding

## Applying One-Hot Encoding

<h3> First Do it with pandas </h3>

In [23]:
# One-hot encode `country` with pandas: `get_dummies` produces one indicator
# column per distinct value found in the column.
country_column = df['country']
country_dummy = pd.get_dummies(country_column)
country_dummy

Unnamed: 0,Australia,Canada,Dubai,USA
0,False,False,True,False
1,False,True,False,False
2,False,True,False,False
3,False,True,False,False
4,False,False,False,True
5,False,False,True,False
6,False,True,False,False
7,True,False,False,False
8,False,False,True,False
9,False,False,True,False


In [24]:
# Append the indicator columns to the right of the original frame
# (column-wise concatenation, rows aligned on the index).
dataset = pd.concat(objs=[df, country_dummy], axis="columns")

In [25]:
# Show the first five rows of the combined frame.
dataset.head(n=5)

Unnamed: 0,country,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,Dubai,39343.0,1.1,No,False,False,True,False
1,Canada,46205.0,1.3,Yes,False,True,False,False
2,Canada,37731.0,1.5,No,False,True,False,False
3,Canada,43525.0,2.0,No,False,True,False,False
4,USA,39891.0,2.2,No,False,False,False,True


In [26]:
# Remove the original text column now that its one-hot indicator columns are
# part of the frame. `drop` returns a new frame, so rebind the name instead of
# mutating with `inplace=True` (which alters an object that an earlier cell
# already displayed and blocks method chaining).
dataset = dataset.drop(columns='country')
dataset.head()

Unnamed: 0,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,39343.0,1.1,No,False,False,True,False
1,46205.0,1.3,Yes,False,True,False,False
2,37731.0,1.5,No,False,True,False,False
3,43525.0,2.0,No,False,True,False,False
4,39891.0,2.2,No,False,False,False,True


<h3> Let's do it with Scikit-Learn </h3>
Also, restart your kernel and re-run the cells above if you are using the same notebook.


In [33]:
# Show the first five rows of the working frame.
dataset.head(n=5)

Unnamed: 0,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,39343.0,1.1,0,False,False,True,False
1,46205.0,1.3,1,False,True,False,False
2,37731.0,1.5,0,False,True,False,False
3,43525.0,2.0,0,False,True,False,False
4,39891.0,2.2,0,False,False,False,True


In [31]:
# Start with label encoding.
# `Purchased` is label-encoded because it holds only two distinct values,
# so a single integer-coded column is enough to represent it.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
purchased_codes = le.fit_transform(dataset['Purchased'])
dataset['Purchased'] = purchased_codes

In [32]:
# Inspect the first five rows after encoding `Purchased`.
dataset.head(n=5)

Unnamed: 0,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,39343.0,1.1,0,False,False,True,False
1,46205.0,1.3,1,False,True,False,False
2,37731.0,1.5,0,False,True,False,False
3,43525.0,2.0,0,False,True,False,False
4,39891.0,2.2,0,False,False,False,True


In [1]:
# One-hot encode the `country` column with scikit-learn.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Select the column by name. `dataset` had `country` dropped in the pandas
# section, so positional index 0 there is `Salary`, not `country`.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['country'])],
    remainder='passthrough',
)
# Start from the original `df`, which still holds `country`, and carry over the
# label-encoded `Purchased` values. Reading from `df` instead of overwriting the
# cell's own input also keeps this cell re-runnable.
encoder_input = df.assign(Purchased=le.fit_transform(df['Purchased']))
# The transformer returns an unlabeled array: the encoded country columns come
# first, followed by the passthrough columns in their original order.
dataset = ct.fit_transform(encoder_input)

In [29]:
#Lets link the headers name.