# Encoding

In [1]:
import pandas as pd
df = pd.read_csv('supershops.csv')

In [2]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
Marketing Spend    50 non-null float64
Administration     50 non-null float64
Transport          49 non-null float64
Area               50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


# Missing Data Handle
We drop the row with NaN value if there is more than one NaN value for a single row

In [4]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [5]:
df.Transport.head()

0    471784.10
1    443898.53
2    407934.54
3    383199.62
4    366168.42
Name: Transport, dtype: float64

In [6]:
df.Transport.mean() #Replace NaN value with mean

215331.73244897963

In [7]:
df.Transport.median() # Replace NaN value with median or we can just drop the row

214634.81

In [8]:
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()
df4 = df.copy()
df5 = df.copy()

In [9]:
df1.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [10]:
df1 = df1.dropna() #Droping all the NaN value

In [11]:
df1.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [12]:
df2.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [13]:
df2 = df2.dropna()

In [14]:
#inplace 
df3.shape

(50, 5)

In [15]:
df3.dropna(inplace=False, axis = 0) #False = will not update row, axis = 0: delete row
df3.dropna(inplace=False, axis = 1) #True = will not update row, axis = 1: delete column
#df3.dropna(inplace=True) #will update row

Unnamed: 0,Marketing Spend,Administration,Area,Profit
0,114523.61,136897.8,Dhaka,192261.83
1,162597.7,151377.59,Ctg,191792.06
2,153441.51,101145.55,Rangpur,191050.39
3,144372.41,118671.85,Dhaka,182901.99
4,142107.34,91391.77,Rangpur,166187.94
5,131876.9,99814.71,Dhaka,156991.12
6,134615.46,147198.87,Ctg,156122.51
7,130298.13,145530.06,Rangpur,155752.6
8,120542.52,148718.95,Dhaka,152211.77
9,123334.88,108679.17,Ctg,149759.96


# Filling NaN value with mean value

In [16]:
df3.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [17]:
df3.Transport = df3.Transport.fillna(df3.Transport.mean()) #will fill the nun value with the mean

In [18]:
df3.shape

(50, 5)

In [19]:
df4 = df3.copy()
df5 = df3.copy()

# Without Encoding Techniques

In [20]:
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [21]:
df3.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [22]:
df3.Area = df3.Area.replace(['Dhaka', 'Ctg', 'Rangpur'],[1,2,3])

In [23]:
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,2,191792.06
2,153441.51,101145.55,407934.54,3,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,3,166187.94


# Label Encoder

In [24]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [25]:
df4.Area = le.fit_transform(df4.Area)

In [26]:
df4.Area.head()

0    1
1    0
2    2
3    1
4    2
Name: Area, dtype: int32

In [27]:
df5.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

# Label Encoder using loop

In [28]:
df6 = df5.copy()
df7 = df5.copy()

In [29]:
import numpy as np
#import warnings 
#warnings.filterwarnings('ignore') # for ignoring warning
df5.columns

Index(['Marketing Spend', 'Administration', 'Transport', 'Area', 'Profit'], dtype='object')

In [30]:
for column in df5.columns:
    if df5[column].dtype == np.number:
        continue
    df5[column] = le.fit_transform(df5[column])

In [31]:
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [32]:
#another way to ignore warning
from pandas.core.dtypes.common import is_numeric_dtype

In [33]:
df6.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [34]:
for column in df5.columns:
    if is_numeric_dtype(df6[column]):
        continue
    df6[column] = le.fit_transform(df6[column])

In [35]:
df6.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# One-Hot-Encoder

In [36]:
df7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [37]:
df8 = df7.copy()
df9 = df7.copy()
df10 = df7.copy()

In [38]:
dummy = pd.get_dummies(df7['Area'], prefix = 'Area', drop_first = True)

In [39]:
dummy.head()

Unnamed: 0,Area_Dhaka,Area_Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [40]:
df7 = df7.drop('Area', axis = 1)

In [41]:
df7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [42]:
new_df = pd.concat([df7,dummy],axis=1)

In [43]:
new_df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


In [44]:
y = df7.Profit
x = df7.drop('Profit', axis =1)

In [45]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [46]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [47]:
df8.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [48]:
#col = ['Area']
#for column in col:
#    one = pd.get_dummies(df8['Area'])
#    df8 = pd.concat([df8,one],axis=1).drop('Area',axis=1)

In [49]:
one = pd.get_dummies(df8['Area'])
df8 = pd.concat([df8,one],axis=1).drop('Area',axis=1)

In [50]:
df8.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Ctg,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,192261.83,0,1,0
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,0,1
3,144372.41,118671.85,383199.62,182901.99,0,1,0
4,142107.34,91391.77,366168.42,166187.94,0,0,1


# Ordinal Encoding

In [51]:
df9.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [52]:
city = ['Dhaka', 'Ctg', 'Rangpur']
from sklearn.preprocessing import OrdinalEncoder
ordinal = OrdinalEncoder(categories=[city])

In [53]:
encoded = ordinal.fit_transform(df9[['Area']])

In [54]:
en = pd.DataFrame(encoded,columns=['Area'])

In [55]:
en.head()

Unnamed: 0,Area
0,0.0
1,1.0
2,2.0
3,0.0
4,2.0


In [56]:
df9.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [57]:
df9 = df9.drop(['Area'],axis=1)
df9 = pd.concat([df9,en],axis=1)

In [58]:
df9.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area
0,114523.61,136897.8,471784.1,192261.83,0.0
1,162597.7,151377.59,443898.53,191792.06,1.0
2,153441.51,101145.55,407934.54,191050.39,2.0
3,144372.41,118671.85,383199.62,182901.99,0.0
4,142107.34,91391.77,366168.42,166187.94,2.0


In [60]:
df10.head()
df11 = df10.copy()

In [61]:
df10.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


# Ordinal Encoding with loop

In [62]:
#using loop
cols = ['Area']
for col in cols:
    unique=df10[col].unique()
    df10[col]=OrdinalEncoder(categories=[unique]).fit_transform(df10[[col]])

In [63]:
df10.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0.0,192261.83
1,162597.7,151377.59,443898.53,1.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,0.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94
