## Data Cleaning

### One hot encoding

In [2]:
# importing resources
import numpy as np
import pandas as pd

In [3]:
# loading dataset
df = pd.read_csv(r"C:\DS\tips\tips.csv")

In [4]:
df.shape

(244, 7)

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
# getting dummy variable
dummy_df = pd.get_dummies(df)
dummy_df.head()


Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,1,0,1,0,0,0,1,0,1,0
1,10.34,1.66,3,0,1,1,0,0,0,1,0,1,0
2,21.01,3.5,3,0,1,1,0,0,0,1,0,1,0
3,23.68,3.31,2,0,1,1,0,0,0,1,0,1,0
4,24.59,3.61,4,1,0,1,0,0,0,1,0,1,0


In [7]:
# we get some redundant columns using this method - called k dummy variable
# let us remove 1 - called k-1 dummy variables
dummy_df = pd.get_dummies(df, drop_first=True)
dummy_df.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,16.99,1.01,2,0,0,0,1,0,0
1,10.34,1.66,3,1,0,0,1,0,0
2,21.01,3.5,3,1,0,0,1,0,0
3,23.68,3.31,2,1,0,0,1,0,0
4,24.59,3.61,4,0,0,0,1,0,0


### One-Hot Encoding with Scikit Learn

In [8]:
# import resources
from sklearn.preprocessing import OneHotEncoder

In [27]:
ohe = OneHotEncoder(sparse=False, drop="first") # no sparse matching, with = F we will get 2D numpy array

In [28]:
ohe_df = ohe.fit_transform(df[['sex', 'smoker', 'day', 'time']])

In [29]:
ohe_df

array([[0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       ...,
       [1., 1., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [30]:
# copying column names
dummy_df = pd.get_dummies(df, drop_first = True)
dummy_df.keys()

Index(['total_bill', 'tip', 'size', 'sex_Male', 'smoker_Yes', 'day_Sat',
       'day_Sun', 'day_Thur', 'time_Lunch'],
      dtype='object')

In [32]:
# converting it to df
ohe_df2 = pd.DataFrame(ohe_df, columns = ['sex_Male', 'smoker_Yes', 'day_Sat',
       'day_Sun', 'day_Thur', 'time_Lunch'])
ohe_df2.head()

Unnamed: 0,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0


### Label Encoding and Ordinal Encoding

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
# house price prediction data load
df = pd.read_csv(r"C:\DS\house\train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [35]:
# for tutorial, we will only work on 2 features
df2 = df[['KitchenQual', 'BldgType']]
df2.head()

Unnamed: 0,KitchenQual,BldgType
0,Gd,1Fam
1,TA,1Fam
2,Gd,1Fam
3,Gd,1Fam
4,Gd,1Fam


In [36]:
labelenc = LabelEncoder()

In [37]:
labelenc.fit_transform(df2['BldgType'])

array([0, 0, 0, ..., 0, 0, 0])

In [38]:
# we get array
df2['BldgType_encoded'] = labelenc.fit_transform(df2['BldgType'])
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['BldgType_encoded'] = labelenc.fit_transform(df2['BldgType'])


Unnamed: 0,KitchenQual,BldgType,BldgType_encoded
0,Gd,1Fam,0
1,TA,1Fam,0
2,Gd,1Fam,0
3,Gd,1Fam,0
4,Gd,1Fam,0


In [39]:
df2.tail()

Unnamed: 0,KitchenQual,BldgType,BldgType_encoded
1455,TA,1Fam,0
1456,TA,1Fam,0
1457,Gd,1Fam,0
1458,Gd,1Fam,0
1459,TA,1Fam,0


In [40]:
# we dont see alll rows, but let us see unique values
df2['BldgType_encoded'].value_counts()

0    1220
4     114
2      52
3      43
1      31
Name: BldgType_encoded, dtype: int64

In [41]:
# let us look at order labeling with Kitchen Quality example
df2['KitchenQual'].unique()

array(['Gd', 'TA', 'Ex', 'Fa'], dtype=object)

In [42]:
# it is excellent, good, typical/average, fair - so ordinal var
# define label using dictionary
ord_lbl = {"Ex":4, "Gd":3, "TA":2, "Fa":1}

In [43]:
df2['KitchenQual_ord_enc'] = df2['KitchenQual'].map(ord_lbl)
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['KitchenQual_ord_enc'] = df2['KitchenQual'].map(ord_lbl)


Unnamed: 0,KitchenQual,BldgType,BldgType_encoded,KitchenQual_ord_enc
0,Gd,1Fam,0,3
1,TA,1Fam,0,2
2,Gd,1Fam,0,3
3,Gd,1Fam,0,3
4,Gd,1Fam,0,3


### using ordinal encoder from scikkit learn

In [44]:
# resources
from sklearn.preprocessing import OrdinalEncoder

In [45]:
oe = OrdinalEncoder()

In [52]:
df2['KitchenQual_scikit_ordenc'] = oe.fit_transform(df2[['KitchenQual']])
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['KitchenQual_scikit_ordenc'] = oe.fit_transform(df2[['KitchenQual']])


Unnamed: 0,KitchenQual,BldgType,BldgType_encoded,KitchenQual_ord_enc,KitchenQual_scikit_ordenc
0,Gd,1Fam,0,3,2.0
1,TA,1Fam,0,2,3.0
2,Gd,1Fam,0,3,2.0
3,Gd,1Fam,0,3,2.0
4,Gd,1Fam,0,3,2.0
