# Data Preprocessing Part I

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Features Scaling

In [4]:
df = sns.load_dataset('mpg')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [5]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


### Note that all the features are of different scale, weight is in thousands and horsepower is around 100

### Some model may not be smart enough and mistakenly assume weight is a more important factor simply because they are huge number and therefore a small change in weight may give arise to a bigger impact in the target variable

### An example of such a model would be in K-Means clustering where we calculate a Euclidean distance between two data point to capture how similar the two data points are. In the above example, the similarity meansure will be dominate by the weights, other attributes such as the number of cylinders will have almost no impact.

### One may want to "standardize" all features to be around the same magnitude. To do that we use the StandardScalar

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [7]:
numerical_features = df[["cylinders","displacement","horsepower","weight", "acceleration"]]
numerical_features.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
0,8,307.0,130.0,3504,12.0
1,8,350.0,165.0,3693,11.5
2,8,318.0,150.0,3436,11.0
3,8,304.0,150.0,3433,12.0
4,8,302.0,140.0,3449,10.5


In [8]:
scaler = StandardScaler()
normalized_df = scaler.fit_transform(numerical_features)
normalized_df = pd.DataFrame(normalized_df, index = df.index, columns = ["cylinders","displacement","horsepower","weight", "acceleration"] )
normalized_df.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
0,1.498191,1.090604,0.664133,0.63087,-1.295498
1,1.498191,1.503514,1.574594,0.854333,-1.477038
2,1.498191,1.196232,1.184397,0.55047,-1.658577
3,1.498191,1.061796,1.184397,0.546923,-1.295498
4,1.498191,1.042591,0.924265,0.565841,-1.840117


In [9]:
normalized_df.describe()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
count,398.0,398.0,392.0,398.0,398.0
mean,-5.171742e-16,-8.312725000000001e-17,-4.392745e-16,-9.902743000000001e-17,8.982206000000001e-17
std,1.001259,1.001259,1.001278,1.001259,1.001259
min,-1.444949,-1.204411,-1.520975,-1.604943,-2.747814
25%,-0.8563206,-0.8563178,-0.7665929,-0.8828266,-0.6328794
50%,-0.8563206,-0.431404,-0.2853488,-0.1973624,-0.02472221
75%,1.498191,0.6584879,0.56008,0.7538337,0.5834349
max,1.498191,2.511784,3.265452,2.565185,3.351912


### Now all features has mean of 0 and standard deviation of 1

In [10]:
minMaxScaler = MinMaxScaler()
minMax_df = minMaxScaler.fit_transform(numerical_features)
minMax_df = pd.DataFrame(minMax_df, columns = ["cylinders","displacement","horsepower","weight", "acceleration"] )
minMax_df.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
0,1.0,0.617571,0.456522,0.53615,0.238095
1,1.0,0.728682,0.646739,0.589736,0.208333
2,1.0,0.645995,0.565217,0.51687,0.178571
3,1.0,0.609819,0.565217,0.516019,0.238095
4,1.0,0.604651,0.51087,0.520556,0.14881


In [49]:
minMax_df.describe()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
count,398.0,398.0,392.0,398.0,398.0
mean,0.490955,0.324098,0.317768,0.384867,0.450482
std,0.340201,0.269431,0.209191,0.240103,0.164148
min,0.0,0.0,0.0,0.0,0.0
25%,0.2,0.093669,0.157609,0.173164,0.346726
50%,0.2,0.20801,0.258152,0.337539,0.446429
75%,1.0,0.501292,0.434783,0.565637,0.546131
max,1.0,1.0,1.0,1.0,1.0


## Categorical Variables

In [50]:
df = pd.read_csv("data/Salaries.csv")

In [52]:
df.head(8)

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500
5,AssocProf,B,6,6,Male,97000
6,Prof,B,30,23,Male,175000
7,Prof,B,45,45,Male,147765


In [53]:
df["rank"].unique()

array(['Prof', 'AsstProf', 'AssocProf'], dtype=object)

In [54]:
df["sex"].unique()

array(['Male', 'Female'], dtype=object)

### Ordinal Encoding

In [55]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()

In [56]:
result = encoder.fit_transform(df[['rank']])
result

array([[2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],
       [0.],
       [1.],
       [1.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [1.],
       [1.],
       [2.],
       [2.],
       [1.],
       [2.],
       [1.],
       [1.],
       [1.],
       [2.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [1.],
       [2.],
       [0.],
       [0.],
       [2.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],

In [59]:
encoder.categories_

[array(['AssocProf', 'AsstProf', 'Prof'], dtype=object)]

In [58]:
df['numerical_rank'] = result
df.head(8)

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary,numerical_rank
0,Prof,B,19,18,Male,139750,2.0
1,Prof,B,20,16,Male,173200,2.0
2,AsstProf,B,4,3,Male,79750,1.0
3,Prof,B,45,39,Male,115000,2.0
4,Prof,B,40,41,Male,141500,2.0
5,AssocProf,B,6,6,Male,97000,0.0
6,Prof,B,30,23,Male,175000,2.0
7,Prof,B,45,45,Male,147765,2.0


### One-hot Encoding

In [28]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
cat_1hot = cat_encoder.fit_transform(df[['rank']])

In [29]:
cat_1hot.toarray()

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [30]:
cat_encoder.categories_

[array(['AssocProf', 'AsstProf', 'Prof'], dtype=object)]

In [33]:
cat_1hot_df = pd.DataFrame(cat_1hot.toarray(), columns = cat_encoder.categories_)
cat_1hot_df.head()

Unnamed: 0,AssocProf,AsstProf,Prof
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [63]:
combined_df = pd.concat([df, cat_1hot_df], axis=1)
combined_df.head(8)

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary,numerical_rank,"(AssocProf,)","(AsstProf,)","(Prof,)"
0,Prof,B,19,18,Male,139750,2.0,0.0,0.0,1.0
1,Prof,B,20,16,Male,173200,2.0,0.0,0.0,1.0
2,AsstProf,B,4,3,Male,79750,1.0,0.0,1.0,0.0
3,Prof,B,45,39,Male,115000,2.0,0.0,0.0,1.0
4,Prof,B,40,41,Male,141500,2.0,0.0,0.0,1.0
5,AssocProf,B,6,6,Male,97000,0.0,1.0,0.0,0.0
6,Prof,B,30,23,Male,175000,2.0,0.0,0.0,1.0
7,Prof,B,45,45,Male,147765,2.0,0.0,0.0,1.0


### Dummy Variables

In [62]:
dummy = pd.get_dummies(df['rank'])
dummy.head(8)

Unnamed: 0,AssocProf,AsstProf,Prof
0,0,0,1
1,0,0,1
2,0,1,0
3,0,0,1
4,0,0,1
5,1,0,0
6,0,0,1
7,0,0,1


In [64]:
type(dummy)

pandas.core.frame.DataFrame

In [66]:
dummy = pd.get_dummies(df[['sex','rank']])
dummy.head(8)

Unnamed: 0,sex_Female,sex_Male,rank_AssocProf,rank_AsstProf,rank_Prof
0,0,1,0,0,1
1,0,1,0,0,1
2,0,1,0,1,0
3,0,1,0,0,1
4,0,1,0,0,1
5,0,1,1,0,0
6,0,1,0,0,1
7,0,1,0,0,1
