# Normalization of the datasets is very crucial for further statistical analysis

## Best practice is to use CIT to get a sample distribution of means


## However there are mainly three techniques to normalize a column


### 1. value/max value of that column
### 2. value - min(values of that column)/max-min 
### 3. mean(column)/column.std() -- Z score

In [1]:
#let us import a dataset and normalize the column

import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv("../automobile_formatted.csv")

In [9]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,0,3,99.54,alfa-romero,gas,std,two,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,1,3,99.54,alfa-romero,gas,std,two,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,2,1,99.54,alfa-romero,gas,std,two,hatchback,rwd,front,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,3,2,164.0,audi,gas,std,four,sedan,fwd,front,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,4,2,164.0,audi,gas,std,four,sedan,4wd,front,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
5,5,2,99.54,audi,gas,std,two,sedan,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250
6,6,1,158.0,audi,gas,std,four,sedan,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
7,7,1,99.54,audi,gas,std,four,wagon,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,18920
8,8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
9,10,2,192.0,bmw,gas,std,two,sedan,rwd,front,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


# Let us normalize the prices



In [11]:
df["price"] = df["price"]/df["price"].max()

In [12]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,0,3,99.54,alfa-romero,gas,std,two,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,0.297247
1,1,3,99.54,alfa-romero,gas,std,two,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,0.363436
2,2,1,99.54,alfa-romero,gas,std,two,hatchback,rwd,front,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,0.363436
3,3,2,164.0,audi,gas,std,four,sedan,fwd,front,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,0.307269
4,4,2,164.0,audi,gas,std,four,sedan,4wd,front,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,0.384361
5,5,2,99.54,audi,gas,std,two,sedan,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,0.335903
6,6,1,158.0,audi,gas,std,four,sedan,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,0.390088
7,7,1,99.54,audi,gas,std,four,wagon,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,0.41674
8,8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,0.525881
9,10,2,192.0,bmw,gas,std,two,sedan,rwd,front,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,0.361894


# Similarly let us normalize the normalized-losses using min max method

In [14]:
df["normalized-losses"] = (df["normalized-losses"]-df["normalized-losses"].min())/(df["normalized-losses"].max()-df["normalized-losses"].min())

In [15]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,0,3,0.180838,alfa-romero,gas,std,two,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,0.297247
1,1,3,0.180838,alfa-romero,gas,std,two,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,0.363436
2,2,1,0.180838,alfa-romero,gas,std,two,hatchback,rwd,front,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,0.363436
3,3,2,0.518325,audi,gas,std,four,sedan,fwd,front,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,0.307269
4,4,2,0.518325,audi,gas,std,four,sedan,4wd,front,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,0.384361
5,5,2,0.180838,audi,gas,std,two,sedan,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,0.335903
6,6,1,0.486911,audi,gas,std,four,sedan,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,0.390088
7,7,1,0.180838,audi,gas,std,four,wagon,fwd,front,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,0.41674
8,8,1,0.486911,audi,gas,turbo,four,sedan,fwd,front,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,0.525881
9,10,2,0.664921,bmw,gas,std,two,sedan,rwd,front,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,0.361894


In [20]:
#drop unecessary column

df.drop(0)

Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
1,1,3,0.180838,alfa-romero,gas,std,two,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,0.363436
2,2,1,0.180838,alfa-romero,gas,std,two,hatchback,rwd,front,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,0.363436
3,3,2,0.518325,audi,gas,std,four,sedan,fwd,front,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,0.307269
4,4,2,0.518325,audi,gas,std,four,sedan,4wd,front,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,0.384361
5,5,2,0.180838,audi,gas,std,two,sedan,fwd,front,...,136,mpfi,3.19,3.40,8.5,110,5500,19,25,0.335903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,200,-1,0.157068,volvo,gas,std,four,sedan,rwd,front,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,0.371035
197,201,-1,0.157068,volvo,gas,turbo,four,sedan,rwd,front,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,0.419493
198,202,-1,0.157068,volvo,gas,std,four,sedan,rwd,front,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,0.473238
199,203,-1,0.157068,volvo,diesel,turbo,four,sedan,rwd,front,...,145,idi,3.01,3.40,23.0,106,4800,26,27,0.494934
