- Machine learning algorithms tend to perform better or converge faster when the different features (variables) are on a similar, small scale.
- Therefore it is common practice to normalize the data before training machine learning models on it.

# Using normalize() from sklearn
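- We can see that all the values now lie between 0 and 1. By default, normalize() rescales each row (sample) to unit L2 norm, i.e. it divides every value in the row by the square root of the row's sum of squares; the results fall in the range 0 to 1 here because the inputs are non-negative.
- The scaling is applied row-wise (per sample), which is why the 1-D array is wrapped in a list to form a single row.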

In [1]:
from sklearn import preprocessing
import numpy as np
# Sample values to normalize.
x_array = np.array([2, 3, 5, 6, 7, 4, 8, 7, 6])

# normalize() expects 2-D input, so the 1-D array is wrapped in a list to
# form a single row; that row is then rescaled as a whole.
normalized_arr = preprocessing.normalize(X=[x_array])
print(normalized_arr)

[[0.11785113 0.1767767  0.29462783 0.35355339 0.41247896 0.23570226
  0.47140452 0.41247896 0.35355339]]


- We can also normalize a column of a dataset using this method, by passing the column as a single row.

In [None]:
import pandas as pd
# Load the California housing training set into a DataFrame.
# NOTE(review): the /content/sample_data path looks like the Google Colab
# sample-data folder — adjust it when running elsewhere.
housing = pd.read_csv("/content/sample_data/california_housing_train.csv")
# A bare expression on the last line of a cell renders the DataFrame.
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


We need to pick a column and convert it into an array. We are going to use the 'total_bedrooms' column.

In [None]:
from sklearn import preprocessing
# Pull the 'total_bedrooms' column out of the DataFrame as an independent
# NumPy array (a copy, so the DataFrame is never aliased).
x_array = housing['total_bedrooms'].to_numpy(copy=True)

# Wrap the column in a list so it is treated as one row and rescaled as a
# whole by normalize().
normalized_arr = preprocessing.normalize(X=[x_array])
print(normalized_arr)

[[0.01437454 0.02129852 0.00194947 ... 0.00594924 0.00618453 0.00336115]]


Using MinMaxScaler() to Normalize Data in Python

In [2]:
from sklearn import preprocessing
import pandas as pd
# Reload the housing data for this cell.
housing = pd.read_csv("/content/sample_data/california_housing_train.csv")
names = housing.columns  # keep the labels to re-attach after scaling

# Min-max scaling with the default feature range: learn each column's
# minimum and maximum, then rescale every column with them.
scaler = preprocessing.MinMaxScaler()
scaler.fit(housing)
d = scaler.transform(housing)

# Re-attach the column labels and preview the first rows.
scaled_df = pd.DataFrame(data=d, columns=names)
scaled_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,1.0,0.175345,0.27451,0.147885,0.198945,0.028364,0.077454,0.06853,0.107012
1,0.984064,0.197662,0.352941,0.201608,0.294848,0.031559,0.075974,0.09104,0.134228
2,0.9751,0.12221,0.313725,0.018927,0.026847,0.009249,0.019076,0.079378,0.145775
3,0.974104,0.116897,0.254902,0.039515,0.052142,0.01435,0.037,0.185639,0.120414
4,0.974104,0.109458,0.372549,0.038276,0.050435,0.017405,0.042921,0.098281,0.104125


**MinMaxScaler** also gives you the option to select the feature range. By default, the range is set to (0,1). Let’s see how to change the range to (0,2).

In [None]:
from sklearn import preprocessing
import pandas as pd
# Reload the housing data for this cell.
housing = pd.read_csv("/content/sample_data/california_housing_train.csv")
names = housing.columns  # keep the labels to re-attach after scaling

# Same min-max scaling as before, but with the target range widened from
# the default to (0, 2).
scaler = preprocessing.MinMaxScaler(feature_range=(0, 2))
scaler.fit(housing)
d = scaler.transform(housing)

# Re-attach the column labels and preview the first rows.
scaled_df = pd.DataFrame(data=d, columns=names)
scaled_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,2.0,0.350691,0.54902,0.295769,0.39789,0.056728,0.154909,0.13706,0.214024
1,1.968127,0.395324,0.705882,0.403216,0.589696,0.063118,0.151949,0.18208,0.268457
2,1.950199,0.244421,0.627451,0.037854,0.053693,0.018498,0.038152,0.158756,0.291549
3,1.948207,0.233794,0.509804,0.07903,0.104283,0.0287,0.074001,0.371278,0.240828
4,1.948207,0.218916,0.745098,0.076552,0.100869,0.03481,0.085841,0.196563,0.208251


Data Reduction

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

In [None]:
# Load the California housing training set that the PCA cells below use.
housing = pd.read_csv("/content/sample_data/california_housing_train.csv")
# A bare expression on the last line of a cell renders the DataFrame.
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


In [None]:
# Convert the whole DataFrame into a plain 2-D NumPy array
# (one row per record, one column per DataFrame column).
x = housing.to_numpy()
x

array([[-1.1431e+02,  3.4190e+01,  1.5000e+01, ...,  4.7200e+02,
         1.4936e+00,  6.6900e+04],
       [-1.1447e+02,  3.4400e+01,  1.9000e+01, ...,  4.6300e+02,
         1.8200e+00,  8.0100e+04],
       [-1.1456e+02,  3.3690e+01,  1.7000e+01, ...,  1.1700e+02,
         1.6509e+00,  8.5700e+04],
       ...,
       [-1.2430e+02,  4.1840e+01,  1.7000e+01, ...,  4.5600e+02,
         3.0313e+00,  1.0360e+05],
       [-1.2430e+02,  4.1800e+01,  1.9000e+01, ...,  4.7800e+02,
         1.9797e+00,  8.5800e+04],
       [-1.2435e+02,  4.0540e+01,  5.2000e+01, ...,  2.7000e+02,
         3.0147e+00,  9.4600e+04]])

In [None]:
# Standardize the feature matrix with sklearn's scale() before running PCA
# (by default it centers each column and scales it to unit variance).
# Note that this rebinds x to the standardized array.
x=scale(x)
x

array([[ 2.619365  , -0.67152023, -1.07967114, ..., -0.07599796,
        -1.25254316, -1.21055773],
       [ 2.53956878, -0.57326437, -0.76187201, ..., -0.09940441,
        -1.08148298, -1.09674535],
       [ 2.4946834 , -0.90546278, -0.92077158, ..., -0.99925206,
        -1.17010515, -1.04846131],
       ...,
       [-2.36291168,  2.90780067, -0.92077158, ..., -0.11760942,
        -0.44666313, -0.89412482],
       [-2.36291168,  2.88908527, -0.76187201, ..., -0.06039367,
        -0.99778717, -1.04759909],
       [-2.387848  ,  2.29955006,  1.85997083, ..., -0.60134255,
        -0.45536288, -0.97172417]])

In [None]:
# Fit PCA with every component kept, then report the share of the total
# variance explained by each principal component.
pca = PCA().fit(x)
print(pca.explained_variance_ratio_)

[0.43517365 0.2136394  0.18906653 0.10105355 0.03226944 0.01540065
 0.00691552 0.0049036  0.00157766]


In [None]:
# Keep only the first five principal components and report the share of
# the total variance each of them explains.
pca5 = PCA(n_components=5).fit(x)
print(pca5.explained_variance_ratio_)

[0.43517365 0.2136394  0.18906653 0.10105355 0.03226944]


In [None]:
# Print the component matrix of the fitted 5-component PCA.
# NOTE: the NameError recorded below means pca5 was not defined in the
# session when this cell ran — run the cell that fits pca5 first.
print(pca5.components_)

NameError: ignored

# Data Integration

In [None]:
import pandas as pd
import numpy as np
# Alternative absolute paths for the two datasets, kept commented out:
# dataset1="/content/student.csv"
# dataset2="/content/mark.csv"
# The files are read relative to the current working directory; the
# FileNotFoundError recorded below means they were not present there.
# header=0 takes the column names from the first row of each file.
df1 = pd.read_csv("student.csv",header=0)
df2 = pd.read_csv("mark.csv",header=0)

FileNotFoundError: ignored

**The student dataset contains columns such as Age, Gender, Grade, and Employed.**

In [None]:
# Display the DataFrame loaded from student.csv.
df1

Unnamed: 0,Student_id,Age,Gender,Grade,Employed
0,1,19,Male,1st Class,yes
1,2,20,Female,2nd Class,no
2,3,18,Male,1st Class,no
3,4,21,Female,2nd Class,no
4,5,19,Male,1st Class,no
...,...,...,...,...,...
227,228,21,Female,1st Class,no
228,229,20,Male,2nd Class,no
229,230,20,Male,3rd Class,yes
230,231,19,Female,1st Class,yes


**The mark.csv dataset contains columns such as Mark and City.**

In [None]:
# Display the DataFrame loaded from mark.csv.
df2

Unnamed: 0,Student_id,Mark,City
0,1,95,Chennai
1,2,70,Delhi
2,3,98,Mumbai
3,4,75,Pune
4,5,89,Kochi
...,...,...,...
227,228,99,Pune
228,229,70,Chennai
229,230,55,Delhi
230,231,97,Mumbai


**The Student_id column is common between the two datasets.**

Perform data integration on both the DataFrames with respect to the Student_id column using the pd.merge() function:

In [None]:
# Combine the two DataFrames on their shared Student_id key.
# how='inner' is pd.merge's default, spelled out here for clarity: only
# Student_id values present in both frames are kept.
df = pd.merge(left=df1, right=df2, on='Student_id', how='inner')

In [None]:
# Display the merged DataFrame.
df

Unnamed: 0,Student_id,Age,Gender,Grade,Employed,Mark,City
0,1,19,Male,1st Class,yes,95,Chennai
1,2,20,Female,2nd Class,no,70,Delhi
2,3,18,Male,1st Class,no,98,Mumbai
3,4,21,Female,2nd Class,no,75,Pune
4,5,19,Male,1st Class,no,89,Kochi
...,...,...,...,...,...,...,...
227,228,21,Female,1st Class,no,99,Pune
228,229,20,Male,2nd Class,no,70,Chennai
229,230,20,Male,3rd Class,yes,55,Delhi
230,231,19,Female,1st Class,yes,97,Mumbai
