In [5]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

In [6]:
df = pd.read_csv("housing.csv")
print(df.shape)
df.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Custom Transformers

For information about Duck Typing, click [here](https://www.geeksforgeeks.org/duck-typing-in-python/).

The following cell has a lot going on - let's break it down here:

We begin by importing some base classes, from which our CombinedAttributesAdder Class will inherit from.  Next, we specify the column indices of the total_rooms, total_bedrooms, population, and households features in the housing dataframe.  After that, we define the class, which inherits from the two base classes.  The [\_\_init\_\_](https://stackoverflow.com/questions/625083/what-init-and-self-do-in-python) method serves as the class constructors and allows us to instantiate objects of the class.  There is only one class attribute (or data member) defined, but remember that others could be inherited from the base classes!  The fit method does nothing, but it must exist for use by Pipelines (discussed below).  The transform method creates our new features and returns them.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


# column index
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

In [None]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [None]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"],
    index=housing.index)

housing_extra_attribs.head()

## Scale the Data
### 2 ways to scale Data:

### *** DO NOT SCALE Y_Train & Y_TEST, 

##### **** scale only X_train, X_test & X_validation 

## Fit only on X_train
    * scaler = StandardScaler()
    * scaler.fit(X_train)
    * X_train = scaler.transform(X_train)
    * X_eval = scaler.transform(X_validation)
    * X_test = scaler.transform(X_test)

#### 1. The [StandardScalar](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) class is used to standardize (or normalize) features by removing the mean and scaling to unit variance.
    -- Will always have a mean of 0 & variance unit of 1 aka standard deviation
    -- No bound limit like min_max which might be a problem for some algorithms such as neural networks because they expect values to be bound between 0 to 1
    -- Much less affected by outliers
    -- calculated as Z = (x - u_mean) / standard deviation![image-2.png](attachment:image-2.png)



#### 2. The MinMax Scalar is used to normalize the features between 0 & 1
*** Outliers will affect as formula used is
    -- Values are shifted & rescaled so that they end up raning from 0 to 1
    -- Sometimes the range might be -1 to 1
    -- calculated as Z = ( x - x_min) / (x_max - x_min)
    --sckit-learn -> min_max which will give you option to change range to desired amounts outside of the 0 to 1

## Transformation Pipelines

#### 1. Pipeline -> Apply transformation to numerical or categorial in a sequence 
#### 2. ColumnTransformert -> Apply the Pipleline created in step 1 to both Numerical & Categorical data

Sci-kit Learn's [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) class allows us to sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. The final estimator only needs to implement fit.


##  Pipeline -> Apply transformation to numerical or categorial in a sequence

In [8]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

num_pipeline = Pipeline([ 
    ('SimpleImputer', SimpleImputer(strategy = 'median')),
    ('StandardScaler ', StandardScaler()),
    ('MinMax AKA Normalization', MinMaxScaler())
    
])

In [10]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [11]:
df_numeric = df.drop('ocean_proximity', axis=1)
df_numeric.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [12]:
housing_num_transformed = num_pipeline.fit_transform(df_numeric)
housing_num_transformed

array([[-1.32783522,  1.05254828,  0.98214266, ..., -0.97703285,
         2.34476576,  2.12963148],
       [-1.32284391,  1.04318455, -0.60701891, ...,  1.66996103,
         2.33223796,  1.31415614],
       [-1.33282653,  1.03850269,  1.85618152, ..., -0.84363692,
         1.7826994 ,  1.25869341],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.17404163,
        -1.14259331, -0.99274649],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.39375258,
        -1.05458292, -1.05860847],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.07967221,
        -0.78012947, -1.01787803]])

## ColumnTransformer -> Apply the Pipleline created in step 1 to both Numerical & Categorical data

Let's bring it all together now by adding the scaled numerical featuers to the one-hot vectors we obtained previously through the use of the [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html) class, which transforms only columns of a DataFrame. 

 ##### Steps:
 
         1. Get the column names for Numerical & Categorical features
         2. Use the ColumnTransformer to apply the particular transformations for each type of feature
         3. Fit Transform the ColumnTransformer created

In [14]:
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']


### 1. Get the column names for Numerical & Categorical features

In [None]:
num_attribs = list(df_numeric)
cat_attribs = ["ocean_proximity"]

print(num_attribs)

### 2. Use the ColumnTransformer to apply the particular transformations for each type of feature

In [None]:

full_pipeline = ColumnTransformer([ 
    ('Numerical transformers', num_pipeline, num_attribs),
    ('Categorical transforerms', OneHotEncoder(), cat_attribs)
    
        ])

### 3. Fit Transform the ColumnTransformer created

In [15]:
housing_prepared = full_pipeline.fit_transform(df)
housing_prepared

array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87362627,  1.77823747, -0.84539315, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.        ,
         0.        ,  0.        ]])

In [16]:
df.shape

(20640, 10)

In [17]:
housing_prepared.shape

(20640, 14)

In [28]:
col_names = list(df.columns) 
col_names.append('ocean_proximity1')
col_names.append('ocean_proximity2')
col_names.append('ocean_proximity3')
col_names.append('ocean_proximity4')

In [29]:
col_names

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity',
 'ocean_proximity1',
 'ocean_proximity2',
 'ocean_proximity3',
 'ocean_proximity4']

In [24]:
df.index 

RangeIndex(start=0, stop=20640, step=1)

In [30]:
housing_transformed = pd.DataFrame(housing_prepared, columns=col_names, index=df.index) 
housing_transformed.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,ocean_proximity1,ocean_proximity2,ocean_proximity3,ocean_proximity4
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,0.0,0.0,0.0,1.0,0.0


### Applying just the MinMax scaler to see the difference between above which is the StandardScalre

In [32]:
minmax_num_pipeline = Pipeline([ 
    ('MinMax AKA Normalization', MinMaxScaler()) 
])

In [33]:
min_max_transformed = minmax_num_pipeline.fit_transform(df_numeric)
min_max_transformed

array([[0.21115538, 0.5674814 , 0.78431373, ..., 0.02055583, 0.53966842,
        0.90226638],
       [0.21215139, 0.565356  , 0.39215686, ..., 0.18697583, 0.53802706,
        0.70824656],
       [0.21015936, 0.5642933 , 1.        , ..., 0.02894261, 0.46602805,
        0.69505074],
       ...,
       [0.31175299, 0.73219979, 0.31372549, ..., 0.07104095, 0.08276438,
        0.15938285],
       [0.30179283, 0.73219979, 0.33333333, ..., 0.05722743, 0.09429525,
        0.14371281],
       [0.30976096, 0.72582359, 0.29411765, ..., 0.08699227, 0.13025338,
        0.15340349]])

In [34]:
min_max_df = pd.DataFrame(min_max_transformed, columns=df_numeric.columns, index=df.index) 
min_max_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.902266
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.06721,0.186976,0.538027,0.708247
2,0.210159,0.564293,1.0,0.03726,0.02933,0.013818,0.028943,0.466028,0.695051
3,0.209163,0.564293,1.0,0.032352,0.036313,0.015555,0.035849,0.354699,0.672783
4,0.209163,0.564293,1.0,0.04133,0.043296,0.015752,0.042427,0.230776,0.674638
