In [4]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

In [5]:
# Read the housing data from a CSV file located in the working directory.
df = pd.read_csv("housing.csv")
# Print the (rows, columns) tuple, then show the first five rows as the cell's output.
print(df.shape)
df.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Handling Text and Categorical Attributes

In `df[[colname(s)]]`, the inner brackets create a list of column names and the outer brackets are the indexing operator, so double brackets are required when selecting two or more columns. With a single column name, a single pair of brackets returns a Series, while double brackets return a DataFrame.

In [7]:
# Double brackets keep the selection a one-column DataFrame (a single pair would return a Series).
housing_cat = df[['ocean_proximity']]
# Preview the first 10 rows.
housing_cat.head(10)

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
5,NEAR BAY
6,NEAR BAY
7,NEAR BAY
8,NEAR BAY
9,NEAR BAY


In [8]:
# Count the number of distinct values in each column of housing_cat.
housing_cat.nunique()

ocean_proximity    5
dtype: int64

In [10]:
# List the distinct category labels, in order of first appearance in the column.
housing_cat.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

### Convert the text categorical variables to numerical data
#### 1. OrdinalEncoder - assigns one numerical value to each category; best suited to categories with a natural order, e.g. Great, Good, Bad, Worst
    ** is used to encode categorical features as a sequence of consecutive integers from 0 to n-1
    *** ML algorithms will assume that two nearby values are more similar than two distant values
#### 2. OneHotEncoder - creates one binary column per category: 1 for the row's own category and 0 for all the others, with no implied order --> AKA Dummy Attributes
    *** only one attribute will be equal to 1 (hot), while the others will be 0 (cold)
    
#### 3. pd.get_dummies( df.drop(' drop desired feature such as target' , axis = 1),drop_first = True)
*** similar to oneHotEncoder -> returns 1 and 0

The [OrdinalEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) class is used to encode categorical features as a sequence of consecutive integers from 0 to n-1.

In [26]:
# Copy `df` into `df2`; later changes to `df2` will not affect `df`.
df2 = df.copy()
df2.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [28]:
# One-hot encode the text column(s) of the frame (here only `ocean_proximity`);
# numeric columns are passed through unchanged.
# dtype=int keeps the dummy columns as 0/1 on every pandas version -- since
# pandas 2.0 the default dtype is bool, which would display True/False instead.
pd.get_dummies(df2, dtype=int)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,1,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,1,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,1,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,1,0,0,0


In [29]:
# Same encoding, but drop_first=True removes the first dummy level
# (`ocean_proximity_<1H OCEAN`), leaving k-1 indicator columns for k categories.
# dtype=int keeps the dummy columns as 0/1 on every pandas version -- since
# pandas 2.0 the default dtype is bool, which would display True/False instead.
pd.get_dummies(df2, drop_first=True, dtype=int)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,1,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,1,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,1,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,1,0,0,0


## 1. OrdinalEncoder
        Steps:
         ###   1. Instantiate the OrdinalEncoder 
                -> ordinal_encoder = OrdinalEncoder()
                
         ###   2. Fit & Transform the data -> Returns an array of the transformed data
                -> housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
             
####    2b. Convert the transformed data to a DataFrame
      -> housing_cat_tr = pd.DataFrame(housing_cat_encoded, columns=housing_cat.columns, index=housing_cat.index) 

             
         ###   3. Take a look at the categories from the encoder
                -> ordinal_encoder.categories_

In [12]:
from sklearn.preprocessing import OrdinalEncoder

# Instantiate the OrdinalEncoder with its default settings
ordinal_encoder = OrdinalEncoder()

In [None]:
# Fit the encoder on housing_cat and transform it in a single step.
# The result is a NumPy array holding one numeric category code per row.
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

In [15]:
# Take a look at the categories learned during fitting: one array per input column,
# where a label's position in the array is the integer code assigned to it.
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

One issue with this representation is that ML algorithms will assume that two nearby values are more similar than two distant values. This may be fine in some cases (e.g., for ordered categories such as “bad,” “average,” “good,” and “excellent”), but it is obviously not the case for the ocean_proximity column (for example, categories 0 and 4 are clearly more similar than categories 0 and 1). To fix this issue, a common solution is to create one binary attribute per category: one attribute equal to 1 when the category is “<1H OCEAN” (and 0 otherwise), another attribute equal to 1 when the category is “INLAND” (and 0 otherwise), and so on. This is called one-hot encoding, because only one attribute will be equal to 1 (hot), while the others will be 0 (cold). The new attributes are sometimes called dummy attributes. Scikit-Learn provides a [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) class to convert categorical values into one-hot vectors:

In [None]:
# Compare the original labels with their encoded values for the first five rows.
# print() is needed for the frame because only the last expression of a cell is displayed automatically.
print(housing_cat.head())
housing_cat_encoded[:5]

In [18]:
# Wrap the encoded NumPy array in a DataFrame again, reusing the column name
# and the row index of the original categorical frame.
housing_cat_tr = pd.DataFrame(
    housing_cat_encoded,
    index=housing_cat.index,
    columns=housing_cat.columns,
)
# Sanity checks: column labels and the distinct encoded values.
print(housing_cat_tr.columns)
print(housing_cat_tr["ocean_proximity"].unique())
housing_cat_tr.head()

Index(['ocean_proximity'], dtype='object')
[3. 0. 1. 4. 2.]


Unnamed: 0,ocean_proximity
0,3.0
1,3.0
2,3.0
3,3.0
4,3.0


### 2. OneHotEncoder

## 2. OneHotEncoder
        Steps:
         ###   1. Instantiate the OneHotEncoder 
                -> cat_onehot = OneHotEncoder()
                    **By default it returns a SciPy sparse matrix -> only the non-zero entries are stored, which saves a lot of memory when the feature has hundreds of categories (a dense array would also store every 0)**
                    
                   ## If a dense NumPy array is needed instead, either:
                    a. Pass sparse_output=False when creating the encoder (the parameter was called `sparse` before scikit-learn 1.2) -> fit_transform then returns a dense array that stores every 0 and 1
                       -> cat_encoder = OneHotEncoder(sparse_output=False)
                    
                    b. After the fit_transform, convert the sparse matrix to a dense array by:
                       -> housing_cat_1hot.toarray()
                
         ###   2. Fit & Transform the data -> Returns an array of the transformed data
                -> housing_cat_1hot = cat_onehot.fit_transform(housing_cat)

             
####    2b. Convert the transformed data to a DataFrame
      -> housing_cat_1hot_df = pd.DataFrame(housing_cat_1hot.toarray(), columns=cat_onehot.categories_[0], index=housing_cat.index) 

             
         ###   3. Take a look at the categories from the encoder
                -> cat_onehot.categories_

In [19]:
from sklearn.preprocessing import OneHotEncoder

# Instantiate the OneHotEncoder with its default settings
cat_onehot = OneHotEncoder()

# Fit & Transform the data -> by default this returns a SciPy sparse matrix, not a dense array
housing_cat_1hot = cat_onehot.fit_transform(housing_cat)


housing_cat_1hot

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

By default, the `OneHotEncoder` class returns a sparse array, but we can convert it to a dense array if needed by calling the `toarray()` method:

In [20]:
# After the fit_transform, convert the sparse matrix to a dense NumPy array
housing_cat_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

## Alternatively, you can set `sparse_output=False` (called `sparse` before scikit-learn 1.2) when creating the `OneHotEncoder`:

In [21]:
# Ask the encoder for a dense NumPy array directly instead of a SciPy sparse matrix.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the
# old name was removed in 1.4, so try the new spelling first and fall back to the old
# one on older installations.
try:
    cat_encoder2 = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    cat_encoder2 = OneHotEncoder(sparse=False)
housing_cat_1hot2 = cat_encoder2.fit_transform(housing_cat)
housing_cat_1hot2

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [23]:
# Take a look at the categories learned by the sparse-output encoder;
# their order matches the order of the one-hot columns.
cat_onehot.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [24]:
# Take a look at the categories learned by the dense-output encoder;
# their order matches the order of the one-hot columns.
cat_encoder2.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]