In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw housing table from a path relative to the working directory.
df = pd.read_csv('Data/housing.csv')

In [3]:
# Bucket median_income into five ordered bands (labelled 1-5); the last band is
# open-ended, covering everything above 6.0.
df['income_cat'] = pd.cut(
    df['median_income'],
    bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [4]:
# Preview the first rows, including the newly added income_cat column.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY,5
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY,5
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY,5
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY,4
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY,3


In [5]:
# Count missing values in each column of the full dataset.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
income_cat              0
dtype: int64

In [6]:
# Stratified 80/20 split on income_cat, seeded so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(split.split(df, df['income_cat']))
# split() yields positional row indices, so select with .iloc; label-based .loc is
# only equivalent while df keeps its default RangeIndex.
# .copy() makes both subsets independent frames that later cells can modify safely.
strat_train_set = df.iloc[train_index].copy()
strat_test_set = df.iloc[test_index].copy()

In [7]:
# strat_train_set

In [8]:
# strat_test_set

In [9]:
# income_cat was used to stratify the split; remove it from both subsets.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
strat_train_set = strat_train_set.drop(columns='income_cat', errors='ignore')
strat_test_set = strat_test_set.drop(columns='income_cat', errors='ignore')

In [10]:
# strat_train_set

In [42]:
# Separate the target from the predictors; drop() returns a new frame, so
# strat_train_set itself is left unchanged.
housing_labels = strat_train_set['median_house_value'].copy()  # Labels/Target
housing = strat_train_set.drop(columns='median_house_value')   # Input/Features

In [43]:
# Check the feature frame's columns; median_house_value should no longer be listed.
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

In [13]:
# Percentage of missing values per column in the training split.
strat_train_set.isna().sum().div(len(strat_train_set)).mul(100)

longitude             0.00000
latitude              0.00000
housing_median_age    0.00000
total_rooms           0.00000
total_bedrooms        0.95688
population            0.00000
households            0.00000
median_income         0.00000
median_house_value    0.00000
ocean_proximity       0.00000
dtype: float64

In [14]:
# (rows, columns) of the training split.
strat_train_set.shape

(16512, 10)

In [44]:
# A median can only be computed for numeric columns, so isolate them first.
housing_num = housing.select_dtypes(include=[np.number])
# Learn each numeric column's median; the fitted imputer is the cell's output.
imputer = SimpleImputer(strategy='median')
imputer.fit(housing_num)

SimpleImputer(strategy='median')

In [16]:
# Values learned by fit() with strategy='median': one entry per column of housing_num.
imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

In [45]:
# Show the column labels and the row index of the numeric feature frame.
print(housing_num.columns, housing_num.index, sep='\n')

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')
Int64Index([17606, 18632, 14650,  3230,  3555, 19480,  8879, 13685,  4937,
             4861,
            ...
            15270,  3754, 12166,  6003,  7364,  6563, 12053, 13908, 11159,
            15775],
           dtype='int64', length=16512)


In [39]:
# Fill the missing numeric values using the statistics the imputer learned in fit().
# The displayed result is a bare array: column names and the row index are not
# carried along, so they have to be re-attached afterwards.
X = imputer.transform(housing_num)
X

array([[-121.89  ,   37.29  ,   38.    , ...,  710.    ,  339.    ,
           2.7042],
       [-121.93  ,   37.05  ,   14.    , ...,  306.    ,  113.    ,
           6.4214],
       [-117.2   ,   32.77  ,   31.    , ...,  936.    ,  462.    ,
           2.8621],
       ...,
       [-116.4   ,   34.09  ,    9.    , ..., 2098.    ,  765.    ,
           3.2723],
       [-118.01  ,   33.82  ,   31.    , ..., 1356.    ,  356.    ,
           4.0625],
       [-122.45  ,   37.77  ,   52.    , ..., 1269.    ,  639.    ,
           3.575 ]])

In [66]:
# Rebuild a labelled DataFrame from the imputed array, restoring the numeric
# columns' names and the training-set row index.
housing = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
# Re-attach the categorical column from the training split itself (it shares
# housing's index) instead of reaching back into the un-split df.
housing['Ocean_Proximity'] = strat_train_set['ocean_proximity']
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'Ocean_Proximity'],
      dtype='object')

In [62]:
# One-hot encode the single categorical feature. The encoder's default output is a
# SciPy sparse matrix; densify it explicitly rather than passing `sparse=False`,
# a keyword that newer scikit-learn releases no longer accept.
encoder = OneHotEncoder()
encoded_arr = encoder.fit_transform(housing[['Ocean_Proximity']]).toarray()
# Build the "<feature>_<category>" column names from the fitted categories instead
# of calling get_feature_names(), which newer scikit-learn releases have removed.
encoded_feature_names = [
    f'Ocean_Proximity_{category}' for category in encoder.categories_[0]
]
# Reuse housing's index so the encoded rows line up with the numeric features.
encoded_df = pd.DataFrame(encoded_arr, columns=encoded_feature_names, index=housing.index)
encoded_df

Unnamed: 0,Ocean_Proximity_<1H OCEAN,Ocean_Proximity_INLAND,Ocean_Proximity_ISLAND,Ocean_Proximity_NEAR BAY,Ocean_Proximity_NEAR OCEAN
17606,1.0,0.0,0.0,0.0,0.0
18632,1.0,0.0,0.0,0.0,0.0
14650,0.0,0.0,0.0,0.0,1.0
3230,0.0,1.0,0.0,0.0,0.0
3555,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
6563,0.0,1.0,0.0,0.0,0.0
12053,0.0,1.0,0.0,0.0,0.0
13908,0.0,1.0,0.0,0.0,0.0
11159,1.0,0.0,0.0,0.0,0.0


In [20]:
# from sklearn.preprocessing import OrdinalEncoder
# ordinal_encoder = OrdinalEncoder()
# housing['Ocean_Proximity'] = ordinal_encoder.fit_transform(housing[['ocean_proximity']])

In [67]:
# Swap the raw categorical column for its one-hot columns. Both frames carry the
# same row index, so an index join lines the rows up one-to-one.
housing = housing.drop(columns='Ocean_Proximity').join(encoded_df)
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,Ocean_Proximity_<1H OCEAN,Ocean_Proximity_INLAND,Ocean_Proximity_ISLAND,Ocean_Proximity_NEAR BAY,Ocean_Proximity_NEAR OCEAN
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,1.0,0.0,0.0,0.0,0.0
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,1.0,0.0,0.0,0.0,0.0
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,0.0,0.0,0.0,0.0,1.0
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,0.0,1.0,0.0,0.0,0.0
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,1.0,0.0,0.0,0.0,0.0


In [68]:
# Display the prepared feature frame (numeric features plus one-hot columns).
# NOTE(review): this renders the whole frame; housing.head() gives a shorter preview.
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,Ocean_Proximity_<1H OCEAN,Ocean_Proximity_INLAND,Ocean_Proximity_ISLAND,Ocean_Proximity_NEAR BAY,Ocean_Proximity_NEAR OCEAN
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,1.0,0.0,0.0,0.0,0.0
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,1.0,0.0,0.0,0.0,0.0
14650,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,0.0,0.0,0.0,0.0,1.0
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,0.0,1.0,0.0,0.0,0.0
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6563,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,0.0,1.0,0.0,0.0,0.0
12053,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,0.0,1.0,0.0,0.0,0.0
13908,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,0.0,1.0,0.0,0.0,0.0
11159,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,1.0,0.0,0.0,0.0,0.0
