In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

## Data preparation

In [2]:
df = pd.read_csv('housing.csv')

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
df = df[['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']]

In [5]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
(df.isnull().sum())

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [7]:
df = df.fillna(0)

In [8]:
df['rooms_per_household'] = df['total_rooms']/df['households']
df['bedrooms_per_room'] = df['total_bedrooms']/df['total_rooms']
df['population_per_household'] = df['population']/df['households']

In [9]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


## Question 1

### 1.1 What is the most frequent observation (mode) for the column ocean_proximity?

In [10]:
df['ocean_proximity'].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

### 1.2 Split the data

- Split your data in train/val/test sets, with 60%/20%/20% distribution.

- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.

- Make sure that the target value (median_house_value) is not in your dataframe.


In [11]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_full_train

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
15961,37.71,-122.43,52.0,1410.0,286.0,879.0,282.0,3.1908,255600.0,NEAR BAY,5.000000,0.202837,3.117021
1771,37.95,-122.35,42.0,1485.0,290.0,971.0,303.0,3.6094,114600.0,NEAR BAY,4.900990,0.195286,3.204620
16414,37.90,-121.24,16.0,50.0,10.0,20.0,6.0,2.6250,137500.0,INLAND,8.333333,0.200000,3.333333
5056,34.02,-118.35,34.0,5218.0,1576.0,3538.0,1371.0,1.5143,118800.0,<1H OCEAN,3.805981,0.302031,2.580598
8589,33.89,-118.39,38.0,1851.0,332.0,750.0,314.0,7.3356,422700.0,<1H OCEAN,5.894904,0.179363,2.388535
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,33.76,-117.88,17.0,1768.0,474.0,1079.0,436.0,1.7823,205300.0,<1H OCEAN,4.055046,0.268100,2.474771
17289,34.42,-119.63,42.0,1765.0,263.0,753.0,260.0,8.5608,500001.0,<1H OCEAN,6.788462,0.149008,2.896154
5192,33.93,-118.26,42.0,1433.0,295.0,775.0,293.0,1.1326,104800.0,<1H OCEAN,4.890785,0.205862,2.645051
12172,33.73,-117.16,10.0,2381.0,454.0,1323.0,477.0,2.6322,140700.0,INLAND,4.991614,0.190676,2.773585


In [12]:
df_train, df_val = train_test_split (df_full_train, test_size=0.25, random_state=1)

In [13]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values
y_full_train = df_full_train.median_house_value.values

In [14]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']
del df_full_train['median_house_value']