In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('housing.csv')

In [3]:
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [4]:
cols = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']


In [5]:
df = data[cols]

In [6]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [7]:
# fill missing values with zeroes
df = df.fillna(0)

In [8]:
df.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [9]:
# Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
df['rooms_per_household'] = df.total_rooms / df.households

In [10]:
# Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe
df['bedrooms_per_room'] = df.total_bedrooms / df.total_rooms

In [11]:
#Create a new column population_per_household by dividing the column population by the column households from dataframe.
df['population_per_household'] = df.population / df.households

In [12]:
# Question 1
# What is the most frequent observation (mode) for the column ocean_proximity?
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

the answer to number 1 is <1H OCEAN

In [13]:
# Question 2

-Split your data in train/val/test sets, with 60%/20%/20% distribution.

-Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.

-Make sure that the target value (median_house_value) is not in your dataframe.

train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# now splitting the train to a train and a val
df_train, df_val = train_test_split(df_full_train, test_size=.2/.8, random_state=42)

In [17]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [18]:
# getting the targets for the three sets
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values


In [19]:
# deleting the targets from the xs
del(df_train['median_house_value'])
del(df_val['median_house_value'])
del(df_test['median_house_value'])

In [20]:
corr_matrix = df.corr()
corr_matrix

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.924664,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,-0.14416,0.106389,-0.104112,0.002366
longitude,-0.924664,1.0,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.045967,-0.02754,0.084836,0.002476
housing_median_age,0.011173,-0.108197,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,0.105623,-0.153277,0.125396,0.013191
total_rooms,-0.0361,0.044568,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.134153,0.133798,-0.174583,-0.024581
total_bedrooms,-0.065318,0.068082,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.049148,0.002717,0.122205,-0.028019
population,-0.108785,0.099773,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.02465,-0.072213,0.031397,0.069863
households,-0.071035,0.05531,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,0.065843,-0.080598,0.059818,-0.027309
median_income,-0.079809,-0.015176,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.688075,0.326895,-0.573836,0.018766
median_house_value,-0.14416,-0.045967,0.105623,0.134153,0.049148,-0.02465,0.065843,0.688075,1.0,0.151948,-0.238759,-0.023737
rooms_per_household,0.106389,-0.02754,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,0.151948,1.0,-0.387465,-0.004852


In [21]:
corr_matrix[corr_matrix != 1].max()

latitude                    0.106389
longitude                   0.099773
housing_median_age          0.125396
total_rooms                 0.920196
total_bedrooms              0.966507
population                  0.907222
households                  0.966507
median_income               0.688075
median_house_value          0.688075
rooms_per_household         0.326895
bedrooms_per_room           0.125396
population_per_household    0.069863
dtype: float64

the answer to question 2 is total_bedrooms and households

Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.

In [22]:
above_average = (y_train >= y_train.mean()).astype('int')

In [23]:
from sklearn.metrics import mutual_info_score

In [24]:
round(mutual_info_score(df_train.ocean_proximity, above_average), 2)

0.1

The answer to question 3 is 0.10

In [25]:
# Question 4
from sklearn.feature_extraction import DictVectorizer

In [26]:
train_dicts = df_train.to_dict(orient='records')

In [27]:
dv = DictVectorizer(sparse=False)

In [28]:
dv.fit(train_dicts)

DictVectorizer(sparse=False)

In [29]:
X_train = dv.fit_transform(train_dicts)

In [30]:
val_dicts = df_val.to_dict(orient='records')

In [31]:
X_val = dv.transform(val_dicts)

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [35]:
y_train

array([241400., 500001.,  64100., ..., 215300., 139000., 181300.])

In [36]:
model.fit(X_train, above_average)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [37]:
y_pred = model.predict_proba(X_val)[:, 1]

In [38]:
y_pred = (y_pred >= 0.5).astype('int')

In [42]:
above_average_val = (y_val >= y_train.mean()).astype('int')

In [43]:
large_accuracy = (above_average_val == y_pred).mean()
large_accuracy

0.8372093023255814

In [44]:
# Question 5

In [48]:
# I guess I should have done this earlier but I"m just going to convert the ys to above average featrues now
# now gonna apply it to the three sets
y_train = (y_train >= y_train.mean()).astype('int')
y_val = (y_val >= y_train.mean()).astype('int')
y_test = (y_test >= y_train.mean()).astype('int')

In [46]:
features = ['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'rooms_per_household', 'bedrooms_per_room',
       'population_per_household']

In [50]:
original_accuracy = 0.8372093023255814

In [51]:
for feature in features:
    small_features = [f for f in features if f != feature]
    df_small_train = df_train[small_features]
    y_small_train = y_train
    df_small_val = df_val[small_features]
    y_small_val = y_val
    
    dv = DictVectorizer(sparse=False)
    dicts_train_small = df_small_train.to_dict(orient='records')
    X_small_train = dv.fit_transform(dicts_train_small)
    val_small_dicts = df_small_val.to_dict(orient='records')
    X_small_val = dv.transform(val_small_dicts)
    
    # initialize the model and fit it to the data
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_small_train, y_small_train)
    
    #apply the model to get the predictions
    y_small_pred = model.predict_proba(X_small_val)[:, 1]
    y_small_pred = (y_small_pred >= 0.5).astype('int')
    
    accuracy = (y_small_val == y_small_pred).mean()
    
    print(f'Model: without {feature}')
    print(f'Accuracy: {accuracy}')
    print(f'Diff from original accuracy: {round(original_accuracy - accuracy, 7)}')
    print()
    print()

Model: without latitude
Accuracy: 0.4069767441860465
Diff from original accuracy: 0.4302326


Model: without longitude
Accuracy: 0.4050387596899225
Diff from original accuracy: 0.4321705


Model: without housing_median_age
Accuracy: 0.4016472868217054
Diff from original accuracy: 0.435562


Model: without total_rooms
Accuracy: 0.4069767441860465
Diff from original accuracy: 0.4302326


Model: without total_bedrooms
Accuracy: 0.40625
Diff from original accuracy: 0.4309593


Model: without population
Accuracy: 0.4006782945736434
Diff from original accuracy: 0.436531


Model: without households
Accuracy: 0.40406976744186046
Diff from original accuracy: 0.4331395


Model: without median_income
Accuracy: 0.44597868217054265
Diff from original accuracy: 0.3912306


Model: without ocean_proximity
Accuracy: 0.3798449612403101
Diff from original accuracy: 0.4573643


Model: without rooms_per_household
Accuracy: 0.407218992248062
Diff from original accuracy: 0.4299903


Model: without bedrooms_p

answer to number 5 is total_rooms

In [52]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


In [53]:
log_price = np.log1p(df.median_house_value)

In [58]:
df['log_price'] = log_price

In [59]:
# I'm recreating all these variables to get back to an earier state.  but keep all the stuff above the same
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [60]:
df_train, df_val = train_test_split(df_full_train, test_size=.2/.8, random_state=42)

In [61]:
df_train

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,log_price
17244,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,241400.0,<1H OCEAN,3.922460,0.259714,3.754011,12.394215
8817,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,500001.0,NEAR OCEAN,7.564516,0.130228,2.789082,13.122367
19686,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,64100.0,INLAND,3.908012,0.234624,2.540059,11.068215
3545,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,277200.0,<1H OCEAN,5.201093,0.194158,2.059016,12.532498
17019,37.52,-122.30,38.0,2769.0,387.0,994.0,395.0,5.5902,417000.0,NEAR OCEAN,7.010127,0.139762,2.516456,12.940844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5606,33.79,-118.29,16.0,1867.0,571.0,951.0,498.0,3.3427,154200.0,<1H OCEAN,3.748996,0.305838,1.909639,11.946012
16339,38.04,-121.34,16.0,3295.0,565.0,2279.0,576.0,3.6083,146400.0,INLAND,5.720486,0.171472,3.956597,11.894105
14965,32.74,-116.99,18.0,3341.0,611.0,1952.0,602.0,3.9844,215300.0,<1H OCEAN,5.549834,0.182879,3.242525,12.279792
11117,33.84,-117.87,16.0,1545.0,354.0,730.0,350.0,4.5112,139000.0,<1H OCEAN,4.414286,0.229126,2.085714,11.842236


In [62]:
y_train = df_train.log_price.values
y_val = df_val.log_price.values
y_test = df_test.log_price.values


In [63]:
# deleting the targets from the xs
del(df_train['median_house_value'])
del(df_val['median_house_value'])
del(df_test['median_house_value'])

In [64]:
# deleting the targets from the xs
del(df_train['log_price'])
del(df_val['log_price'])
del(df_test['log_price'])

In [65]:
from sklearn.linear_model import Ridge

In [67]:
model = Ridge(alpha=0, solver="sag", random_state=42)

In [68]:
train_dicts = df_train.to_dict(orient='records')

In [69]:
dv = DictVectorizer(sparse=False)

In [70]:
dv.fit(train_dicts)

DictVectorizer(sparse=False)

In [71]:
X_train = dv.fit_transform(train_dicts)

In [72]:
val_dicts = df_val.to_dict(orient='records')

In [73]:
X_val = dv.transform(val_dicts)

In [74]:
model.fit(X_train, y_train)

Ridge(alpha=0, random_state=42, solver='sag')

In [76]:
y_pred = model.predict(X_val)

In [77]:
from sklearn.metrics import mean_squared_error

In [78]:
rmse = mean_squared_error(y_pred, y_val, squared=False)

In [79]:
rmse

0.524063570701514

In [81]:
for a in [0, 0.01, 0.1, 1, 10]:
    train_dicts = df_train.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dicts)
    
    X_train = dv.fit_transform(train_dicts)
    
    val_dicts = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    
    model = model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = round(mean_squared_error(y_pred, y_val, squared=False), 3)
    
    print(f'alpha: {a}    rmse:{rmse}')
    print()


alpha: 0    rmse:0.524

alpha: 0.01    rmse:0.524

alpha: 0.1    rmse:0.524

alpha: 1    rmse:0.524

alpha: 10    rmse:0.524



The answer to question 6 is that they were all the same (at least rounded to 3 places) so we pick the smalled, which is 0.