In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2022-09-18 22:55:10--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8000::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: 'housing.csv.2'

     0K .......... .......... .......... .......... ..........  3% 6.36M 0s
    50K .......... .......... .......... .......... ..........  7% 89.2M 0s
   100K .......... .......... .......... .......... .......... 10% 5.89M 0s
   150K .......... .......... .......... .......... .......... 14% 19.2M 0s
   200K .......... .......... .......... .......... .......... 17% 27.0M 0s
   250K .......... .......... .......... .......... .......... 21% 10.4M 0s
   300K .......... .......... .......... .......... .......... 25% 18.1M 0s
   350K ......

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('housing.csv')

In [3]:
df = df[['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']]

In [4]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


### Question 1

Find a feature with missing values. How many missing values does it have?

* 207
* 307
* 408
* 508


In [5]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

### Question 2

What's the median (50% percentile) for variable 'population'?

* 1133
* 1122
* 1166
* 1188


In [6]:
df['population'].median()

1166.0

Split the data

- Shuffle the initial dataset, use seed 42.
- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Make sure that the target value ('median_house_value') is not in your dataframe.
- Apply the log transformation to the median_house_value variable using the np.log1p() function.


In [7]:
np.random.seed(42)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [8]:
y_train_orig = df_train.median_house_value.values
y_val_orig = df_val.median_house_value.values
y_test_orig = df_test.median_house_value.values

y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

### Question 3

- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?


In [9]:
def linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [10]:
def rmse(y, y_pred):
    diff = y - y_pred
    mse = (diff * diff).mean()
    return np.sqrt(mse)

##### option 1: fill with 0

In [11]:
def fill_with_0(df):
    df = df.fillna(0)
    x = df.values
    return x

In [12]:
x_train = fill_with_0(df_train)
w_0, w = linear_regression(x_train, y_train)

In [13]:
y_pred = w_0 + x_train.dot(w)

In [14]:
rmse(y_train, y_pred)

0.3413135910156676

In [15]:
x_val = fill_with_0(df_val)
y_pred_2 = w_0 + x_val.dot(w)

rmse(y_val, y_pred_2)

0.32953303652313465

##### option 2: fill with median

In [16]:
def fill_with_median(df):
    df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())
    x = df.values
    return x

In [17]:
x_train_2 = fill_with_median(df_train)
w_0_2, w_2 = linear_regression(x_train_2, y_train)

In [18]:
y_pred_3 = w_0_2 + x_train_2.dot(w)

In [19]:
rmse(y_train, y_pred_3)

0.347292557348252

In [20]:
x_val_2 = fill_with_median(df_val)

w_0_3, w_3 = linear_regression(x_val_2, y_val)
y_pred_4 = w_0_3 + x_val_2.dot(w)

rmse(y_val, y_pred_4)

0.402732495252535

### Question 4

- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?

If there are multiple options, select the smallest r.

Options:

- 0
- 0.000001
- 0.001
- 0.0001


In [56]:
def linear_regression_reg(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [57]:
def fill_with_0(df):
    df = df.fillna(0)
    x = df.values
    return x

In [61]:
x_train = fill_with_0(df_train)
x_val = fill_with_0(df_val)

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = linear_regression_reg(x_train, y_train, r=r)
    y_pred = w_0 + x_train.dot(w)
    print(f'regularizatbion: {r}, train RMSE: {np.round(rmse(y_train, y_pred),2)}')
    
    y_pred_2 = w_0 + x_val.dot(w)
    print(f'regularization: {r}, validation RMSE: {np.round(rmse(y_val, y_pred_2),2)}')

regularizatbion: 0, train RMSE: 0.34
regularization: 0, validation RMSE: 0.34
regularizatbion: 1e-06, train RMSE: 0.34
regularization: 1e-06, validation RMSE: 0.34
regularizatbion: 0.0001, train RMSE: 0.34
regularization: 0.0001, validation RMSE: 0.34
regularizatbion: 0.001, train RMSE: 0.34
regularization: 0.001, validation RMSE: 0.34
regularizatbion: 0.01, train RMSE: 0.34
regularization: 0.01, validation RMSE: 0.34
regularizatbion: 0.1, train RMSE: 0.34
regularization: 0.1, validation RMSE: 0.34
regularizatbion: 1, train RMSE: 0.34
regularization: 1, validation RMSE: 0.34
regularizatbion: 5, train RMSE: 0.35
regularization: 5, validation RMSE: 0.34
regularizatbion: 10, train RMSE: 0.35
regularization: 10, validation RMSE: 0.34


### Question 5

- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

- Note: Standard deviation shows how different the values are. If it's low, then all values are approximately the same. If it's high, the values are different. If standard deviation of scores is low, then our model is stable.


In [26]:
def fill_with_0(df):
    df = df.fillna(0)
    x = df.values
    return x

In [27]:
def linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [28]:
def rmse(y, y_pred):
    diff = y - y_pred
    mse = (diff * diff).mean()
    return np.sqrt(mse)

In [62]:
for seed_num in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(seed_num)

    n = len(df)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()
    
    y_train_orig = df_train.median_house_value.values
    y_val_orig = df_val.median_house_value.values
    y_test_orig = df_test.median_house_value.values

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    x_train = fill_with_0(df_train)
    x_val = fill_with_0(df_val)
    w_0, w = linear_regression(x_train, y_train)
    y_pred = w_0 + x_train.dot(w)
    
    
    y_pred_2 = w_0 + x_val.dot(w)
    
    result = rmse(y_val, y_pred_2)
    std_result = np.std(result)
    round_off_std_result = np.round(std_result, 3)
    print(f'seed: {seed_num}, validation RMSE: {round_off_std_result}')

seed: 0, validation RMSE: 0.0
seed: 1, validation RMSE: 0.0
seed: 2, validation RMSE: 0.0
seed: 3, validation RMSE: 0.0
seed: 4, validation RMSE: 0.0
seed: 5, validation RMSE: 0.0
seed: 6, validation RMSE: 0.0
seed: 7, validation RMSE: 0.0
seed: 8, validation RMSE: 0.0
seed: 9, validation RMSE: 0.0


### Question 6

- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.
- What's the RMSE on the test dataset?

Options:

- 0.35
- 0.135
- 0.450
- 0.245


In [51]:
np.random.seed(9)

n = len(df)
n_test = int(0.2 * n)
n_train = n - n_test

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_test = df_shuffled.iloc[n_train:].copy()

y_train_orig = df_train.median_house_value.values
y_test_orig = df_test.median_house_value.values

y_train = np.log1p(df_train.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_test['median_house_value']

In [52]:
def fill_with_0(df):
    df = df.fillna(0)
    x = df.values
    return x

In [53]:
def linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [54]:
x_train = fill_with_0(df_train)
w_0, w = linear_regression_reg(x_train, y_train, r = 0.001)
y_pred = w_0 + x_train.dot(w)

In [55]:
x_test = fill_with_0(df_test)
y_pred_2 = w_0 + x_test.dot(w)
print(f'regularization: {0.001}, validation RMSE: {np.round(rmse(y_test, y_pred_2),2)}')

regularization: 0.001, validation RMSE: 0.35
