In [1]:
import numpy as np
import pandas as pd

In [2]:
# Columns fed to the models as inputs.
features = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# Column the models are trained to predict.
target = 'median_house_value'

# Load the dataset (path is relative to the notebook's working directory).
df = pd.read_csv("housing.csv")

### Question 1

In [3]:
# Count nulls per column once, then show only the columns that have any.
missing_counts = df.isna().sum()
missing_counts[missing_counts > 0]

total_bedrooms    207
dtype: int64

### Question 2

In [4]:
# Median of the 'population' column.
df['population'].median()

1166.0

### Question 3

In [5]:
def get_data_split(seed=42):
    """Shuffle the module-level ``df`` and split it 60/20/20 into train/val/test.

    NumPy's global random state is seeded with ``seed`` before the rows are
    shuffled. Feature matrices are taken from the ``features`` columns and
    targets are ``log1p`` of the ``target`` column.

    Args:
        seed: Value passed to ``np.random.seed`` before shuffling.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``. The ``X_*``
        items are selections of ``df``; the ``y_*`` items are NumPy arrays.
    """
    np.random.seed(seed)
    row_order = df.sample(frac=1).index

    n_total = df.shape[0]
    train_end = int(n_total * 0.6)
    val_end = int(n_total * 0.8)

    train_idx = row_order[:train_end]
    val_idx = row_order[train_end:val_end]
    test_idx = row_order[val_end:]

    # The three parts must cover every row and must not share any row label.
    assert len(train_idx) + len(val_idx) + len(test_idx) == n_total
    assert set(train_idx).isdisjoint(val_idx)
    assert set(train_idx).isdisjoint(test_idx)
    assert set(val_idx).isdisjoint(test_idx)

    def select(idx):
        # Feature rows plus the log1p-transformed target for the given labels.
        return df.loc[idx, features], np.log1p(df.loc[idx, target].values)

    X_train, y_train = select(train_idx)
    X_val, y_val = select(val_idx)
    X_test, y_test = select(test_idx)

    return X_train, y_train, X_val, y_val, X_test, y_test

def train_linear_regression(X, y):
    """Fit ordinary least squares with an intercept via the normal equations.

    A column of ones is prepended to ``X`` so the first fitted weight is the
    intercept.

    Args:
        X: 2-D array-like of features (rows are observations).
        y: 1-D array-like of targets, one per row of ``X``.

    Returns:
        Tuple ``(w_0, w)``: the intercept and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: If ``X^T X`` is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTy = X.T.dot(y)

    # Solve (X^T X) w = X^T y directly; this avoids forming the explicit
    # inverse, which is slower and less numerically accurate.
    w = np.linalg.solve(XTX, XTy)

    return w[0], w[1:]

def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y: Array-like of true values.
        y_pred: Array-like of predicted values, aligned with ``y``.

    Returns:
        The square root of the mean squared difference ``y_pred - y``.
    """
    residuals = y_pred - y
    return np.sqrt(np.square(residuals).mean())

In [6]:
# Split with the default seed (42), spelled out here for readability.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = get_data_split(seed=42)

In [7]:
# Option 1: replace missing feature values with 0 in both splits.
X_train_zero = X_train.fillna(0)
X_val_zero = X_val.fillna(0)

w_0, w = train_linear_regression(X_train_zero, y_train)

y_pred = w_0 + X_val_zero.dot(w)
round(rmse(y_val, y_pred), 2)

0.33

In [8]:
# Option 2: replace missing feature values with the column mean.
# The means are computed on the training split only and reused for the
# validation split, so no validation statistics leak into preprocessing.
train_means = X_train.mean()

w_0, w = train_linear_regression(X_train.fillna(train_means), y_train)

y_pred = w_0 + X_val.fillna(train_means).dot(w)
round(rmse(y_val, y_pred), 2)

0.33

In [9]:
# Option 3: replace missing feature values with the column median.
# The medians are computed on the training split only and reused for the
# validation split, so no validation statistics leak into preprocessing.
train_medians = X_train.median()

w_0, w = train_linear_regression(X_train.fillna(train_medians), y_train)

y_pred = w_0 + X_val.fillna(train_medians).dot(w)
round(rmse(y_val, y_pred), 2)

0.33

### Question 4

In [10]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit ridge-regularized least squares with an intercept.

    A column of ones is prepended to ``X`` so the first fitted weight is the
    intercept, and ``r`` is added to every diagonal entry of ``X^T X``.

    Args:
        X: 2-D array-like of features (rows are observations).
        y: 1-D array-like of targets, one per row of ``X``.
        r: Regularization strength added to the diagonal of ``X^T X``.

    Returns:
        Tuple ``(w_0, w)``: the intercept and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: If the regularized matrix is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # The identity spans the ones column too, so the intercept is penalized
    # along with the feature weights.
    XTX = XTX + r * np.eye(XTX.shape[0])
    XTy = X.T.dot(y)

    # Solve the regularized normal equations directly; this avoids forming
    # the explicit inverse, which is slower and less numerically accurate.
    w = np.linalg.solve(XTX, XTy)

    return w[0], w[1:]

In [11]:
# Zero-imputation does not depend on r, so do it once rather than on every
# iteration of the sweep.
X_train_filled = X_train.fillna(0)
X_val_filled = X_val.fillna(0)

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train_filled, y_train, r=r)

    y_pred = w_0 + X_val_filled.dot(w)
    score = round(rmse(y_val, y_pred), 2)

    print(f'r={r :<6}  RSME={score}')

r=0       RSME=0.33
r=1e-06   RSME=0.33
r=0.0001  RSME=0.33
r=0.001   RSME=0.33
r=0.01    RSME=0.33
r=0.1     RSME=0.33
r=1       RSME=0.33
r=5       RSME=0.34
r=10      RSME=0.34


### Question 5

In [12]:
# Validation RMSE of the unregularized, zero-imputed model for seeds 0..9;
# the cell reports how much the score varies across splits.
scores = []
for seed in range(10):
    X_train, y_train, X_val, y_val, _, _ = get_data_split(seed=seed)

    w_0, w = train_linear_regression(X_train.fillna(0), y_train)

    y_pred = w_0 + X_val.fillna(0).dot(w)
    scores.append(rmse(y_val, y_pred))

round(np.std(scores), 3)

0.004

### Question 6

In [13]:
# Rebuild the seed-9 split, then fold the validation rows back into training.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = get_data_split(seed=9)

X_train = pd.concat([X_train, X_val])
y_train = np.concatenate((y_train, y_val))

In [14]:
# Fit the regularized model (r=0.001) on the zero-imputed training data and
# score it on the zero-imputed test split.
X_train_zero = X_train.fillna(0)
X_test_zero = X_test.fillna(0)

w_0, w = train_linear_regression_reg(X_train_zero, y_train, r=0.001)

y_pred = w_0 + X_test_zero.dot(w)
round(rmse(y_test, y_pred), 3)


0.345