In [14]:
import pandas as pd
import numpy as np

In [50]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
data = pd.read_csv(url)

In [52]:
data = data[(data['ocean_proximity']=='INLAND') | (data['ocean_proximity']=='<1H OCEAN')]
df = data[['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value']].copy()

# Question 1



In [12]:
df.isna().sum()

# Question 2

In [13]:
df.population.median()

# Question 3

In [53]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [54]:
df['median_house_value'] = np.log1p(df['median_house_value'])

In [55]:
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

In [56]:
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

In [57]:
def fill_X(df,value):
    df_num = df.copy()
    df_num = df_num.fillna(value)
    X = df_num.values
    return X
def train_LR(X, y):
    X = np.column_stack([np.ones(X.shape[0]), X])
    XTX = X.T@X
    inv = np.linalg.inv(XTX)
    w = inv@X.T@y
    return w

In [58]:
Avg_TtBed = df_train.total_bedrooms.mean()

X1_train = fill_X(df_train,Avg_TtBed)
X2_train =  fill_X(df_train,0)

X1_val = fill_X(df_val,Avg_TtBed)
X2_val =  fill_X(df_val,0)
w1 = train_LR(X1_train,y_train)
w2 = train_LR(X2_train,y_train)
def rmse(y, y_pred):
    v = y-y_pred
    v = v**2 
    v = v.mean()
    v = np.sqrt(v)
    return v


In [59]:
y1_pred = w1[0] + X1_val@w1[1:]
y2_pred = w2[0] + X2_val@w2[1:]
print(round(rmse(y_val,y1_pred),2),round(rmse(y_val,y2_pred),2))

0.0 0.0


# Question 4

In [64]:
def train_linear_regression_reg(X, y, r=0.001):
    X = np.column_stack([np.ones(X.shape[0]), X])
    XTX = X.T@X + r* np.eye(X.shape[1])
    inv = np.linalg.inv(XTX)
    w = inv@X.T@y
    return w

In [67]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    X_train = df_train
    w = train_linear_regression_reg(X2_train, y_train, r=r)
    y_pred = w[0] + X2_val@w[1:]
    score = round(rmse(y_val, y_pred),2)
    
    print(r,score)

0 0.0
1e-06 0.0
0.0001 0.0
0.001 0.0
0.01 0.0
0.1 0.0
1 0.01
5 0.01
10 0.01


# Question 5

In [78]:
def train_test_split(df,idx):
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
    
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    return  df_train, df_val, df_test, y_train, y_val, y_test

In [86]:
seed_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
scores = []

for seed in seed_list:
    idx = np.arange(n)

    np.random.seed(seed)
    np.random.shuffle(idx)

    df_train, df_val, df_test, y_train, y_val, y_test = train_test_split(df,idx)
    
    X_train =  fill_X(df_train,0)
    X_val =  fill_X(df_val,0)
    
    w = train_linear_regression_reg(X_train,y_train)
    y_pred = w[0] + X_val@w[1:]
    
    scores.append(rmse(y_val,y_pred))

In [88]:
round(np.std(np.array(scores)),3)

# Question 6

In [92]:
idx = np.arange(n)

np.random.seed(9)
np.random.shuffle(idx)

df_train, df_val, df_test, y_train, y_val, y_test = train_test_split(df,idx)    

    
X_train =  fill_X(df_train,0)
X_val =  fill_X(df_val,0)
X_test = fill_X(df_test,0)

X_train = np.vstack([X_train,X_val])
y_train = np.hstack([y_train,y_val])


w = train_linear_regression_reg(X_train,y_train,0.001)
y_pred = w[0] + X_test@w[1:]
    
print(round(rmse(y_test,y_pred),2))

0.03
