In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [2]:
# Columns taken straight from the CSV; the derived ratio columns are appended
# to this list once they have been created below.
features = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]

target = 'median_house_value'

# Load the data, keep only the modelling columns, replace missing values with 0
# and derive the three ratio features in one chain.
df = (
    pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv")
    .loc[:, features + [target]]
    .fillna(0)
    .assign(
        rooms_per_household=lambda d: d['total_rooms'] / d['households'],
        bedrooms_per_room=lambda d: d['total_bedrooms'] / d['total_rooms'],
        population_per_household=lambda d: d['population'] / d['households'],
    )
)

features.extend(['rooms_per_household', 'bedrooms_per_room', 'population_per_household'])

### Question 1

In [3]:
# Most frequent `ocean_proximity` value over the full frame (computed before the split).
df['ocean_proximity'].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

### Split the data

In [4]:
# 60/20/20 split: hold out 20% as test, then take 25% of the remaining 80%
# (i.e. 20% of the total) as validation.
train_val, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.25, random_state=42)

# The three parts together have as many rows as the original frame.
assert df.shape[0] == (train.shape[0] + val.shape[0] + test.shape[0])

# The proportions are 60/20/20.
assert train.shape[0] == int(df.shape[0] * 0.6)
assert val.shape[0] == test.shape[0]

# Coverage: every row label of the original frame landed in one of the parts.
assert len(set(df.index).difference(train.index).difference(val.index).difference(test.index)) == 0

# Disjointness: no row label is shared between any two parts.
assert set(train.index).isdisjoint(val.index)
assert set(train.index).isdisjoint(test.index)
assert set(val.index).isdisjoint(test.index)

X_train, y_train = train[features], train[target].values
X_val, y_val = val[features], val[target].values
X_test, y_test = test[features], test[target].values

### Question 2

In [5]:
# Pairwise correlations between the numeric training features.
corrs = X_train.select_dtypes(include=np.number).corr()

# Print the coefficient for each candidate pair, one value per line.
candidate_pairs = [
    ('total_bedrooms', 'households'),
    ('total_bedrooms', 'total_rooms'),
    ('population', 'households'),
    ('population_per_household', 'total_rooms'),
]
for first, second in candidate_pairs:
    print(corrs.loc[first, second])

0.9793993527694163
0.9315462999468395
0.9068406743022618
-0.029451679411510792


### Make `median_house_value` binary

In [6]:
# Threshold: mean house value over the full frame `df` (all three splits).
mean_value = df['median_house_value'].mean()

# 1 where the house value is above the threshold, 0 otherwise, for each split.
y_train_bin, y_val_bin, y_test_bin = (
    (prices > mean_value).astype(np.uint8)
    for prices in (y_train, y_val, y_test)
)

### Question 3

In [7]:
# Integer-encode the categorical column and shape it as a single-feature matrix,
# then score it against the binary target as a discrete feature.
ocean_codes = LabelEncoder().fit_transform(X_train['ocean_proximity'])
mutual_info_classif(ocean_codes.reshape(-1, 1), y_train_bin, discrete_features=True)

array([0.10138386])

### Question 4

In [8]:
# One-hot encode `ocean_proximity`; every other column is passed through as-is.
ct = ColumnTransformer(
    transformers=[("onehot", OneHotEncoder(), ['ocean_proximity'])],
    remainder='passthrough',
)

model = LogisticRegression(C=1.0, solver="liblinear", max_iter=1000, random_state=42)

# Preprocessing followed by the classifier; step names are kept as "onehot" / "model".
pipe = Pipeline(steps=[("onehot", ct), ("model", model)])

In [9]:
# Fit on the training split and keep the validation accuracy as the baseline
# that later cells compare against.
original_accuracy = pipe.fit(X_train, y_train_bin).score(X_val, y_val_bin)
print(round(original_accuracy, 2))

0.84


### Question 5

In [10]:
# Feature elimination: for each candidate column, refit the pipeline without it
# and compare validation accuracy against the baseline `original_accuracy`.
cols_to_drop = ['total_rooms', 'total_bedrooms', 'population', 'households']

for col in cols_to_drop:
    # Drop the column from BOTH frames so that fit and score see exactly the
    # same feature set, in the same order, instead of relying on the
    # transformer to discard the extra validation column.
    X_train_reduced = X_train.drop(columns=[col])
    X_val_reduced = X_val.drop(columns=[col])

    pipe.fit(X_train_reduced, y_train_bin)
    new_accuracy = pipe.score(X_val_reduced, y_val_bin)
    diff = original_accuracy - new_accuracy
    print(f'Dropped column: {col}')
    print(f'\tOriginal accuracy: {original_accuracy}')
    print(f'\tNew accuracy: {new_accuracy}')
    print(f'\tDiff: {diff}\n')

# NOTE: `pipe` is refit in place on every iteration, so after this loop it holds
# the model trained without the last column in `cols_to_drop`.


Dropped column: total_rooms
	Original accuracy: 0.8352713178294574
	New accuracy: 0.8374515503875969
	Diff: -0.0021802325581394832

Dropped column: total_bedrooms
	Original accuracy: 0.8352713178294574
	New accuracy: 0.8357558139534884
	Diff: -0.0004844961240310086

Dropped column: population
	Original accuracy: 0.8352713178294574
	New accuracy: 0.8263081395348837
	Diff: 0.008963178294573715

Dropped column: households
	Original accuracy: 0.8352713178294574
	New accuracy: 0.8330910852713178
	Diff: 0.0021802325581395943



### Question 6

In [11]:
def calc_rmse(y_pred, y_true):
    """Return the root-mean-squared error between predictions and targets.

    Parameters
    ----------
    y_pred : array-like
        Predicted values (NumPy array, list, tuple, pandas Series, ...).
    y_true : array-like
        Reference values; must be broadcastable against ``y_pred``.

    Returns
    -------
    numpy floating scalar
        ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    # Coerce both inputs so plain sequences are accepted and pandas objects are
    # compared element-by-element by position rather than aligned on their index.
    error = np.asarray(y_pred) - np.asarray(y_true)
    mse = (error ** 2).mean()
    return np.sqrt(mse)

# Refit the column transformer on the full training feature set and encode it.
# NOTE(review): `ct` is the same object used as the first step of `pipe`, so
# earlier `pipe.fit(...)` calls presumably left it fitted on whatever frame was
# passed last — this refit is what makes it match `X_train` again; confirm.
Xp_train = ct.fit_transform(X_train)


In [23]:
# Log-transform the targets and encode the validation features once; none of
# these depend on the regularisation strength.
y_train_log = np.log(y_train)
y_val_log = np.log(y_val)
Xp_val = ct.transform(X_val)

# Fit one Ridge model per alpha and report its validation RMSE on the log target.
for a in [0, 0.01, 0.1, 1, 10]:
    reg = Ridge(alpha=a, solver="sag", random_state=42)
    reg.fit(Xp_train, y_train_log)

    preds = reg.predict(Xp_val)
    # Use the notebook's own RMSE helper instead of
    # mean_squared_error(..., squared=False): the `squared` keyword is
    # deprecated/removed in recent scikit-learn releases.
    rmse = calc_rmse(preds, y_val_log)
    print(f'a={a}, RMSE={rmse :.3f}')

a=0, RMSE=0.524
a=0.01, RMSE=0.524
a=0.1, RMSE=0.524
a=1, RMSE=0.524
a=10, RMSE=0.524
