In [95]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sb
from sklearn.model_selection import train_test_split # for train test split
from sklearn.metrics import mutual_info_score # for mutual information score
from sklearn.metrics import accuracy_score # for accuracy score
from sklearn.feature_extraction import DictVectorizer # for one-hot encoding
from sklearn.linear_model import LogisticRegression # for log reg
import pickle

In [96]:
df = pd.read_csv("housing.csv")
df = df[['latitude',
         'longitude',
         'housing_median_age',
         'total_rooms',
         'total_bedrooms',
         'population',
         'households',
         'median_income',
         'median_house_value',
         'ocean_proximity']].copy()

In [97]:
df.isna().sum()[df.isna().sum()>0]

total_bedrooms    207
dtype: int64

In [98]:
# Data preparation

# Select only the features from above and fill in the missing values with 0
df['total_bedrooms'] = df['total_bedrooms'].fillna(0, inplace=False)


In [99]:
# Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
df['rooms_per_household'] = df['total_rooms'] / df['households']
df[['total_rooms', 'households', 'rooms_per_household']].head()

Unnamed: 0,total_rooms,households,rooms_per_household
0,880.0,126.0,6.984127
1,7099.0,1138.0,6.238137
2,1467.0,177.0,8.288136
3,1274.0,219.0,5.817352
4,1627.0,259.0,6.281853


In [100]:
# Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df[['total_bedrooms', 'total_rooms', 'bedrooms_per_room']].head()

Unnamed: 0,total_bedrooms,total_rooms,bedrooms_per_room
0,129.0,880.0,0.146591
1,1106.0,7099.0,0.155797
2,190.0,1467.0,0.129516
3,235.0,1274.0,0.184458
4,280.0,1627.0,0.172096


In [101]:
# Create a new column population_per_household by dividing the column population by the column households from dataframe.
df['population_per_household'] = df['population'] / df['households']
df[['population', 'households', 'population_per_household']].head()

Unnamed: 0,population,households,population_per_household
0,322.0,126.0,2.555556
1,2401.0,1138.0,2.109842
2,496.0,177.0,2.80226
3,558.0,219.0,2.547945
4,565.0,259.0,2.181467


In [102]:
# Question 1
# What is the most frequent observation (mode) for the column ocean_proximity?
df['ocean_proximity'].value_counts()
# Options:
# NEAR BAY
# --> <1H OCEAN <--
# INLAND
# NEAR OCEAN

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [103]:
list(df.columns)

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [104]:
# Split the data
# Split your data in train/val/test sets, with 60%/20%/20% distribution.
# Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
# Make sure that the target value (median_house_value) is not in your dataframe.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    df.drop('median_house_value', axis = 1),
    df[['median_house_value']],
    train_size = 0.8,
    test_size = 0.2,
    random_state = 42)

print(df.shape)
print()
print(X_train_valid.shape)
print(X_test.shape)
print()
print(y_train_valid.shape)
print(y_test.shape)

(20640, 13)

(16512, 12)
(4128, 12)

(16512, 1)
(4128, 1)


In [105]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    train_size = 0.75,
    test_size = 0.25,
    shuffle = False)
print(X_train.shape)
print(X_valid.shape)
print()
print(y_train.shape)
print(y_valid.shape)

(12384, 12)
(4128, 12)

(12384, 1)
(4128, 1)


In [106]:
# Question 2
# Create the correlation matrix for the numerical features of your train dataset.
num_columns = list(X_train.dtypes[df.dtypes == 'float64'].index)
cat_columns = list(X_train.dtypes[df.dtypes == 'object'].index)
corr_matrix = X_train[num_columns].corr()
corr_matrix.abs().unstack().sort_values(ascending=False).drop_duplicates()
# In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
# What are the two features that have the biggest correlation in this dataset?
# Options:
# --> total_bedrooms and households <--
# total_bedrooms and total_rooms
# population and households
# population_per_household and total_rooms

latitude                  latitude                    1.000000
households                total_bedrooms              0.980122
total_rooms               total_bedrooms              0.932220
longitude                 latitude                    0.924692
households                total_rooms                 0.922343
population                households                  0.912609
total_bedrooms            population                  0.883679
total_rooms               population                  0.864225
bedrooms_per_room         median_income               0.612738
                          rooms_per_household         0.495944
rooms_per_household       median_income               0.390775
total_rooms               housing_median_age          0.357091
housing_median_age        total_bedrooms              0.316835
                          households                  0.299384
                          population                  0.295541
total_rooms               median_income               0

In [107]:
# Make median_house_value binary
# We need to turn the median_house_value variable from numeric into binary.
# Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.
X_train['above_average'] = (y_train['median_house_value'] > y_train['median_house_value'].mean()).astype(int)
X_valid['above_average'] = (y_valid['median_house_value'] > y_valid['median_house_value'].mean()).astype(int)
X_test['above_average'] = (y_test['median_house_value'] > y_test['median_house_value'].mean()).astype(int)

In [108]:
# Question 3
# Calculate the mutual information score with the (binarized) price 
# for the categorical variable that we have. Use the training set only.
# What is the value of mutual information?
mutual_info_score(X_train['above_average'],X_train['ocean_proximity']).round(2)
# Round it to 2 decimal digits using round(score, 2)
# Options:
# 0.263
# 0.00001
# --> 0.101 <--
# 0.15555

0.1

In [109]:
# Question 4
# Now let's train a logistic regression
# Remember that we have one categorical variable ocean_proximity in the data. 
# Include it using one-hot encoding.
train_dict = X_train[cat_columns + num_columns].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
print(dv.get_feature_names_out())
X_train = dv.transform(train_dict)

valid_dict = X_valid[cat_columns + num_columns].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(valid_dict)
X_valid = dv.transform(valid_dict)

test_dict = X_test[cat_columns + num_columns].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(test_dict)
X_test = dv.transform(test_dict)


['bedrooms_per_room' 'households' 'housing_median_age' 'latitude'
 'longitude' 'median_income' 'ocean_proximity=<1H OCEAN'
 'ocean_proximity=INLAND' 'ocean_proximity=ISLAND'
 'ocean_proximity=NEAR BAY' 'ocean_proximity=NEAR OCEAN' 'population'
 'population_per_household' 'rooms_per_household' 'total_bedrooms'
 'total_rooms']


In [110]:
y_train.values.ravel()

array([103000., 382100., 172600., ..., 144300., 154700., 158800.])

In [111]:
X_train.shape

(12384, 16)

In [112]:
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, 
# fit the model with these parameters:
model = LogisticRegression(solver = "liblinear", 
                           C = 1.0, 
                           max_iter = 1000, 
                           random_state = 42)

model.fit(X_train, 
         y_train.values.ravel())

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [113]:
# filename = 'hw3_logistic_regression.sav'

# pickle.dump(model, open(filename, 'wb'))

# some time later...
 
# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# print(result)

In [114]:
X_valid.shape

(4128, 16)

In [126]:
y_valid_pred = model.predict(X_valid)

In [127]:
y_valid_pred.shape

(4128,)

In [128]:
y_valid.shape

(4128, 1)

In [129]:
y_valid.head

<bound method NDFrame.head of        median_house_value
9221              58800.0
10234            165800.0
4763             139400.0
9062             107500.0
1874             107200.0
...                   ...
11284            229200.0
11964             97800.0
5390             222100.0
860              283500.0
15795            325000.0

[4128 rows x 1 columns]>

In [130]:
y_valid_pred_df = pd.DataFrame(y_valid_pred, columns = ['median_house_value'])
y_valid_pred_df.head

<bound method NDFrame.head of       median_house_value
0               112500.0
1               137500.0
2               500001.0
3                67500.0
4               137500.0
...                  ...
4123            500001.0
4124             67500.0
4125            162500.0
4126            500001.0
4127            400000.0

[4128 rows x 1 columns]>

In [131]:
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
accuracy_score(y_valid_pred, y_valid)
# Options:
# 0.60
# 0.72
# 0.84
# 0.95

0.04869186046511628

In [None]:
model.predict_proba(X_valid)

In [None]:
# Question 5
# Let's find the least useful feature using the feature elimination technique.
# Train a model with all these features (using the same parameters as in Q4).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
# Which of following feature has the smallest difference?
# total_rooms
# total_bedrooms
# population
# households
# note: the difference doesn't have to be positive

In [None]:
# Question 6
# For this question, we'll see how to use a linear regression model from Scikit-Learn
# We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
# Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
# This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
# Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
# If there are multiple options, select the smallest alpha.
# Options:
# 0
# 0.01
# 0.1
# 1
# 10