In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression,LogisticRegression


# All plots can be displayed on notebooks:
%matplotlib inline

# Make better use of Jupyter Notebook cell width

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:99% !important; }</style>"))
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# First getting data:

!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
    
df = pd.read_csv('housing.csv')

--2022-10-10 11:04:29--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1,4M) [text/plain]
Saving to: ‘housing.csv’


2022-10-10 11:04:31 (1,82 MB/s) - ‘housing.csv’ saved [1423529/1423529]



# Data preparation
- Select only the features from above and fill in the missing values with 0.
- Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
- Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
- Create a new column population_per_household by dividing the column population by the column households from dataframe.

In [3]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']

df = df[features]

In [4]:
df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.00,880.00,129.00,322.00,126.00,8.33,452600.00,NEAR BAY
1,37.86,-122.22,21.00,7099.00,1106.00,2401.00,1138.00,8.30,358500.00,NEAR BAY
2,37.85,-122.24,52.00,1467.00,190.00,496.00,177.00,7.26,352100.00,NEAR BAY
3,37.85,-122.25,52.00,1274.00,235.00,558.00,219.00,5.64,341300.00,NEAR BAY
4,37.85,-122.25,52.00,1627.00,280.00,565.00,259.00,3.85,342200.00,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.00,1665.00,374.00,845.00,330.00,1.56,78100.00,INLAND
20636,39.49,-121.21,18.00,697.00,150.00,356.00,114.00,2.56,77100.00,INLAND
20637,39.43,-121.22,17.00,2254.00,485.00,1007.00,433.00,1.70,92300.00,INLAND
20638,39.43,-121.32,18.00,1860.00,409.00,741.00,349.00,1.87,84700.00,INLAND


In [5]:
# Initial cleaning:

df.columns = df.columns.str.lower().str.replace(' ','_')

categorical_cols = list(df.dtypes[df.dtypes == 'object'].index)
numeric_cols = list(df.dtypes[df.dtypes != 'object'].index)

for c in categorical_cols:
    df[c] = df[c].str.lower().str.replace(' ','_')

In [6]:
categorical_cols

['ocean_proximity']

In [7]:
numeric_cols

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [8]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [9]:
#@ HANDLING MISSING VALUES:
median = df["total_bedrooms"].median()                                 
df["total_bedrooms"].fillna(median, inplace=True) 

In [10]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']

# Question - 1 : What is the most frequent observation (mode) for the column ocean_proximity?

In [11]:
df['ocean_proximity'].mode()

0    <1h_ocean
Name: ocean_proximity, dtype: object

# Question -2 : Create the correlation matrix for the numerical features of your train dataset.
- In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [12]:
mean = df.median_house_value.mean()
df['above_average'] = df['median_house_value'].map(lambda x: 1 if x>mean else 0)

In [30]:
#del df['median_house_value']

In [13]:
# Splitting data to train,validation and test set:

df_full_train, df_test = train_test_split(df,test_size=0.2,random_state=42)

# Further splitting of training and validation data:

df_train, df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [14]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [15]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [16]:
# Checking shapes of dataframes:

In [16]:
len(df_full_train),len(df_train),len(df_val),len(df_test)

(16512, 12384, 4128, 4128)

In [40]:
df_full_train

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,above_average,population_per_household
0,32.71,-117.03,33.00,3126.00,627.00,3.69,623.00,3.26,103000.00,near_ocean,5.02,0.20,0,0.01
1,33.77,-118.16,49.00,3382.00,787.00,1.74,756.00,3.81,382100.00,near_ocean,4.47,0.23,1,0.00
2,34.66,-120.48,4.00,1897.00,331.00,2.72,336.00,4.16,172600.00,near_ocean,5.65,0.17,0,0.01
3,32.69,-117.11,36.00,1421.00,367.00,3.99,355.00,1.94,93400.00,near_ocean,4.00,0.26,0,0.01
4,36.78,-119.80,43.00,2382.00,431.00,2.30,380.00,3.55,96500.00,inland,6.27,0.18,0,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,33.78,-117.96,35.00,1330.00,201.00,3.03,217.00,6.37,229200.00,<1h_ocean,6.13,0.15,1,0.01
16508,34.02,-117.43,33.00,3084.00,570.00,3.90,449.00,3.05,97800.00,inland,6.87,0.18,0,0.01
16509,34.03,-118.38,36.00,2101.00,569.00,3.33,527.00,2.93,222100.00,<1h_ocean,3.99,0.27,1,0.01
16510,37.58,-121.96,15.00,3575.00,597.00,3.18,559.00,5.72,283500.00,<1h_ocean,6.40,0.17,1,0.01


In [17]:
# Correlation for numerical features:
df[numeric_cols].corr().unstack().sort_values(ascending=False)[:15]

latitude            latitude             1.00
longitude           longitude            1.00
median_income       median_income        1.00
households          households           1.00
population          population           1.00
total_rooms         total_rooms          1.00
housing_median_age  housing_median_age   1.00
total_bedrooms      total_bedrooms       1.00
median_house_value  median_house_value   1.00
households          total_bedrooms       0.97
total_bedrooms      households           0.97
total_rooms         total_bedrooms       0.93
total_bedrooms      total_rooms          0.93
total_rooms         households           0.92
households          total_rooms          0.92
dtype: float64

In [18]:
# answer: total_bedrooms      households  0.97

# Question 3
- Calculate the mutual information score between above_average and ocean_proximity . Use the training set only.
- Round it to 2 decimals using round(score, 2)
- What is their mutual information score?

In [1]:
from sklearn.metrics import mutual_info_score

In [None]:
mutual_info_score

In [19]:
#@ PREPARING DATASET:
cat = ['ocean_proximity']

#@ DEFINING FUNCTION:
def calculate_mi(series):
    return mutual_info_score(series, df_train.above_average)

#@ IMPLEMENTATION:
df_mi = df_train[cat].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
ocean_proximity,0.1


# Question 4
- Now let's train a logistic regression
- Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
- model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
#@ PREPARING THE DATASET:
df_train = df_train.drop('above_average', axis=1)
df_val = df_val.drop('above_average', axis=1)
df_test = df_test.drop('above_average', axis=1)

In [22]:
categorical_cols

['ocean_proximity']

In [23]:
# redefining numeric cols to include new features:

numeric_cols = ["longitude", "latitude", "housing_median_age", "total_rooms",
                "total_bedrooms", "population", "households",
                "median_income", "rooms_per_household", 
                "bedrooms_per_room", "population_per_household"]
numeric_cols

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [24]:
categorical_cols

['ocean_proximity']

In [25]:
train_dicts = df_train[categorical_cols + numeric_cols].to_dict(orient='records')

# Vectorizing:
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [26]:
# Training Logistic regression model:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

# Model prediction:
val_dict = df_val[categorical_cols + numeric_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict(X_val)

# Accuracy:
accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(accuracy)

0.84


# Question 5:
- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?

In [29]:
features = categorical_cols + numeric_cols
features

['ocean_proximity',
 'longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [44]:
del orig_score,original_score,listt

In [50]:
#@ INSPECTING THE DIFFERENCE IN ACCURACY:
orig_score = accuracy

for c in features:
    subset = features.copy()
    subset.remove(c)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    print(c, orig_score - score, score)

ocean_proximity 0.019505813953488382 0.8204941860465116
longitude 0.007877906976744176 0.8321220930232558
latitude 0.006666666666666599 0.8333333333333334
housing_median_age 0.008604651162790633 0.8313953488372093
total_rooms 0.003517441860465098 0.8364825581395349
total_bedrooms 0.0030329457364340895 0.8369670542635659
population 0.013691860465116279 0.8263081395348837
households 0.00618217054263559 0.8338178294573644
median_income 0.05511627906976746 0.7848837209302325
rooms_per_household 0.004970930232558124 0.8350290697674418
bedrooms_per_room 0.004728682170542564 0.8352713178294574
population_per_household 0.004001937984496107 0.8359980620155039


In [46]:
# answer is total_bedrooms

In [None]:
#print(c, orig_score.round(2) - score.round(2),orig_score.round(2), score.round(2))

# Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

In [71]:
df['median_house_value'] = np.log1p(df['median_house_value'])

In [96]:
df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,37.88,-122.23,41.00,880.00,129.00,322.00,126.00,8.33,13.02,near_bay,6.98,0.15,2.56,1
1,37.86,-122.22,21.00,7099.00,1106.00,2401.00,1138.00,8.30,12.79,near_bay,6.24,0.16,2.11,1
2,37.85,-122.24,52.00,1467.00,190.00,496.00,177.00,7.26,12.77,near_bay,8.29,0.13,2.80,1
3,37.85,-122.25,52.00,1274.00,235.00,558.00,219.00,5.64,12.74,near_bay,5.82,0.18,2.55,1
4,37.85,-122.25,52.00,1627.00,280.00,565.00,259.00,3.85,12.74,near_bay,6.28,0.17,2.18,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.00,1665.00,374.00,845.00,330.00,1.56,11.27,inland,5.05,0.22,2.56,0
20636,39.49,-121.21,18.00,697.00,150.00,356.00,114.00,2.56,11.25,inland,6.11,0.22,3.12,0
20637,39.43,-121.22,17.00,2254.00,485.00,1007.00,433.00,1.70,11.43,inland,5.21,0.22,2.33,0
20638,39.43,-121.32,18.00,1860.00,409.00,741.00,349.00,1.87,11.35,inland,5.33,0.22,2.12,0


In [78]:
# SPLITTING THE DATASET:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [79]:
# PREPARING THE DATASET:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [80]:
# PREPARING THE DATASET:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

In [81]:
# DELETING DATASET:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

# Ridgre regression:

In [82]:
#@ PREPARING THE DATASET:
train_dict = df_train[categorical_cols + numeric_cols].to_dict(orient='records')

In [85]:
# VECTORIZING THE DATASET:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

val_dict = df_val[categorical_cols + numeric_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [91]:
from sklearn.linear_model import ridge_regression,Ridge
from sklearn.metrics import mean_squared_error

In [95]:
# RIDGE REGRESSION IMPLEMENTATION:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(a, round(score, 3))

0 0.524
0.01 0.524
0.1 0.524
1 0.524
10 0.524
