In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression,LogisticRegression


# All plots can be displayed on notebooks:
%matplotlib inline

# Make better use of Jupyter Notebook cell width

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:99% !important; }</style>"))
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# First getting data:

!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
    
df = pd.read_csv('housing.csv')

--2022-10-09 15:56:01--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1,4M) [text/plain]
Saving to: ‘housing.csv.4’


2022-10-09 15:56:02 (2,80 MB/s) - ‘housing.csv.4’ saved [1423529/1423529]



# Data preparation
- Select only the features from above and fill in the missing values with 0.
- Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
- Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
- Create a new column population_per_household by dividing the column population by the column households from dataframe.

In [3]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']

df = df[features]

In [4]:
# Initial cleaning:

df.columns = df.columns.str.lower().str.replace(' ','_')

categorical_cols = list(df.dtypes[df.dtypes == 'object'].index)
numeric_cols = list(df.dtypes[df.dtypes != 'object'].index)

for c in categorical_cols:
    df[c] = df[c].str.lower().str.replace(' ','_')

In [5]:
categorical_cols

['ocean_proximity']

In [6]:
numeric_cols

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [10]:
# Filling missing values with zero:
df = df.fillna(0)

In [11]:
df.isnull().sum()

latitude               0
longitude              0
housing_median_age     0
total_rooms            0
total_bedrooms         0
population             0
households             0
median_income          0
median_house_value     0
ocean_proximity        0
rooms_per_household    0
bedrooms_per_room      0
dtype: int64

In [12]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population'] = df['population'] / df['households']

# Question - 1 : What is the most frequent observation (mode) for the column ocean_proximity?

In [13]:
df['ocean_proximity'].mode()

0    <1h_ocean
Name: ocean_proximity, dtype: object

# Question -2 : Create the correlation matrix for the numerical features of your train dataset.
- In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [22]:
mean = df.median_house_value.mean()
df['above_average'] = df['median_house_value'].map(lambda x: 1 if x>mean else 0)

In [28]:
del df['median_house_value']

In [29]:
# Splitting data to train,validation and test set:

df_full_train, df_test = train_test_split(df,test_size=0.2,random_state=42)

# Further splitting of training and validation data:

df_train, df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [30]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [32]:
len(df_train),len(df_val),len(df_test)

(12384, 4128, 4128)

In [33]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [37]:
df_full_train

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,above_average
14196,32.71,-117.03,33.00,3126.00,627.00,0.01,623.00,3.26,near_ocean,5.02,0.20,0
8267,33.77,-118.16,49.00,3382.00,787.00,0.00,756.00,3.81,near_ocean,4.47,0.23,1
17445,34.66,-120.48,4.00,1897.00,331.00,0.01,336.00,4.16,near_ocean,5.65,0.17,0
14265,32.69,-117.11,36.00,1421.00,367.00,0.01,355.00,1.94,near_ocean,4.00,0.26,0
2271,36.78,-119.80,43.00,2382.00,431.00,0.01,380.00,3.55,inland,6.27,0.18,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11284,33.78,-117.96,35.00,1330.00,201.00,0.01,217.00,6.37,<1h_ocean,6.13,0.15,1
11964,34.02,-117.43,33.00,3084.00,570.00,0.01,449.00,3.05,inland,6.87,0.18,0
5390,34.03,-118.38,36.00,2101.00,569.00,0.01,527.00,2.93,<1h_ocean,3.99,0.27,1
860,37.58,-121.96,15.00,3575.00,597.00,0.01,559.00,5.72,<1h_ocean,6.40,0.17,1


In [41]:
#numeric_cols.remove('median_house_value')

In [47]:
# Correlation for numerical features:
df[numeric_cols].corr().unstack().sort_values(ascending=False)[:15]

latitude            latitude             1.00
longitude           longitude            1.00
households          households           1.00
population          population           1.00
total_bedrooms      total_bedrooms       1.00
total_rooms         total_rooms          1.00
housing_median_age  housing_median_age   1.00
median_income       median_income        1.00
total_bedrooms      households           0.97
households          total_bedrooms       0.97
total_bedrooms      total_rooms          0.92
total_rooms         total_bedrooms       0.92
households          total_rooms          0.92
total_rooms         households           0.92
median_income       total_rooms          0.20
dtype: float64

In [48]:
# total_bedrooms      households  0.97

# Question 3
- Calculate the mutual information score between above_average and ocean_proximity . Use the training set only.
- Round it to 2 decimals using round(score, 2)
- What is their mutual information score?

In [70]:
#@ PREPARING DATASET:
cat = ['ocean_proximity']

#@ DEFINING FUNCTION:
def calculate_mi(series):
    return mutual_info_score(series, df_train.above_average)

#@ IMPLEMENTATION:
df_mi = df_train[cat].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
ocean_proximity,0.1


In [110]:
df_mi.round(2)

Unnamed: 0,MI
ocean_proximity,0.1


In [75]:
def mutual_calculator(series):
    return mutual_info_score(series, df_train.above_average)

In [78]:
mutual_info_score?

# Question 4
- Now let's train a logistic regression
- Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
- model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [104]:
df_train.head(3)

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,above_average
0,34.43,-119.67,39.0,1467.0,381.0,0.01,374.0,2.37,<1h_ocean,3.92,0.26,1
1,33.74,-118.32,24.0,6097.0,794.0,0.0,806.0,10.14,near_ocean,7.56,0.13,1
2,39.13,-121.62,41.0,1317.0,309.0,0.01,337.0,1.67,inland,3.91,0.23,0


In [105]:
#@ PREPARING THE DATASET:
df_train = df_train.drop('above_average', axis=1)
df_val = df_val.drop('above_average', axis=1)
df_test = df_test.drop('above_average', axis=1)

In [112]:
df_train.columns

Index(['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'rooms_per_household', 'bedrooms_per_room'],
      dtype='object')

In [114]:
cat = categorical_cols
num = ['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'rooms_per_household', 'bedrooms_per_room']

In [115]:
train_dicts = df_train[cat + num].to_dict(orient='records')

In [116]:
train_dicts[0]

{'ocean_proximity': '<1h_ocean',
 'latitude': 34.43,
 'longitude': -119.67,
 'housing_median_age': 39.0,
 'total_rooms': 1467.0,
 'total_bedrooms': 381.0,
 'population': 0.010037461751837342,
 'households': 374.0,
 'median_income': 2.3681,
 'rooms_per_household': 3.9224598930481283,
 'bedrooms_per_room': 0.25971370143149286}

In [87]:
dv = DictVectorizer(sparse=False)

In [95]:
X_train = dv.fit_transform(train_dicts)

In [100]:
X_train.shape

(12384, 13)

In [97]:
# Same for validation set:
val_dicts = df_val[categorical_cols + numeric_cols].to_dict(orient='records')

In [99]:
X_val = dv.transform(val_dicts)

In [101]:
X_val.shape

(4128, 13)

In [102]:
# Model:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)