In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
#os.remove('/kaggle/working/housing.csv.2')
data='https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
!wget $data

df_org = pd.read_csv('housing.csv')

--2022-09-25 11:27:41--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2022-09-25 11:27:42 (19.8 MB/s) - ‘housing.csv’ saved [1423529/1423529]



In [3]:
# Columns of the raw dataset that this notebook works with.
columns = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
    "ocean_proximity",
]
columns

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [4]:
# Select the working columns, replace missing values with 0, and append the
# three ratio features in a single chained expression.
df = (
    df_org[columns]
    .fillna(0)
    .assign(
        rooms_per_household=lambda d: d['total_rooms'] / d['households'],
        bedrooms_per_room=lambda d: d['total_bedrooms'] / d['total_rooms'],
        population_per_household=lambda d: d['population'] / d['households'],
    )
)

# Transposed preview: one row per column, first five records side by side.
df.head().T

Unnamed: 0,0,1,2,3,4
latitude,37.88,37.86,37.85,37.85,37.85
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY


## Question 1
What is the most frequent observation (mode) for the column ocean_proximity?

In [5]:
# Most frequent category of ocean_proximity
df['ocean_proximity'].mode()

0    <1H OCEAN
dtype: object

## Make median_house_value binary
* We need to turn the median_house_value variable from numeric into binary.
* Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.

In [6]:
# Binary target 'above_average': 1 where median_house_value exceeds the
# column mean, 0 otherwise.
df['above_average'] = df['median_house_value'].gt(df['median_house_value'].mean()).astype(int)

## Split the data
* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value (median_house_value) is not in your dataframe.

In [7]:
# Split the data set into 60% train / 20% validation / 20% test.
# The second split takes 25% of the remaining 80%, i.e. 20% of the full data.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

print(len(df_train), len(df_val), len(df_test))

# reset_index returns a new frame, so the result has to be assigned back;
# drop=True discards the shuffled original index instead of keeping it as a column.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Target vectors for each partition.
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

# Remove the raw price that the binary target was derived from.
# 'above_average' is intentionally kept in the frames: a later cell reads it
# from df_train for the mutual information score, and it is excluded from the
# model inputs through the `numerical` / `categorical` column lists.
df_train = df_train.drop(columns=['median_house_value'])
df_val = df_val.drop(columns=['median_house_value'])
df_test = df_test.drop(columns=['median_house_value'])

12384 4128 4128


## Question 2
* Create the correlation matrix for the numerical features of your **train** dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?

In [8]:
# Inspect the dtype of every column; the following cells use the dtypes to
# separate numerical features from categorical (object) ones.
df.dtypes

latitude                    float64
longitude                   float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
median_house_value          float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
above_average                 int64
dtype: object

In [9]:
# Every non-object column counts as numerical ...
numerical = df.dtypes[df.dtypes != 'object'].index.tolist()

# ... except the raw price and its binarized form, which are targets, not features.
for target in ('median_house_value', 'above_average'):
    numerical.remove(target)

numerical

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [10]:
# Object-typed columns are treated as categorical features.
categorical = df.dtypes[df.dtypes == 'object'].index.tolist()
categorical

['ocean_proximity']

In [11]:
# Pairwise correlations between the numerical features of the training set,
# stored as a list of {'name': '<a>___<b>', 'corr': <coefficient>} records.
corr_matrix = []

for i, c in enumerate(numerical):
    # Start at i + 1: corr(a, b) == corr(b, a), so each unordered pair is
    # computed once and a feature is never paired with itself.
    for o in numerical[i + 1:]:
        corr_matrix.append({'name': '%s___%s' % (c, o),
                            'corr': df_train[c].corr(df_train[o])})

# Rank by magnitude so a strong negative correlation is not overlooked.
max(corr_matrix, key=lambda d: abs(d['corr']))

{'name': 'total_bedrooms___households', 'corr': 0.979399352769416}

## Question 3
* Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
* What is the value of mutual information?
* Round it to 2 decimal digits using round(score, 2)

In [12]:
# Mutual information between the binarized price and ocean_proximity on the
# training set, rounded to two decimals.
round(mutual_info_score(df_train['above_average'], df_train['ocean_proximity']), 2)

0.1

## Question 4
* Now let's train a logistic regression
* Remember that we have one categorical variable `ocean_proximity` in the data. Include it using one-hot encoding.
* Fit the model on the training dataset.
  * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
  * `model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [13]:
# Vectorizer that turns per-row feature dicts into a dense feature matrix.
dv = DictVectorizer(sparse=False)

# One dict per training row, restricted to the selected feature columns.
dicts_train = df_train.loc[:, categorical + numerical].to_dict(orient='records')

# Learn the feature mapping from the training rows and encode them.
X_train = dv.fit_transform(dicts_train)

In [14]:
# Logistic regression configured with the parameters fixed by the assignment.
model = LogisticRegression(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=42,
)

# Train on the encoded training matrix and the binary target.
model.fit(X_train, y_train)

# Fitted parameters: intercept and one weight per encoded feature.
w0 = model.intercept_[0]
w = list(model.coef_[0])
(w0, w)

(-0.083829188440655,
 [0.18093530759982326,
  0.003954758787947762,
  0.03573955469112594,
  0.11598325702417112,
  0.08716280496403875,
  1.2120866441523424,
  0.473101980247531,
  -1.7147713304052514,
  0.01882696615444871,
  0.29153725950298887,
  0.8474759360596523,
  -0.0016371871780485171,
  0.010323040002691304,
  -0.014052207665170789,
  0.0018753074674865912,
  -0.00014597082449926822])

In [15]:
# Score the validation set with the model fitted on the training data.
# The vectorizer is only applied (transform), not re-fitted, so the feature
# layout matches the one the model was trained on.
dicts_val = df_val[categorical + numerical].to_dict(orient='records')
X_valid = dv.transform(dicts_val)

# Second predict_proba column, thresholded at 0.5 into a hard decision.
# Computed once; the earlier duplicate call whose result was discarded is gone.
val_pred = model.predict_proba(X_valid)[:,1] >= 0.5

# Accuracy: share of validation rows where the prediction matches the label.
(val_pred == y_val).mean().round(2)

0.84