In [24]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [25]:
# Read the housing CSV (path is relative to the notebook's working directory)
# into a DataFrame.
housing_data = pd.read_csv(filepath_or_buffer='datasets/housing.csv')

## Clean data

In [26]:
# Keep only rows whose median_house_value is not 500001 (presumably a capped
# value in this dataset -- TODO confirm), then discard any row that still
# contains a missing value.
housing_data = (
    housing_data
    .loc[housing_data['median_house_value'] != 500001]
    .dropna()
)

In [27]:
# Replace the categorical 'ocean_proximity' column with one indicator column
# per category (named 'ocean_proximity_<category>').
housing_data = pd.get_dummies(
    data=housing_data,
    columns=['ocean_proximity'],
)

## Prepare data

In [28]:
# Median of the house-value column; used as the threshold for the binary
# label built in the next cell. The bare name on the last line displays it.
median = housing_data.loc[:, 'median_house_value'].median()
median

173800.0

In [29]:
# Boolean label: True where the house value is strictly greater than the
# median, False otherwise (values equal to the median are False).
housing_data['above_median'] = housing_data['median_house_value'].gt(median)

In [30]:
# Show five randomly chosen rows as a sanity check of the prepared frame.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
16634,-120.84,35.31,23.0,3100.0,603.0,1515.0,609.0,2.8493,196100.0,0,0,0,0,1,True
3299,-122.62,38.95,19.0,2230.0,538.0,832.0,359.0,1.6865,58800.0,0,1,0,0,0,False
5684,-118.28,33.68,8.0,2842.0,522.0,1624.0,510.0,3.7282,287500.0,0,0,0,0,1,True
14951,-116.96,32.71,18.0,2413.0,533.0,1129.0,551.0,2.4567,155000.0,1,0,0,0,0,False
315,-122.19,37.76,45.0,995.0,238.0,630.0,237.0,1.925,74100.0,0,0,0,1,0,False


## Set Features and Targets

In [31]:
# Features: every column except the raw house value and the label derived
# from it (keeping either would leak the target into the inputs).
X = housing_data.drop(columns=['median_house_value', 'above_median'])

# Target: the boolean above-median flag.
Y = housing_data.loc[:, 'above_median']

In [32]:
# Display the feature column names to confirm that 'median_house_value' and
# 'above_median' are no longer among the inputs.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [33]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so the split -- and therefore the fitted model and every
# score reported below -- is reproducible on Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [34]:
# (rows, columns) of the training and test feature matrices.
(x_train.shape, x_test.shape)

((15580, 13), (3895, 13))

In [35]:
# Lengths of the training and test label vectors.
(y_train.shape, y_test.shape)

((15580,), (3895,))

## Logistic regression

In [39]:
# Build the classifier with the liblinear solver, then fit it on the training
# split. fit() returns the estimator itself, so the name keeps pointing at the
# fitted model.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(x_train, y_train)

In [40]:
# Mean accuracy of the fitted model on the data it was trained on.
training_score = logistic_model.score(x_train, y_train)
print('Training score:', training_score)

Training score: 0.8209242618741976


## Visualizing results

In [41]:
# Predicted labels for the held-out test features.
y_pred = logistic_model.predict(X=x_test)

In [42]:
# Put predictions next to the true labels. The frame takes its row index from
# y_test, so each row keeps the original index of its test example.
df_pred_actual = pd.DataFrame(
    data={'predicted': y_pred, 'actual': y_test},
)
df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
5465,True,True
5518,True,True
7822,True,True
20353,True,True
2325,False,False
18993,False,True
19311,True,True
19611,False,False
12506,False,False
19135,True,True
