In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Load the housing data from 'housing.csv' (path is relative to the working directory).
data = pd.read_csv('housing.csv')
# Preview five randomly chosen rows; no seed is passed, so the rows shown differ between runs.
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
6231,-117.93,34.07,36.0,1207.0,209.0,683.0,213.0,5.3559,207300.0,<1H OCEAN
6603,-118.16,34.19,44.0,2195.0,449.0,1377.0,417.0,3.5887,153500.0,<1H OCEAN
14591,-117.17,32.82,24.0,1623.0,417.0,911.0,397.0,2.7401,198100.0,NEAR OCEAN
4398,-118.28,34.08,42.0,1618.0,522.0,1454.0,440.0,3.1607,182000.0,<1H OCEAN
7585,-118.22,33.9,40.0,1802.0,496.0,2096.0,468.0,2.3542,97900.0,<1H OCEAN


In [3]:
# Discard every row that has a missing value in any column.
data = data.dropna()

In [4]:
# (rows, columns) remaining after the rows with missing values were dropped.
data.shape

(20433, 10)

In [6]:
# Per-column non-null count of the rows whose median_house_value is exactly 500001.
# NOTE(review): 500001 looks like a top-coded ceiling rather than a real price — confirm
# against the dataset description.
data.loc[data['median_house_value'] == 500001].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [7]:
# Keep only the rows whose median_house_value differs from 500001.
# NOTE(review): 500001 looks like a top-coded ceiling value — confirm against the dataset description.
excluded_rows = data['median_house_value'] == 500001
data = data.loc[~excluded_rows]

In [8]:
# (rows, columns) remaining after the rows with median_house_value == 500001 were removed.
data.shape

(19475, 10)

In [9]:
# List the distinct values of the categorical ocean_proximity column.
data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [10]:
# One-hot encode ocean_proximity into one indicator column per category.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 defaults to
# boolean dummies, while older releases produced uint8 0/1 values, so without
# it the encoded columns look different depending on the installed version.
data = pd.get_dummies(data, columns=['ocean_proximity'], dtype=int)

In [11]:
# (rows, columns) after encoding: ocean_proximity is replaced by one indicator column per category.
data.shape

(19475, 14)

In [14]:
# Preview five random rows to inspect the new ocean_proximity_* indicator columns.
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
3966,-118.57,34.2,36.0,2559.0,469.0,1358.0,445.0,4.5568,201500.0,1,0,0,0,0
12831,-121.44,38.68,19.0,2476.0,534.0,1355.0,463.0,2.0625,94400.0,0,1,0,0,0
11201,-117.92,33.83,17.0,382.0,86.0,272.0,81.0,1.425,212500.0,1,0,0,0,0
20444,-118.9,34.3,13.0,5591.0,1013.0,3188.0,971.0,5.5925,208600.0,1,0,0,0,0
13614,-117.26,34.13,37.0,2403.0,550.0,1234.0,493.0,2.0,72100.0,0,1,0,0,0


In [15]:
# The same dataset can be reused for classification: compute the median house
# price, then predict whether a neighbourhood's value falls above or below it.
#
# Thresholding the target like this is a simple way to turn a regression
# problem into a classification problem.

house_values = data['median_house_value']
median = house_values.median()
median

173800.0

In [17]:
# Add the classification target as a new boolean column, 'above_median'.
# It is True when the row's median_house_value is strictly greater than
# `median`, and False otherwise (including values exactly equal to it).

data['above_median'] = data['median_house_value'] > median

In [18]:
# (rows, columns) after adding the above_median column.
data.shape

(19475, 15)

In [21]:
# Preview five random rows to inspect the new above_median column.
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
16972,-122.31,37.56,45.0,1685.0,321.0,815.0,314.0,4.2955,309700.0,0,0,0,0,1,True
15801,-122.44,37.76,30.0,5089.0,1210.0,1935.0,1139.0,4.6053,386100.0,0,0,0,1,0,True
2739,-115.57,32.78,25.0,2007.0,301.0,1135.0,332.0,5.128,99600.0,0,1,0,0,0,False
10560,-117.7,33.63,23.0,3038.0,473.0,1501.0,436.0,5.5584,241700.0,1,0,0,0,0,True
20218,-119.24,34.28,41.0,1280.0,240.0,608.0,252.0,4.4038,229100.0,0,0,0,0,1,True


In [22]:
# Let's now set up the features and the labels.

# Features: every column except the raw price and the label derived from it.
X = data.drop(columns=['median_house_value', 'above_median'])
# Labels: the boolean above/below-median indicator.
Y = data['above_median']

In [24]:
# This is a binary classification problem: the output is either True or False.

# True if the predicted house price is above the median
# False if the predicted house price is not above the median

# Below are the features used to train the logistic regression classifier

X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state fixes the shuffle so the
# split, and every score computed from it, is identical on each
# Restart-and-Run-All; without it the reported accuracies change run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [26]:
from sklearn.linear_model import LogisticRegression

# `solver` selects the algorithm used for the optimisation problem; 'liblinear'
# is a good choice for small datasets and binary classification.
classifier = LogisticRegression(solver='liblinear')
# fit() returns the fitted estimator itself, so both names refer to the same model.
logistic_model = classifier.fit(x_train, y_train)

In [27]:
# Mean accuracy of the fitted model on the data it was trained on.
train_accuracy = logistic_model.score(x_train, y_train)
print('Training Score : ', train_accuracy)

Training Score :  0.8177150192554558


In [28]:
# Predict the above/below-median label for every row of the held-out test features.
y_pred = logistic_model.predict(x_test)

In [30]:
# Put the predictions and the true labels side by side for inspection.
# The frame takes its row index from y_test; y_pred is matched to it by position.

comparison_columns = {'predicted': y_pred, 'actual': y_test}
df_pred_actual = pd.DataFrame(comparison_columns)
df_pred_actual.head(10)

Unnamed: 0,predicted,actual
11886,False,False
16237,False,False
16429,True,True
8131,True,True
13128,False,False
13344,False,False
19717,False,False
15859,True,True
4551,False,True
1854,False,False


In [31]:
# Accuracy on the held-out test set: the fraction of predictions that match the true labels.

from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_test, y_pred)
print('Testing Score : ', test_accuracy)

Testing Score :  0.8272143774069319
