In [2]:
import pandas as pd

import matplotlib.pyplot as plt

In [4]:
# Read the housing dataset from a CSV file given by a relative path.
housing_data = pd.read_csv(filepath_or_buffer='housing.csv')

# Show five randomly chosen rows as a quick sanity check of the columns.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
17031,-122.25,37.5,44.0,348.0,79.0,154.0,73.0,4.7708,253800.0,NEAR OCEAN
10110,-117.95,33.92,13.0,2312.0,592.0,2038.0,559.0,3.1378,137000.0,<1H OCEAN
19311,-122.89,38.38,16.0,2017.0,369.0,931.0,336.0,5.7664,267500.0,<1H OCEAN
5893,-118.31,34.16,38.0,2347.0,665.0,1317.0,547.0,3.2112,349300.0,<1H OCEAN
17269,-119.7,34.4,25.0,1858.0,493.0,865.0,460.0,3.0938,312500.0,NEAR OCEAN


In [5]:
# Discard every row that has a missing value in any column
# (axis=0 / how='any' are the pandas defaults, spelled out for clarity).
housing_data = housing_data.dropna(axis=0, how='any')

In [6]:
# (rows, columns) remaining after rows with missing values were dropped.
housing_data.shape

(20433, 10)

In [7]:
# Count, per column, the rows whose median_house_value is exactly 500001.
# NOTE(review): 500001 looks like a ceiling/censoring value in this dataset — confirm.
hits_500001 = housing_data['median_house_value'] == 500001
housing_data[hits_500001].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [8]:
# Keep only the rows whose median_house_value is not 500001, i.e. remove
# exactly the rows counted in the previous cell.
not_500001 = housing_data['median_house_value'] != 500001
housing_data = housing_data.loc[not_500001]

In [9]:
# (rows, columns) after removing the rows whose median_house_value equals 500001.
housing_data.shape

(19475, 10)

In [10]:
# Distinct category labels present in the ocean_proximity column.
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [11]:
# One-hot encode ocean_proximity: the column is replaced by one
# 'ocean_proximity_<label>' indicator column per category.
housing_data = pd.get_dummies(data=housing_data, columns=['ocean_proximity'])

In [12]:
# (rows, columns) after one-hot encoding ocean_proximity.
housing_data.shape

(19475, 14)

In [13]:
# Five randomly sampled rows, to inspect the new ocean_proximity_* indicator columns.
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
1576,-121.99,37.81,17.0,465.0,83.0,146.0,75.0,4.9018,188500.0,1,0,0,0,0
14533,-117.13,32.93,16.0,2918.0,444.0,1697.0,444.0,5.3062,195500.0,1,0,0,0,0
14233,-117.01,32.7,7.0,2327.0,490.0,1304.0,445.0,3.3553,132200.0,0,0,0,0,1
2834,-118.99,35.4,43.0,2225.0,392.0,890.0,374.0,4.0208,90400.0,0,1,0,0,0
14112,-117.11,32.74,25.0,2846.0,644.0,2272.0,632.0,2.2,98700.0,0,0,0,0,1


In [15]:
# Median of the median_house_value column; used below as the threshold
# for the binary above_median target.
median = housing_data['median_house_value'].median()

In [16]:
# Display the threshold value computed in the previous cell.
median

173800.0

In [17]:
# Binary target: True where median_house_value is strictly greater than the
# median, False otherwise (values equal to the median are False).
housing_data['above_median'] = housing_data['median_house_value'].gt(median)

In [18]:
# Five randomly sampled rows, to check the newly added above_median column.
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
3395,-118.3,34.26,40.0,1065.0,214.0,605.0,183.0,4.1964,185900.0,1,0,0,0,0,True
19234,-122.69,38.51,18.0,3364.0,501.0,1442.0,506.0,6.6854,313000.0,1,0,0,0,0,True
1907,-120.04,38.86,16.0,2708.0,481.0,712.0,261.0,3.7891,117700.0,0,1,0,0,0,False
10456,-117.64,33.49,3.0,2516.0,429.0,781.0,337.0,5.6197,271600.0,1,0,0,0,0,True
16316,-121.32,38.0,21.0,1795.0,482.0,1114.0,472.0,2.0091,101500.0,0,1,0,0,0,False


In [19]:
# Features: every column except the raw house value and the label derived
# from it (keeping either would leak the target into the inputs).
X = housing_data.drop(columns=['median_house_value', 'above_median'])

# Label: the boolean above_median column.
Y = housing_data.loc[:, 'above_median']

In [20]:
# Feature column names left after removing median_house_value and above_median.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so Restart & Run All reproduces the same split and therefore
# the same training/testing scores; without it every run gives different numbers.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [22]:
# (rows, columns) of the training and test feature sets.
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

In [23]:
# Shapes of the training and test label sets; row counts should match the feature sets.
y_train.shape, y_test.shape

((15580,), (3895,))

In [24]:
from sklearn.linear_model import LogisticRegression

# Build the classifier with the liblinear solver, then fit it on the training
# split. fit() returns the estimator itself, so re-binding the name keeps the
# cell free of extra output.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(x_train, y_train)

In [25]:
# Score of the fitted model on the same data it was trained on.
train_accuracy = logistic_model.score(x_train, y_train)
print("Training score : ", train_accuracy)

Training score :  0.8197047496790757


In [26]:
# Predicted above_median labels for the held-out test features.
y_pred = logistic_model.predict(x_test)

In [27]:
# Put predictions next to the true labels for a side-by-side look.
comparison_columns = {'predicted': y_pred, 'actual': y_test}
df_pred_actual = pd.DataFrame(comparison_columns)

# First ten rows of the comparison table.
df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
2270,False,False
2849,False,False
1569,True,True
20142,False,True
20172,True,True
14612,True,False
9648,False,False
85,False,False
19321,True,False
8995,True,True


In [28]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing score : ", test_accuracy)

Testing score :  0.8228498074454429
