In [205]:
#import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [28]:
!ls

California House Pricing Project.ipynb housing_data.xlsx


In [29]:
#loading the housing data
house_data = pd.read_excel('housing_data.xlsx')

In [206]:
#Printing the first few lines
house_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280,565,259,3.8462,NEAR BAY,342200


In [207]:
#Extract input (X) and output (Y) data from the dataset.
x_feature = house_data.drop('median_house_value', axis = "columns")
y_target = house_data.median_house_value

In [208]:
#feature everything except median_house_value
x_feature.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41,880,129,322,126,8.3252,NEAR BAY
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,NEAR BAY
2,-122.24,37.85,52,1467,190,496,177,7.2574,NEAR BAY
3,-122.25,37.85,52,1274,235,558,219,5.6431,NEAR BAY
4,-122.25,37.85,52,1627,280,565,259,3.8462,NEAR BAY


In [114]:
#Target is median_house_value
y_target.head()

0    452600
1    358500
2    352100
3    341300
4    342200
Name: median_house_value, dtype: int64

In [209]:
#Fill the missing values with the mean of the respective column
#We see that total_bedrooms has null value
x_feature.isnull().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
ocean_proximity       False
dtype: bool

In [210]:
#CHecking how many null values
x_feature.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
dtype: int64

In [211]:
#Filling total_bedrooms with the mean of total_bedrooms
house_data.total_bedrooms.fillna(np.mean, inplace = True)
house_data.isnull().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
ocean_proximity       False
median_house_value    False
dtype: bool

In [212]:
#Convert categorical column in the dataset to numerical data.
#We see ocean_proximity is categorical data
house_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280,565,259,3.8462,NEAR BAY,342200


In [213]:
#So to convert ocean_proximity, we will Pandas dummy variable then we will concat this data with the other features
house_data.groupby('ocean_proximity').size()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
ISLAND           5
NEAR BAY      2290
NEAR OCEAN    2658
dtype: int64

In [214]:
x = x_feature['ocean_proximity'].str.get_dummies()
x_features = pd.concat([x_feature.drop(['ocean_proximity'], axis =1), x], axis = 1)

In [215]:
x_features.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41,880,129,322,126,8.3252,0,0,0,1,0
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,0,0,0,1,0
2,-122.24,37.85,52,1467,190,496,177,7.2574,0,0,0,1,0
3,-122.25,37.85,52,1274,235,558,219,5.6431,0,0,0,1,0
4,-122.25,37.85,52,1627,280,565,259,3.8462,0,0,0,1,0


In [216]:
x_features['total_bedrooms'] = pd.to_numeric(x_features['total_bedrooms'], errors='coerce')
x_features.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
<1H OCEAN               int64
INLAND                  int64
ISLAND                  int64
NEAR BAY                int64
NEAR OCEAN              int64
dtype: object

In [217]:
#longitude and latitude will not help our model so we will drop them
x_features.drop(['longitude','latitude'], axis = 'columns', inplace = True)

In [218]:
x_features.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,41,880,129.0,322,126,8.3252,0,0,0,1,0
1,21,7099,1106.0,2401,1138,8.3014,0,0,0,1,0
2,52,1467,190.0,496,177,7.2574,0,0,0,1,0
3,52,1274,235.0,558,219,5.6431,0,0,0,1,0
4,52,1627,280.0,565,259,3.8462,0,0,0,1,0


In [219]:
#Split the data into 80% training dataset and 20% test dataset.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size = 0.2, random_state = 1)

In [220]:
#We need to change total bedroom so it is not an object. This is done on 'IN [193]'
x_train.dtypes

housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
<1H OCEAN               int64
INLAND                  int64
ISLAND                  int64
NEAR BAY                int64
NEAR OCEAN              int64
dtype: object

In [221]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [226]:
x_train.total_bedrooms.fillna(np.mean, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [224]:
model.fit(x_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [225]:
x_train

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
15961,52,1410,286.0,879,282,3.1908,0,0,0,1,0
1771,42,1485,290.0,971,303,3.6094,0,0,0,1,0
16414,16,50,10.0,20,6,2.6250,0,1,0,0,0
5056,34,5218,1576.0,3538,1371,1.5143,1,0,0,0,0
8589,38,1851,332.0,750,314,7.3356,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
10955,17,1768,474.0,1079,436,1.7823,1,0,0,0,0
17289,42,1765,263.0,753,260,8.5608,1,0,0,0,0
5192,42,1433,295.0,775,293,1.1326,1,0,0,0,0
12172,10,2381,454.0,1323,477,2.6322,0,1,0,0,0
