In [1]:
#importing libraries
import pandas as pd
import numpy as np

In [2]:
#loading data
housing =pd.read_csv('housing.csv')

In [3]:
#dividing data into training and test set
from sklearn.model_selection import train_test_split
train,test = train_test_split(housing,test_size=0.2,random_state=42)

In [4]:
# checking the dimensions of the training and test split
test.shape,test.shape

((4128, 10), (4128, 10))

In [5]:
# filling missing values in numericals with median values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')

In [6]:
# dropping the 'ocean_proximity' column in test data
test_num = test.drop('ocean_proximity',axis =1)

In [7]:
# applying the median strategy on numerical columns
imputer.fit(test_num)

In [8]:
# computing the median for each  numerical column
imputer.statistics_

array([-1.1847e+02,  3.4230e+01,  2.9000e+01,  2.1100e+03,  4.2800e+02,
        1.1600e+03,  4.0600e+02,  3.5000e+00,  1.7865e+05])

In [9]:
# computing the median for each numerical column
test_num.median().values

array([-1.1847e+02,  3.4230e+01,  2.9000e+01,  2.1100e+03,  4.2800e+02,
        1.1600e+03,  4.0600e+02,  3.5000e+00,  1.7865e+05])

In [10]:
# tranform the numerical columns with the median strategy to fill missing values
Y = imputer.transform(test_num)

In [11]:
# converting the transformed numerical columns from a numpy array to a dataframe
test_tr = pd.DataFrame(Y,columns = test_num.columns)

In [12]:
#loading the 'ocean_proximity' into another object
test_cat = test[['ocean_proximity']]
test_cat.head()

Unnamed: 0,ocean_proximity
20046,INLAND
3024,INLAND
15663,NEAR BAY
20484,<1H OCEAN
9814,NEAR OCEAN


In [13]:
# applying OneHotEncoding on the 'ocean_proximity' column
from sklearn.preprocessing import OneHotEncoder

test_cat_encoder = OneHotEncoder()
test_cat_1hot = test_cat_encoder.fit_transform(test_cat)
test_cat_1hot


<4128x5 sparse matrix of type '<class 'numpy.float64'>'
	with 4128 stored elements in Compressed Sparse Row format>

In [14]:
# converting sparse matrix to array
test_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [15]:
# displaying the various categories of the 'ocean_proximity' column
test_cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [16]:
#Feature engineering
test['rooms_per_household'] = test['total_rooms']/test['households']
test['bedrooms_per_room'] = test['total_bedrooms']/test['total_rooms']
test['population_per_household'] = test['population']/test['households']

In [17]:
# correlation of numerical columns in test data with the 'median_house_value' column
test_corr_matrix = test.corr()
test_corr_matrix['median_house_value'].sort_values(ascending = True)

  test_corr_matrix = test.corr()


bedrooms_per_room          -0.249196
latitude                   -0.149295
population_per_household   -0.121853
longitude                  -0.044062
population                 -0.019003
total_bedrooms              0.056667
households                  0.074249
housing_median_age          0.113585
rooms_per_household         0.130928
total_rooms                 0.134697
median_income               0.677502
median_house_value          1.000000
Name: median_house_value, dtype: float64

In [18]:
# copying the 'median_house_value' column into another object
test_labels = test['median_house_value'].copy()

In [19]:
# dropping the 'median_house_value' column from the test set
test = test.drop('median_house_value',axis =1)

In [20]:
# dropping the 'ocean_proximity' column from the test data once again
test_num = test.drop('ocean_proximity',axis =1)

In [21]:
# displaying the top 5 most frequent test label values or 'median_house_value' column values
test_labels.head()

20046     47700
3024      45800
15663    500001
20484    218600
9814     278000
Name: median_house_value, dtype: int64

In [22]:
#display some information about the test data
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 20046 to 3665
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 4128 non-null   float64
 1   latitude                  4128 non-null   float64
 2   housing_median_age        4128 non-null   int64  
 3   total_rooms               4128 non-null   int64  
 4   total_bedrooms            3921 non-null   float64
 5   population                4128 non-null   int64  
 6   households                4128 non-null   int64  
 7   median_income             4128 non-null   float64
 8   ocean_proximity           4128 non-null   object 
 9   rooms_per_household       4128 non-null   float64
 10  bedrooms_per_room         3921 non-null   float64
 11  population_per_household  4128 non-null   float64
dtypes: float64(7), int64(4), object(1)
memory usage: 419.2+ KB


In [23]:
#Feature scaling of numerical columns using standandization 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


test_num_pipeline = Pipeline([('imputer',SimpleImputer(strategy ='median')),('std_scaler', StandardScaler())])

test_num_tr = test_num_pipeline.fit_transform(test_num)

In [24]:
#Combining the numerical columns back with the 'ocean_proximity' column after encoding
from sklearn.compose import ColumnTransformer
test_num_attribs = list(test_num)
test_cat_attribs = ['ocean_proximity']

test_full_pipeline = ColumnTransformer([('num',test_num_pipeline,test_num_attribs),('cat',OneHotEncoder(),
test_cat_attribs)])

test_prepared = test_full_pipeline.fit_transform(test)

In [25]:
#display dimensions of final prepared testing data without labels or output('meadian_house_value)
test_prepared.shape

(4128, 16)

In [26]:
#loading pre-trained random forest model
import pickle

model = pickle.load(open('forest_housing_model.sav','rb'))

In [27]:
model

In [28]:
# predict median_house_value based on input features from test set
model.predict(test_prepared)

array([ 67443.33333333,  95216.66666667, 389480.36666667, ...,
       472947.43333333,  93660.        , 156626.66666667])

In [29]:
# computing the root mean squared error of the random forest model on test set 
from sklearn.metrics import mean_squared_error
test_predictions = model.predict(test_prepared)

test_mse = mean_squared_error(test_labels,test_predictions)

test_rmse = np.sqrt(test_mse)

test_rmse



65365.11912472603