<a href="https://colab.research.google.com/github/javohir-ergashboyev/ML-projects/blob/main/Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Predicting median house prices in California using a linear regression model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sb

In [4]:
# Location of the housing CSV inside the handson-ml2 GitHub repository.
url = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"

# Read the file into a DataFrame and preview its first five rows.
df = pd.read_csv(url)
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=35)

# Training predictors (target column removed) and an independent copy of the target.
housing = train_set.drop(columns='median_house_value')
housing_label = train_set['median_house_value'].copy()

# Predictors without the 'ocean_proximity' text column.
housing_num = housing.drop(columns='ocean_proximity')

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions read by CombineAttributeAdder.transform. They are plain
# integer offsets into the 2-D input, so the input's column order must match
# them (rooms=3, bedrooms=4, population=5, households=6).
room_ix, bedroom_ix, household_ix, population_ix = 3, 4, 6, 5


class CombineAttributeAdder(BaseEstimator, TransformerMixin):
  """Append ratio features computed from existing columns.

  Two columns are always appended, in this order: rooms per household and
  population per household. When ``add_bedroom`` is true, a third column,
  bedrooms per room, is appended after them.

  Parameters
  ----------
  add_bedroom : bool, default=True
      Whether to append the bedrooms-per-room column.
  """

  def __init__(self, add_bedroom=True):
    self.add_bedroom = add_bedroom

  def fit(self, X, y=None):
    """Do nothing and return ``self``; the transformer has no fitted state."""
    return self

  def transform(self, X):
    """Return ``X`` with the ratio columns appended on the right.

    ``X`` is any 2-D array-like whose columns sit at the positions given by
    the module-level ``*_ix`` constants. The result is a NumPy array with
    two or three more columns than ``X``.
    """
    # Positional slicing such as X[:, i] only works on ndarrays; converting
    # first lets DataFrames and nested lists be passed in as well.
    X = np.asarray(X)
    rooms_per_household = X[:, room_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if self.add_bedroom:
      bedrooms_per_room = X[:, bedroom_ix] / X[:, room_ix]
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
    return np.c_[X, rooms_per_household, population_per_household]

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing chain: fill missing values with the column median,
# append the ratio features, then standardise every column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('add_attr', CombineAttributeAdder(add_bedroom=True)),
    ('std', StandardScaler()),
])

# Fit on the numeric training columns and display the transformed matrix.
num_pipeline.fit_transform(housing_num)


array([[-1.25390838,  1.10757958, -1.79230424, ...,  0.10067299,
        -0.05831117, -0.44204612],
       [ 1.31649014, -0.79863258, -1.2367069 , ..., -0.04645309,
        -0.07680567, -0.14642693],
       [ 0.65894633, -0.77989831,  0.66819829, ..., -0.6536053 ,
         0.06743175,  0.76284898],
       ...,
       [ 0.80340671, -0.54103634,  0.27134305, ...,  0.34765451,
        -0.03553681, -0.50685707],
       [-1.12937357,  0.78909696, -0.52236745, ...,  0.09768495,
         0.14776252, -0.32081214],
       [ 0.62407658, -0.67217624,  0.58882724, ...,  0.35493053,
        -0.0368153 , -0.65791088]])

In [12]:
from sklearn.compose import ColumnTransformer
# Column names routed to each branch of the combined preprocessing.
num_attributes = list(housing_num)
str_attributes = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    # Numeric columns go through the imputer / ratio-feature / scaler chain.
    ('num_val', num_pipeline, num_attributes),
    # handle_unknown='ignore': a category that was absent when the encoder was
    # fitted is encoded as all zeros at transform time instead of raising,
    # so transforming other data cannot fail on a category unseen at fit time.
    ('str_val', OneHotEncoder(handle_unknown='ignore'), str_attributes),
])


In [14]:
# Fit the whole preprocessing on the training predictors and transform them,
# then show the resulting feature matrix.
housing_prapared = full_pipeline.fit_transform(housing)
housing_prapared

array([[-1.25390838,  1.10757958, -1.79230424, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.31649014, -0.79863258, -1.2367069 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.65894633, -0.77989831,  0.66819829, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.80340671, -0.54103634,  0.27134305, ...,  0.        ,
         0.        ,  0.        ],
       [-1.12937357,  0.78909696, -0.52236745, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.62407658, -0.67217624,  0.58882724, ...,  0.        ,
         0.        ,  0.        ]])

In [15]:
# Show one prepared row (index 1) to inspect the final feature layout.
housing_prapared[1]

array([ 1.31649014, -0.79863258, -1.2367069 ,  2.35239297,  2.50973879,
        1.63187609,  2.52702704, -0.9522322 , -0.04645309, -0.07680567,
       -0.14642693,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ])

In [16]:
from sklearn.linear_model import LinearRegression
# Linear regression fitted on the prepared training features and their labels.
LR_model = LinearRegression()
LR_model.fit(X=housing_prapared, y=housing_label)

In [17]:
# Spot check: draw ten rows from the training predictors (`housing`), not from
# the held-out test split, so this only sanity-checks the fitted model.
# Seeding the draw makes the same ten rows come back on every run.
test_data = housing.sample(n=10, random_state=35)

# True prices for exactly those rows, looked up by index label.
test_label = housing_label.loc[test_data.index]

In [19]:
# Apply the already-fitted preprocessing to the sampled rows (transform only, no refit).
test_data_prapared = full_pipeline.transform(test_data)

In [20]:
# Model predictions for the sampled rows, displayed as the cell output.
predicted_labels = LR_model.predict(test_data_prapared)
predicted_labels

array([230128.28452488, 144704.62817245, 281488.13426824, 345466.13477091,
       512003.67909608, 136634.25396599, 257183.80795767, 314680.01098725,
       149695.45528553, 160076.45970778])

In [21]:
# Side-by-side table of true and predicted values for the sampled rows.
pd.DataFrame({
    'Real price': test_label,
    'Predicted_price': predicted_labels,
})

Unnamed: 0,Real price,Predicted_price
17594,265000.0,230128.284525
4313,165000.0,144704.628172
17980,307000.0,281488.134268
9006,407000.0,345466.134771
5269,500001.0,512003.679096
2339,123300.0,136634.253966
5827,332400.0,257183.807958
18102,417000.0,314680.010987
4499,131900.0,149695.455286
13029,147000.0,160076.459708


In [28]:
# Separate the held-out test split into predictors and target.
real_test_data = test_set.drop(columns='median_house_value')
real_test_label = test_set['median_house_value'].copy()

In [29]:
# Transform the test predictors with the preprocessing fitted on the training data.
real_test_prepared = full_pipeline.transform(real_test_data)

In [30]:
# Predictions for every row of the held-out test split.
full_prediction = LR_model.predict(real_test_prepared)
full_prediction

array([307632.80205984,  67611.8985497 , 247978.14574414, ...,
       168513.66979661, 192401.46975119, 258855.47014557])

In [31]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the true and predicted test-set values.
mae = mean_absolute_error(y_true=real_test_label, y_pred=full_prediction)
print("MAE: ", mae)

MAE:  49810.06572460955


In [32]:
from sklearn.metrics import mean_squared_error
# Mean squared error between the true and predicted test-set values.
mse = mean_squared_error(y_true=real_test_label, y_pred=full_prediction)
print('MSE: ', mse)

MSE:  4777103511.614231
