<a href="https://colab.research.google.com/github/javohir-ergashboyev/ML-projects/blob/main/Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Predicting median house prices in California using a linear regression model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sb

In [2]:
# Location of the housing CSV (raw file served from the handson-ml2 GitHub repo).
url = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"

# Read the whole file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=35)

# Training predictors (target column removed) and the training target itself.
housing = train_set.drop(columns='median_house_value')
housing_label = train_set['median_house_value'].copy()

# Predictors without the 'ocean_proximity' column, for the numeric pipeline.
housing_num = housing.drop(columns='ocean_proximity')

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns used below, relative to the numeric
# feature array (longitude=0, latitude=1, housing_median_age=2, total_rooms=3,
# total_bedrooms=4, population=5, households=6, median_income=7).
room_ix, bedroom_ix, household_ix, population_ix = 3, 4, 6, 5


class CombineAttributeAdder(BaseEstimator, TransformerMixin):
  """Append ratio features computed from existing numeric columns.

  The transformer is stateless: ``fit`` learns nothing and ``transform``
  returns the input with extra columns appended on the right, in this order:

  1. rooms per household      (``room_ix`` / ``household_ix``)
  2. population per household (``population_ix`` / ``household_ix``)
  3. bedrooms per room        (``bedroom_ix`` / ``room_ix``), only when
     ``add_bedroom`` is true.

  Parameters
  ----------
  add_bedroom : bool, default=True
      Whether to append the bedrooms-per-room column.
  """

  def __init__(self, add_bedroom=True):
    self.add_bedroom = add_bedroom

  def fit(self, X, y=None):
    """Nothing to learn; return the transformer itself."""
    return self

  def transform(self, X):
    """Return ``X`` with the ratio columns appended.

    ``X`` is a 2-D array-like whose columns sit at the positions named by the
    module-level ``*_ix`` constants.
    """
    # Positional slicing such as X[:, i] fails on a pandas DataFrame, so
    # coerce to an ndarray first (a no-op when X already is one).
    X = np.asarray(X)
    rooms_per_household = X[:, room_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if self.add_bedroom:
      bedrooms_per_room = X[:, bedroom_ix] / X[:, room_ix]
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
    return np.c_[X, rooms_per_household, population_per_household]

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in order:
#   1. replace missing values with the column median,
#   2. append the ratio features,
#   3. standardise every column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('add_attr', CombineAttributeAdder(add_bedroom=True)),
    ('std', StandardScaler()),
])

# Fit on the numeric training columns and display the transformed array.
num_pipeline.fit_transform(housing_num)


array([[-1.25390838,  1.10757958, -1.79230424, ...,  0.10067299,
        -0.05831117, -0.44204612],
       [ 1.31649014, -0.79863258, -1.2367069 , ..., -0.04645309,
        -0.07680567, -0.14642693],
       [ 0.65894633, -0.77989831,  0.66819829, ..., -0.6536053 ,
         0.06743175,  0.76284898],
       ...,
       [ 0.80340671, -0.54103634,  0.27134305, ...,  0.34765451,
        -0.03553681, -0.50685707],
       [-1.12937357,  0.78909696, -0.52236745, ...,  0.09768495,
         0.14776252, -0.32081214],
       [ 0.62407658, -0.67217624,  0.58882724, ...,  0.35493053,
        -0.0368153 , -0.65791088]])

In [6]:
from sklearn.compose import ColumnTransformer
# Column names routed to each branch of the preprocessing.
num_attributes = list(housing_num)
str_attributes = ['ocean_proximity']

# Numeric columns go through num_pipeline; the text column is one-hot encoded.
# handle_unknown='ignore' makes a category that was absent when the encoder was
# fitted encode as all zeros instead of raising during a later transform()
# (the default, 'error', raises).
full_pipeline = ColumnTransformer([
    ('num_val', num_pipeline, num_attributes),
    ('str_val', OneHotEncoder(handle_unknown='ignore'), str_attributes),
])


In [7]:
# Fit the complete preprocessing on the training predictors and keep the
# transformed matrix used to train the models.
housing_prapared = full_pipeline.fit_transform(housing)

housing_prapared

array([[-1.25390838,  1.10757958, -1.79230424, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.31649014, -0.79863258, -1.2367069 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.65894633, -0.77989831,  0.66819829, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.80340671, -0.54103634,  0.27134305, ...,  0.        ,
         0.        ,  0.        ],
       [-1.12937357,  0.78909696, -0.52236745, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.62407658, -0.67217624,  0.58882724, ...,  0.        ,
         0.        ,  0.        ]])

In [8]:
# Inspect a single prepared row (position 1) to see every transformed column.
housing_prapared[1]

array([ 1.31649014, -0.79863258, -1.2367069 ,  2.35239297,  2.50973879,
        1.63187609,  2.52702704, -0.9522322 , -0.04645309, -0.07680567,
       -0.14642693,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ])

In [9]:
from sklearn.linear_model import LinearRegression
# Linear-regression model trained on the prepared training data.
LR_model = LinearRegression()
LR_model.fit(X=housing_prapared, y=housing_label)

In [10]:
# Quick sanity check on ten rows drawn from the *training* predictors
# (despite the name, this is not held-out data). The seed makes the sample,
# and therefore the comparison table below, identical on every run; 35 is the
# same value used for the train/test split.
test_data = housing.sample(n=10, random_state=35)

# Matching targets, looked up by the sampled rows' index labels.
test_label = housing_label.loc[test_data.index]

In [11]:
# Apply the already-fitted preprocessing (transform only) to the sampled rows.
test_data_prapared = full_pipeline.transform(test_data)

In [12]:
# Linear-regression predictions for the sampled rows.
predicted_labels = LR_model.predict(test_data_prapared)

predicted_labels

array([234253.20359404, 124440.79222833, 270381.88545106, 381761.03928966,
       671583.47037169, 175494.13293213,  89653.99532174, 119336.65285397,
        74460.81498691, 265370.67659109])

In [13]:
# Side-by-side table of actual versus predicted values for the sampled rows.
pd.DataFrame(
    {
        'Real price': test_label,
        'Predicted_price': predicted_labels,
    }
)

Unnamed: 0,Real price,Predicted_price
740,184000.0,234253.203594
13629,86700.0,124440.792228
16098,283200.0,270381.885451
17018,386800.0,381761.03929
8852,500001.0,671583.470372
11985,142400.0,175494.132932
20082,111300.0,89653.995322
14760,125000.0,119336.652854
20077,118800.0,74460.814987
16394,220700.0,265370.676591


In [14]:
# Held-out predictors (target column removed) and the held-out target.
real_test_data = test_set.drop(columns='median_house_value')
real_test_label = test_set['median_house_value'].copy()

In [15]:
# Transform the held-out predictors with the pipeline fitted earlier
# (transform only, so nothing is refitted on this data).
real_test_prepared = full_pipeline.transform(real_test_data)

In [24]:
# Linear-regression predictions for every row of the held-out set.
full_prediction = LR_model.predict(real_test_prepared)

full_prediction

array([307632.80205984,  67611.8985497 , 247978.14574414, ...,
       168513.66979661, 192401.46975119, 258855.47014557])

In [25]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the held-out targets and the current predictions.
mae = mean_absolute_error(y_true=real_test_label, y_pred=full_prediction)
print("MAE: ", mae)

MAE:  49810.06572460955


In [26]:
from sklearn.metrics import mean_squared_error
# mean_squared_error gives the MSE; its square root is the RMSE, which is the
# quantity reported here (same units as the target), so it is labelled as such.
mse = mean_squared_error(real_test_label, full_prediction)
rmse = np.sqrt(mse)
print('RMSE: ', rmse)

MSE:  69116.59360540153


In [27]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest model trained on the same prepared training data.
# A fixed random_state makes the fitted forest, and the score reported for it,
# reproducible across runs; 35 is the seed already used for the split.
RF_model = RandomForestRegressor(random_state=35)
RF_model.fit(housing_prapared, housing_label)

In [28]:
# Random-forest predictions for the held-out set.
# Note: this rebinds `full_prediction`, replacing the linear-regression output.
full_prediction = RF_model.predict(real_test_prepared)

full_prediction

array([315297.02, 117010.  , 232857.  , ..., 102231.  , 193690.  ,
       219881.  ])

In [29]:
from sklearn.metrics import mean_squared_error
# The square root of the mean squared error is the RMSE, so store and label it
# as RMSE rather than MSE.
mse = mean_squared_error(real_test_label, full_prediction)
rmse = np.sqrt(mse)
print('RMSE: ', rmse)

MSE:  50883.07969004928


The final result from RF_model is better (lower test-set error) than the result we get from LR_model.

Cross Validation

In [36]:
from sklearn.model_selection import cross_val_score
# Predictors and target taken from the complete dataset.
X = df.drop(columns='median_house_value')
Y = df['median_house_value'].copy()

# Reuse the already-fitted preprocessing (transform only, no refit).
# NOTE(review): df also contains the rows of test_set, and full_pipeline was
# fitted on the training split only — confirm this is intended for the
# cross-validation that follows.
X_prepared = full_pipeline.transform(X)

X_prepared

array([[-1.32364787,  1.05137676,  0.98568249, ...,  0.        ,
         1.        ,  0.        ],
       [-1.31866648,  1.04200963, -0.6017385 , ...,  0.        ,
         1.        ,  0.        ],
       [-1.32862927,  1.03732606,  1.85876404, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.82052723,  1.7773298 , -0.9192227 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.87034116,  1.7773298 , -0.83985165, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83049002,  1.74922839, -0.99859375, ...,  0.        ,
         0.        ,  0.        ]])

In [37]:
# 5-fold cross-validation of the linear model. The scorer returns the negated
# MSE for each fold, hence the negative values.
scores = cross_val_score(
    estimator=LR_model,
    X=X_prepared,
    y=Y,
    scoring='neg_mean_squared_error',
    cv=5,
)

scores

array([-5.38604201e+09, -5.59605769e+09, -5.68952025e+09, -5.86788727e+09,
       -4.38202267e+09])

In [39]:
# Flip the sign back to positive MSE and take the square root; despite the
# variable name, the resulting per-fold values are RMSE.
lr_mse_score = np.sqrt(-scores)

lr_mse_score

array([73389.65874804, 74806.80242451, 75428.90861012, 76602.13620889,
       66196.84790983])

Cross-validation gives us a more reliable estimate of the error we get from our model.

Saving as file and reading from file

In [43]:
import pickle
# Serialise the fitted linear model to disk with pickle.
# Only the model is written; the preprocessing pipeline is not part of this file.
file_name = 'LR_model.pkl'
with open(file_name, mode='wb') as f:
  pickle.dump(LR_model, f)


In [44]:
import joblib
# Serialise the fitted random forest to disk with joblib.
file_name_rf = 'RF_model.jbl'
joblib.dump(value=RF_model, filename=file_name_rf)

['RF_model.jbl']

In [45]:
# Read the linear model back from the pickle file written above.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open(file_name, mode='rb') as f:
  model1 = pickle.load(f)

In [46]:
# Read the random forest back from the joblib file written above.
model2 = joblib.load(filename=file_name_rf)