In [2]:
import pandas as pd
import numpy as np
import sklearn 


In [3]:
# Online datasets joylashgan manzilni korsatamiz
url = 'https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true'
df = pd.read_csv(url)


In [4]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=45)

X_train = train_set.drop('median_house_value', axis=1)
y = train_set['median_house_value'].copy()

X_num = X_train.drop('ocean_proximity', axis=1)

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# bizga kerak ustunlar indekslari
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # bizni funksiyamiz faqat transformer. estimator emas
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room: # add_bedrooms_per_room ustuni ixtiyoriy bo'ladi
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

num_pipeline.fit_transform(X_num)

array([[ 0.94382381, -0.70482757, -1.08618132, ...,  0.48780807,
        -0.05917339, -0.89171157],
       [-0.96871548,  1.35954492,  0.66045582, ...,  0.04334566,
        -0.07066601, -0.40625808],
       [-0.84886968,  1.23783588, -0.68921833, ...,  0.2211216 ,
         0.01691036, -0.6163858 ],
       ...,
       [-0.8838247 ,  1.41571832, -0.37164794, ..., -0.21236402,
        -0.09138373, -0.0657545 ],
       [ 0.88889449, -0.72823316, -0.84800353, ..., -0.27762707,
         0.06623893, -0.00187635],
       [ 0.72410651, -0.66737864,  0.58106323, ...,  0.6187937 ,
        -0.0326693 , -1.03207012]])

In [7]:
from sklearn.compose import  ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [8]:
X_prepared = full_pipeline.fit_transform(X_train)

In [9]:
X_prepared

array([[ 0.94382381, -0.70482757, -1.08618132, ...,  0.        ,
         0.        ,  0.        ],
       [-0.96871548,  1.35954492,  0.66045582, ...,  0.        ,
         0.        ,  0.        ],
       [-0.84886968,  1.23783588, -0.68921833, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.8838247 ,  1.41571832, -0.37164794, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.88889449, -0.72823316, -0.84800353, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.72410651, -0.66737864,  0.58106323, ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

LR_model.fit(X_prepared, y)

In [11]:
test_data = X_train.sample(10)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
6090,-117.87,34.1,15.0,6409.0,1363.0,3359.0,1267.0,3.875,<1H OCEAN
9902,-122.27,38.28,37.0,1170.0,303.0,766.0,302.0,2.6618,NEAR BAY
2566,-124.16,40.78,46.0,1975.0,346.0,791.0,349.0,3.8,NEAR OCEAN
7789,-118.07,33.89,29.0,1138.0,217.0,964.0,222.0,4.537,<1H OCEAN
6390,-118.05,34.15,32.0,5131.0,665.0,1877.0,622.0,8.2004,INLAND
17792,-121.82,37.36,33.0,1624.0,337.0,1412.0,323.0,4.0385,<1H OCEAN
10832,-117.93,33.65,34.0,2141.0,425.0,1559.0,429.0,4.2036,<1H OCEAN
13213,-117.71,34.04,17.0,4098.0,733.0,1859.0,713.0,2.9811,INLAND
6858,-118.14,34.05,25.0,5478.0,1136.0,3062.0,1096.0,3.4118,<1H OCEAN
14659,-117.13,32.81,19.0,2157.0,554.0,1349.0,535.0,2.7652,NEAR OCEAN


In [12]:
test_label = y.loc[test_data.index]
test_label

6090     173300.0
9902     136200.0
2566      81800.0
7789     185300.0
6390     500001.0
17792    167600.0
10832    220100.0
13213    231800.0
6858     341100.0
14659    177400.0
Name: median_house_value, dtype: float64

In [13]:
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[ 0.84395231, -0.71418981, -1.08618132,  1.72101111,  1.96644953,
         1.74676986,  2.01222088,  0.0049147 , -0.14546867, -0.03806168,
        -0.02021117,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.35322077,  1.242517  ,  0.66045582, -0.6673608 , -0.55753307,
        -0.5918445 , -0.51646489, -0.63192317, -0.6065255 , -0.04857244,
         0.69882388,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-2.29700648,  2.41279619,  1.3749892 , -0.30037485, -0.4551451 ,
        -0.56929712, -0.3933061 , -0.0344546 ,  0.08837034, -0.07330523,
        -0.60221064,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.74408081, -0.81249326,  0.02531505, -0.68194906, -0.76230902,
        -0.41326924, -0.72609687,  0.35241443, -0.11910312,  0.11688673,
        -0.36159054,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.75406796, -0.69078422,  0

In [14]:
pridect_label = LR_model.predict(test_data_prepared)

In [15]:
pridect_label

array([220906.16375101, 184420.99410733, 229789.05430961, 225784.35629695,
       341364.39754211, 218971.03823908, 223951.10273392, 131036.11746586,
       208965.92691711, 188329.97252928])

In [16]:
test_label

6090     173300.0
9902     136200.0
2566      81800.0
7789     185300.0
6390     500001.0
17792    167600.0
10832    220100.0
13213    231800.0
6858     341100.0
14659    177400.0
Name: median_house_value, dtype: float64

In [17]:
pd.DataFrame({'Bashorat':pridect_label, 'Real bahosi': test_label})

Unnamed: 0,Bashorat,Real bahosi
6090,220906.163751,173300.0
9902,184420.994107,136200.0
2566,229789.05431,81800.0
7789,225784.356297,185300.0
6390,341364.397542,500001.0
17792,218971.038239,167600.0
10832,223951.102734,220100.0
13213,131036.117466,231800.0
6858,208965.926917,341100.0
14659,188329.972529,177400.0


# Modelni baholash

In [18]:
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
3046,-119.28,35.50,34.0,1923.0,379.0,1101.0,351.0,2.4044,65800.0,INLAND
6803,-118.13,34.08,40.0,1931.0,449.0,1367.0,446.0,2.5750,228400.0,<1H OCEAN
10112,-117.94,33.94,30.0,1596.0,307.0,845.0,309.0,4.5096,241100.0,<1H OCEAN
3126,-117.79,35.21,4.0,2.0,2.0,6.0,2.0,2.3750,137500.0,INLAND
5008,-118.33,34.01,47.0,1320.0,259.0,653.0,291.0,3.7727,193000.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
15021,-117.03,32.79,17.0,7352.0,1699.0,3331.0,1634.0,2.7006,166300.0,<1H OCEAN
6658,-118.12,34.15,22.0,1671.0,480.0,1005.0,443.0,3.0119,171400.0,<1H OCEAN
18945,-122.03,38.28,15.0,5114.0,833.0,2418.0,778.0,4.4882,144000.0,INLAND
2499,-120.39,36.78,11.0,1947.0,488.0,2104.0,486.0,1.7184,55200.0,INLAND


In [19]:
X_test = test_set.drop('median_house_value', axis=1)
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
3046,-119.28,35.50,34.0,1923.0,379.0,1101.0,351.0,2.4044,INLAND
6803,-118.13,34.08,40.0,1931.0,449.0,1367.0,446.0,2.5750,<1H OCEAN
10112,-117.94,33.94,30.0,1596.0,307.0,845.0,309.0,4.5096,<1H OCEAN
3126,-117.79,35.21,4.0,2.0,2.0,6.0,2.0,2.3750,INLAND
5008,-118.33,34.01,47.0,1320.0,259.0,653.0,291.0,3.7727,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15021,-117.03,32.79,17.0,7352.0,1699.0,3331.0,1634.0,2.7006,<1H OCEAN
6658,-118.12,34.15,22.0,1671.0,480.0,1005.0,443.0,3.0119,<1H OCEAN
18945,-122.03,38.28,15.0,5114.0,833.0,2418.0,778.0,4.4882,INLAND
2499,-120.39,36.78,11.0,1947.0,488.0,2104.0,486.0,1.7184,INLAND


In [20]:
y_test = test_set['median_house_value'].copy()
y_test

3046      65800.0
6803     228400.0
10112    241100.0
3126     137500.0
5008     193000.0
           ...   
15021    166300.0
6658     171400.0
18945    144000.0
2499      55200.0
19408    170200.0
Name: median_house_value, Length: 4128, dtype: float64

In [21]:
X_test_preparad = full_pipeline.transform(X_test)

In [22]:
y_pridect = LR_model.predict(X_test_preparad)

In [23]:
y_pridect

array([115267.59726087, 175973.82136351, 239641.41285717, ...,
       178464.64496685,  38381.87901367,  97809.6576567 ])

In [24]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pridect)

print('MAE', mae)

MAE 49894.378963147305


In [25]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pridect)

print('RMSE', np.sqrt(mse))

RMSE 69964.42757249424


# Random forest

In [26]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()

RF_model.fit(X_prepared, y)

In [27]:
y_rf_pridect = RF_model.predict(X_test_preparad)

In [28]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_rf_pridect)

print('RMSE', np.sqrt(mse))

RMSE 48492.84483401892


# Cross Validations

In [29]:
X = df.drop('median_house_value', axis=1)
y = df['median_house_value'].copy()

X_prepared = full_pipeline.transform(X)


In [30]:
from sklearn.model_selection import cross_val_score

mse_scores = cross_val_score(LR_model, X_prepared, y, scoring='neg_mean_squared_error', cv=5)

In [31]:
def display_scores(scores):
  print('Scores', scores)
  print('Mean', scores.mean())
  print('Std.dev', scores.std())

In [32]:
display_scores(np.sqrt(-mse_scores))

Scores [73391.42036892 74809.28332317 75429.91837496 76604.35506436
 66196.72436926]
Mean 73286.34030013604
Std.dev 3693.161254481327


In [33]:
scores = cross_val_score(RF_model, X_prepared, y, scoring='neg_mean_squared_error', cv=10)
RF_rmse_scores = np.sqrt(-scores)
display_scores(RF_rmse_scores)

Scores [96919.95718187 47165.98294631 65263.42745586 56359.07559544
 60432.57206007 59876.51853613 46719.20542726 79388.93966461
 73984.49576001 49408.30109524]
Mean 63551.84757228043
Std.dev 15175.183948776186


# pickle

In [35]:
import pickle

# faylga yozish 
filename = 'RF_model.pkl' # faylga istalgan nom beramiz
with open(filename, 'wb') as file:
  pickle.dump(RF_model, file)

In [36]:
# fayldan o`qish 
with open(filename, 'rb') as file:
  model = pickle.load(file)

# joblib

In [37]:
import joblib

# faylga yozish
filename = 'LR_model' # faylga ihtiyoriy nom beramiz
joblib.dump(LR_model, filename)

['LR_model']

In [39]:
# fayldan o`qish
model = joblib.load(filename)

In [40]:
scores = cross_val_score(model, X_prepared, y, scoring='neg_mean_squared_error', cv=10)
RF_rmse_scores = np.sqrt(-scores)
display_scores(RF_rmse_scores)

Scores [84183.66301514 61191.52853899 86743.60959739 62286.73445075
 80537.25795828 68918.58661112 52504.86407192 90904.22793667
 77675.08903006 53940.95369716]
Mean 71888.65149074615
Std.dev 13247.671855830777
