<a href="https://colab.research.google.com/github/khamidjonov-ds-da/California-houses-prediction/blob/main/California.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import essential libraries
import pandas as pd
import numpy as np
import sklearn

# Remote CSV holding the housing data (fetched over HTTP on every run)
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Separate the target column from the predictors
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns=["median_house_value"])

# Predictors without 'ocean_proximity'; its column labels feed the numeric pipeline later
X_num = X_train.drop(columns=["ocean_proximity"])

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# 0-based column positions that CombinedAttributesAdder.transform reads
# from the array it receives (rooms, bedrooms, population, households)
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D feature array.

    The source columns are located through the module-level positions
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a bedrooms/rooms column is appended in addition to the
        two per-household columns.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; no state is learned from ``X``."""
        return self  # this class only transforms; it is not an estimator

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like with two dimensions
            Must contain the columns addressed by the module-level indexes.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by rooms-per-household, population-per-household
            and, if ``add_bedrooms_per_room`` is true, bedrooms-per-room.
        """
        # The positional slicing below (X[:, i]) only works on an ndarray;
        # a DataFrame or nested list would raise. asarray leaves an existing
        # ndarray untouched, so array callers see no change.
        X = np.asarray(X)

        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households

        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / rooms
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing chain; the steps run in the order listed
num_pipeline = Pipeline(steps=[
    # replace missing entries with the column median
    ('imputer', SimpleImputer(strategy='median')),
    # append the ratio columns (including bedrooms per room)
    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    # standardise every resulting column
    ('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer

# Column labels routed to each branch of the ColumnTransformer:
# list(X_num) yields the column labels of X_num; 'ocean_proximity' is
# the single column sent to the one-hot encoder.
num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    # handle_unknown='ignore': a category that did not occur in the data the
    # encoder was fitted on is encoded as all zeros instead of making
    # transform() raise. Output is unchanged when every category was seen.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])

In [5]:
# Fit every preprocessing step on the training features and transform them in one pass
X_prepared = full_pipeline.fit_transform(X=X_train)

In [6]:
# Fit a linear regression and spot-check it on rows drawn from the training set
from sklearn.linear_model import LinearRegression
LR_model = LinearRegression()

# Training model
LR_model.fit(X_prepared, y)

# NOTE: despite the "test_" prefix these rows come from X_train, so this is a
# sanity check of the fitted model, not an estimate of generalisation error.
# random_state pins the sample so the comparison table is the same on every run.
test_data = X_train.sample(10, random_state=42)
test_label = y.loc[test_data.index]

# Reuse the already-fitted pipeline (transform only, no refit)
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels = LR_model.predict(test_data_prepared)

Predictions and actual values for the 10 rows sampled from the training set

In [10]:
# Side-by-side table; the row labels come from test_label's index
pd.DataFrame(
    data={
        'Predictions': predicted_labels,
        'Actual values': test_label,
    }
)

Unnamed: 0,Predictions,Actual values
548,252666.650355,234500.0
18778,66039.822028,98700.0
13791,121915.985901,119600.0
3297,93230.548312,60000.0
13787,143662.194914,40400.0
9541,88408.925143,81300.0
3464,184060.320082,158500.0
17567,223506.88774,216300.0
10907,126414.942929,171400.0
10133,254917.243875,221700.0


In [8]:
# Split the held-out rows into target and predictors, mirroring the training split
y_test = test_set['median_house_value'].copy()
X_test = test_set.drop(columns=['median_house_value'])

In [9]:
# Apply the already-fitted preprocessing to the held-out features (no refit)
X_test_prepared = full_pipeline.transform(X=X_test)
# Predict the target for every held-out row
test_prediction = LR_model.predict(X=X_test_prepared)

test_prediction

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])