In [1]:
import os
import urllib
import tarfile
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Define constants
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Function to fetch data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
fetch_housing_data()

# Load data
def load_csv(housing=HOUSING_PATH):
    csv_file = os.path.join(housing, "housing.csv")
    return pd.read_csv(csv_file)

housing = load_csv()

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
from sklearn.model_selection import train_test_split
x=housing.drop('median_house_value',axis=1)
y=housing["median_house_value"]

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [5]:
x_test.shape

(4128, 9)

In [6]:
x_train.shape

(16512, 9)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer

cat_pipeline=Pipeline([
    ("impute",SimpleImputer(strategy="most_frequent")),
    ("convert",OneHotEncoder(sparse_output=False)),
    ("scaler",StandardScaler())
    
])
num_pipeline=Pipeline([
    ("imputer",SimpleImputer(strategy="mean")),
    ("scaler",StandardScaler())
])

In [8]:
from sklearn.compose import ColumnTransformer
cat_columns=["ocean_proximity"]
num_columns=list(x_train.drop('ocean_proximity',axis=1))

process=ColumnTransformer([
    ("cat",cat_pipeline,cat_columns),
    ("num",num_pipeline,num_columns)
])

In [9]:
from sklearn.linear_model import LinearRegression
full_pipeline=Pipeline([
    ("process",process),
    ("model",LinearRegression())
])

In [10]:
full_pipeline.fit(x_train,y_train)

In [11]:
y_pre=full_pipeline.predict(x_test)

In [12]:
from sklearn.metrics import r2_score
r2_score=r2_score(y_pre,y_test)

In [13]:
r2_score

0.44107906184221557