In [54]:
import os
import urllib
import tarfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Define constants
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Function to fetch data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tarball to download.
        housing_path: Directory that receives both the downloaded
            ``housing.tgz`` and its extracted members; created if missing.

    Raises:
        urllib.error.URLError: If the archive cannot be downloaded.
        tarfile.TarError: If the archive cannot be read or, when extraction
            filters are available, contains a member the "data" filter
            rejects (e.g. a path that escapes ``housing_path``).
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on some other package having
    # imported it as a side effect.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The archive comes from the network: use the "data" extraction filter
    # where this Python provides it so members cannot be written outside
    # ``housing_path``. Older interpreters keep the previous behaviour.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path, **extract_kwargs)

# Download and unpack the dataset using the default URL and target directory.
# NOTE: this hits the network every time the cell is executed.
fetch_housing_data()

# Load data
def load_csv(housing=HOUSING_PATH):
    """Read ``housing.csv`` from the given directory.

    Args:
        housing: Directory that contains ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing, "housing.csv"))

# Read the extracted CSV into a DataFrame.
housing = load_csv()

# Separate the regression target from the predictor columns.
y = housing["median_house_value"]
x = housing.drop(columns=["median_house_value"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Define preprocessing pipelines
# Categorical branch: fill gaps with the most frequent value, one-hot encode,
# then standardise the resulting indicator columns.
cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": a category that never appeared in the training
    # split is encoded as all zeros instead of raising at predict time.
    ("convert", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ("scaler", StandardScaler())
])

# Numerical branch: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# List of categorical and numerical columns
cat_columns = ["ocean_proximity"]
# Every remaining feature is routed to the numerical branch, in its original
# order; deriving the list from cat_columns keeps the two lists in sync.
# NOTE(review): assumes all non-categorical columns are numeric - confirm.
num_columns = [column for column in x_train.columns if column not in cat_columns]

# Combine preprocessing pipelines into a single column transformer
process = ColumnTransformer([
    ("cat", cat_pipeline, cat_columns),
    ("num", num_pipeline, num_columns)
])

# Chain preprocessing and the regressor so that the preprocessing statistics
# are learned from the training split only.
full_pipeline = Pipeline(
    steps=[
        ("process", process),
        ("model", LinearRegression()),
    ]
)
full_pipeline.fit(x_train, y_train)

# Predict on the held-out split and report the mean squared error
# (expressed in squared units of the target column).
y_predict = full_pipeline.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 4902581952.148009


In [55]:
from sklearn.metrics import r2_score

# Coefficient of determination of the held-out predictions; left as the last
# expression so the notebook displays it as the cell output.
r2_score(y_true=y_test, y_pred=y_predict)

0.6258739039830709