# ***Hi, it's Laizer, and we are building a linear regression model to predict house prices from California data. This is the first model I am building.***

# Getting data from Kaggle

In [29]:
import kagglehub

# Download the latest version of the "harrywang/housing" dataset.
# `path` holds the location of the downloaded dataset files.
path = kagglehub.dataset_download("harrywang/housing")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/housing


# Step 1: Import Libraries

🔍 Explanation:
Each of these libraries has a specific job:

pandas → To load and handle tabular data.

train_test_split → To split the data into training and test sets so we can evaluate the model fairly.

LinearRegression → The actual linear regression model.

Pipeline → Helps link multiple preprocessing and modeling steps into one clean object.

ColumnTransformer → Allows you to apply different preprocessing to different columns (e.g., numeric vs. categorical).

SimpleImputer → Fills in missing data (e.g., replacing missing values with the median).

OneHotEncoder → Converts categorical variables into numeric form using one-hot encoding.

mean_squared_error, r2_score → Metrics to evaluate model performance.

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


# Step 2: Load the Data

🔍 Explanation:
`data` is a DataFrame (often abbreviated `df`), which is a 2D table (like an Excel sheet or an SQL table).

pd.read_csv() reads a CSV (comma-separated values) file into that table.

"housing.csv" is the name of the dataset file you're working with.

In [31]:
# Load the housing table into a DataFrame.
# The manually uploaded Colab copy is tried first so existing runs behave
# exactly as before. On a fresh runtime where that file is absent, fall back
# to the directory returned by kagglehub.dataset_download() instead of crashing.
try:
    data = pd.read_csv("/content/sample_data/housing.csv")
except FileNotFoundError:
    # NOTE(review): assumes the Kaggle dataset ships the file as
    # "housing.csv" directly inside `path` — confirm against the download.
    data = pd.read_csv(f"{path}/housing.csv")

In [49]:
# Print each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [42]:
# (number of rows, number of columns)
data.shape

(20640, 10)

In [46]:
# Preview the first five rows of the table.
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


# Step 3: Split Features and Target

Explanation:
This is where we tell the model what to learn from and what to predict.

X = features → everything the model will use to make predictions (input).

y = target → what we're trying to predict (output), which in this case is median_house_value.

🔧 What’s Happening:
data.drop("median_house_value", axis=1) → drops the target column from the DataFrame, leaving all the input features.

data["median_house_value"] → selects just the target column.

In [33]:
# Target: the single column the model learns to predict.
y = data["median_house_value"]
# Features: every remaining column (axis=1 drops a column rather than a row).
X = data.drop("median_house_value", axis=1)

# Step 4: Identify Numerical and Categorical Features

🔍 Explanation:
Here, we're telling Python:

"Please separate columns that are numbers from columns that are categories (like text labels)."

🛠 What's Happening:
select_dtypes(include=["int64", "float64"])
→ Selects columns where values are numeric (like prices, age, counts).

select_dtypes(include=["object"])
→ Selects text/categorical columns (like "ocean_proximity").

.columns.tolist()
→ Converts column names to a nice Python list.

In [50]:
# Names of the numeric feature columns (64-bit ints and floats).
num_features = (
    X.select_dtypes(include=["int64", "float64"])
    .columns
    .tolist()
)
# Names of the text/categorical feature columns (object dtype).
cat_features = (
    X.select_dtypes(include=["object"])
    .columns
    .tolist()
)

# Step 5: Preprocessing Pipelines

# Build the pipelines:

🔍 What’s Going On:
🛠 SimpleImputer(strategy="median")
Fills in any missing numbers with the median value of that column.

Median is safer than mean when outliers exist (e.g., 99999 bedrooms).

🛠 OneHotEncoder(handle_unknown="ignore")
Converts categories (like "NEAR BAY", "INLAND") into binary columns.

handle_unknown="ignore" prevents errors if new categories appear during prediction.

🛠 ColumnTransformer
Tells the model:

“Apply the numeric pipeline to numeric columns, and the categorical pipeline to the categorical ones.”

In [35]:
# Numeric columns: replace each missing value with that column's median.
num_pipeline = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical columns: one-hot encode; categories unseen during fitting are
# ignored instead of raising an error.
cat_pipeline = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each group of columns through its own pipeline and combine the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)



# Step 6: Create the Full Pipeline

In [36]:
# Chain preprocessing and the estimator into a single object, so one
# fit()/predict() call runs both stages.
model_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

# Step 7: Train/Test Split

In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Step 8: Train the Model

In [48]:
# Fit the preprocessing steps and the regressor on the training split only.
model_pipeline.fit(X_train, y_train)

# Step 9: Evaluate the Model

In [51]:
# Predict house values for the held-out test split.
y_pred = model_pipeline.predict(X_test)
# Bare last expression so the notebook displays the predictions.
y_pred

array([ 54261.02768978, 124430.91772798, 255694.95828245, ...,
       439180.98341183, 120797.55240622, 183386.04993586])

In [52]:
# Predict on the held-out test split, then score the predictions against the
# true values.
y_pred = model_pipeline.predict(X_test)

# R^2: share of the target's variance explained by the model.
r2 = r2_score(y_test, y_pred)
# MSE: average squared prediction error, in squared units of the target.
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 4908290571.346422
R^2 Score: 0.6254382675296274
