<a href="https://colab.research.google.com/github/kundanpanday/PRODIGY_ML_01/blob/main/House_Price_Prediction_using_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# --- 1. Generate a Synthetic Dataset ---
# Simulated houses: three features (square footage, bedrooms, bathrooms) and a
# price computed from a known linear formula plus Gaussian noise.

np.random.seed(42)  # fixed seed -> the same dataset on every run

n_samples = 1000  # number of simulated houses

# Feature draws. The order of the RNG calls determines the generated values,
# so it stays: square footage, bedrooms, bathrooms, then (below) the noise.
sq_footage = np.random.normal(loc=2000, scale=500, size=n_samples).astype(int)
bedrooms = np.random.randint(low=1, high=6, size=n_samples)   # integers 1..5
bathrooms = np.random.randint(low=1, high=4, size=n_samples)  # integers 1..3

# Parameters of the price formula: one weight per feature plus a base price.
price_coeff_sq_footage = 150    # price per square foot
price_coeff_bedrooms = 20000    # price per bedroom
price_coeff_bathrooms = 15000   # price per bathroom
intercept = 50000               # base price
noise = np.random.normal(loc=0, scale=30000, size=n_samples)  # per-house random deviation

# Deterministic (integer) part of the price first, then the noise on top;
# the final cast drops the fractional part.
linear_part = (intercept
               + price_coeff_sq_footage * sq_footage
               + price_coeff_bedrooms * bedrooms
               + price_coeff_bathrooms * bathrooms)
prices = (linear_part + noise).astype(int)

# Any price that came out negative is replaced by 50000.
prices = np.where(prices < 0, 50000, prices)

# Assemble everything into a single table, one row per house.
data = pd.DataFrame(dict(
    SquareFootage=sq_footage,
    Bedrooms=bedrooms,
    Bathrooms=bathrooms,
    Price=prices,
))

print("--- Synthetic Dataset Head ---")
print(data.head())
print("\n--- Synthetic Dataset Description ---")
print(data.describe())

# --- 2. Prepare the Data (Features X and Target y) ---
# Split the table into the model inputs (X) and the value to predict (y).

feature_columns = ['SquareFootage', 'Bedrooms', 'Bathrooms']
X = data.loc[:, feature_columns]  # independent variables
y = data.loc[:, 'Price']          # dependent variable (house price)

print("\n--- Features (X) Head ---")
print(X.head())
print("\n--- Target (y) Head ---")
print(y.head())

# --- 3. Split the Data into Training and Testing Sets ---
# Hold out part of the data so the model is scored on rows it never saw
# during fitting: 20% goes to the test set, and the fixed random_state makes
# the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# --- 4. Implement and Train the Linear Regression Model ---
# Build a scikit-learn LinearRegression and fit it on the training split.

model = LinearRegression().fit(X_train, y_train)  # fit() hands back the fitted estimator

print("\n--- Model Training Complete ---")
# Learned base price (prediction when every feature is zero).
fitted_intercept = model.intercept_
print(f"Model Intercept: ${fitted_intercept:.2f}")
# Learned per-unit price contribution of each feature, in column order.
fitted_coefficients = model.coef_
print(f"Model Coefficients (Square Footage, Bedrooms, Bathrooms): {fitted_coefficients}")

# --- 5. Evaluate the Model ---
# Score the fitted model on the held-out test split with MAE and R-squared.

y_pred = model.predict(X_test)  # predictions for the unseen test rows

# MAE: mean of |actual - predicted|.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
# R2: share of the target's variance that the predictions account for.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("\n--- Model Evaluation on Test Set ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")  # smaller is better
print(f"R-squared (R2) Score: {r2:.4f}")  # closer to 1 is better

# --- 6. Function for User Prediction ---
# This function allows you to input new house details and get a price prediction.

def predict_house_price(sq_ft, num_bedrooms, num_bathrooms, *, estimator=None):
    """
    Predict the price of a house from its square footage, bedrooms, and bathrooms.

    Args:
        sq_ft (int): The square footage of the house.
        num_bedrooms (int): The number of bedrooms.
        num_bathrooms (int): The number of bathrooms.
        estimator: Optional fitted regressor exposing ``predict``. When omitted
            (the default), the module-level ``model`` is used, so existing
            three-argument calls behave as before.

    Returns:
        float: The predicted price of the house.
    """
    if estimator is None:
        # Default path: use the model trained earlier in this file.
        estimator = model

    # One-row DataFrame whose column names match the ones used for training,
    # so the features are passed under the same labels and in the same order.
    input_data = pd.DataFrame(
        [[sq_ft, num_bedrooms, num_bathrooms]],
        columns=['SquareFootage', 'Bedrooms', 'Bathrooms'],
    )

    # predict() returns one value per input row; take the single result and
    # convert it to a built-in float, as the docstring promises.
    return float(estimator.predict(input_data)[0])

# --- Example Usage of the Prediction Function ---
# Three hardcoded sample houses are priced below; no interactive `input()`
# prompts are used.

print("\n--- Example House Price Predictions (Non-Interactive) ---")

# Example 1: a moderate-sized house
sq_ft_1 = 2000
bedrooms_1 = 3
bathrooms_1 = 2
predicted_price_1 = predict_house_price(sq_ft_1, bedrooms_1, bathrooms_1)
print(
    f"House 1 (SqFt: {sq_ft_1}, Beds: {bedrooms_1}, Baths: {bathrooms_1}): "
    f"${predicted_price_1:,.2f}"
)

# Example 2: a larger house with more bedrooms and bathrooms
sq_ft_2 = 2800
bedrooms_2 = 4
bathrooms_2 = 3
predicted_price_2 = predict_house_price(sq_ft_2, bedrooms_2, bathrooms_2)
print(
    f"House 2 (SqFt: {sq_ft_2}, Beds: {bedrooms_2}, Baths: {bathrooms_2}): "
    f"${predicted_price_2:,.2f}"
)

# Example 3: a smaller, simpler house
sq_ft_3 = 1500
bedrooms_3 = 2
bathrooms_3 = 1
predicted_price_3 = predict_house_price(sq_ft_3, bedrooms_3, bathrooms_3)
print(
    f"House 3 (SqFt: {sq_ft_3}, Beds: {bedrooms_3}, Baths: {bathrooms_3}): "
    f"${predicted_price_3:,.2f}"
)

# To price your own house, call `predict_house_price()` like this:
# my_sq_ft = 2100
# my_bedrooms = 3
# my_bathrooms = 2
# my_predicted_price = predict_house_price(my_sq_ft, my_bedrooms, my_bathrooms)
# print(f"\nYour predicted house price: ${my_predicted_price:,.2f}")

--- Synthetic Dataset Head ---
   SquareFootage  Bedrooms  Bathrooms   Price
0           2248         4          3  509177
1           1930         1          3  388183
2           2323         3          1  504517
3           2761         5          3  585579
4           1882         3          3  450968

--- Synthetic Dataset Description ---
       SquareFootage     Bedrooms    Bathrooms          Price
count    1000.000000  1000.000000  1000.000000    1000.000000
mean     2009.162000     3.045000     1.985000  442929.552000
std       489.602333     1.424431     0.835148   86185.759446
min       379.000000     1.000000     1.000000  132252.000000
25%      1675.500000     2.000000     1.000000  380772.500000
50%      2012.500000     3.000000     2.000000  446277.500000
75%      2323.250000     4.000000     3.000000  501113.000000
max      3926.000000     5.000000     3.000000  765445.000000

--- Features (X) Head ---
   SquareFootage  Bedrooms  Bathrooms
0           2248         4     