In [2]:
import pandas as pd
import numpy as np

# Load the DataFrame from CSV
df = pd.read_csv('kc_house_data.csv')  # Replace 'your_dataset.csv' with the actual filename

# Define functions for linear regression, RSS, and prediction
def simple_linear_regression(input_feature, output):
    input_array = np.array(input_feature)
    output_array = np.array(output)
    
    # Calculate necessary sums
    n = len(input_array)
    sum_x = np.sum(input_array)
    sum_y = np.sum(output_array)
    sum_xy = np.sum(input_array * output_array)
    sum_x_squared = np.sum(input_array**2)
    
    # Calculate slope (beta1) using closed-form solution
    slope = (sum_xy - (sum_x * sum_y) / n) / (sum_x_squared - (sum_x**2) / n)
    
    # Calculate intercept (beta0)
    intercept = (sum_y - slope * sum_x) / n
    
    return intercept, slope
    

def calculate_rss(input_feature, output, slope, intercept):
    input_array = np.array(input_feature)
    output_array = np.array(output)
    predicted_output = intercept + slope * input_array
    residuals = output_array - predicted_output
    rss = np.sum(residuals**2)
    return rss# ... (same function as earlier)
    

def predict_output(input_feature, slope, intercept):
    input_array = np.array(input_feature)
    predicted_output = intercept + slope * input_array
    return predicted_output


# Assuming you have columns named 'sqft_living', 'bedrooms', and 'price' in your DataFrame
# Split data into train and test sets
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

# Train the model on the training data
slope_sqft, intercept_sqft = simple_linear_regression(train_data['sqft_living'], train_data['price'])
slope_bedrooms, intercept_bedrooms = simple_linear_regression(train_data['bedrooms'], train_data['price'])

# Calculate RSS on test data
rss_sqft = calculate_rss(test_data['sqft_living'], test_data['price'], slope_sqft, intercept_sqft)
rss_bedrooms = calculate_rss(test_data['bedrooms'], test_data['price'], slope_bedrooms, intercept_bedrooms)

# Compare and print the results
if rss_sqft < rss_bedrooms:
    print("Model using square feet has the lowest RSS on test data.")
else:
    print("Model using bedrooms has the lowest RSS on test data.")

# Predict the price for a house with 2650 sqft
sqft_to_predict = 2650
predicted_price_sqft = predict_output(sqft_to_predict, slope_sqft, intercept_sqft)
print("Predicted Price for a House with 2650 sqft (using sqft model):", predicted_price_sqft)

# Predict the price for a house with certain number of bedrooms
bedrooms_to_predict = 3  # Specify the number of bedrooms you want to predict
predicted_price_bedrooms = predict_output(bedrooms_to_predict, slope_bedrooms, intercept_bedrooms)
print("Predicted Price for a House with", bedrooms_to_predict, "bedrooms (using bedrooms model):", predicted_price_bedrooms)

Model using bedrooms has the lowest RSS on test data.
Predicted Price for a House with 2650 sqft (using sqft model): -141702015.55228308
Predicted Price for a House with 3 bedrooms (using bedrooms model): 516673.5508258107


In [7]:
slope_sqft, intercept_sqft = simple_linear_regression(train_data['sqft_living'], train_data['price'])
slope_bedrooms, intercept_bedrooms = simple_linear_regression(train_data['bedrooms'], train_data['price'])

# Print the calculated slopes and intercepts
print("Slope for square feet model:", slope_sqft)
print("Intercept for square feet model:", intercept_sqft)
print("Slope for bedrooms model:", slope_bedrooms)
print("Intercept for bedrooms model:", intercept_bedrooms)

# Calculate RSS on test data
rss_sqft = calculate_rss(test_data['sqft_living'], test_data['price'], slope_sqft, intercept_sqft)
rss_bedrooms = calculate_rss(test_data['bedrooms'], test_data['price'], slope_bedrooms, intercept_bedrooms)

# Compare and print the results
if rss_sqft < rss_bedrooms:
    print("Model using square feet has the lowest RSS on test data.")
else:
    print("Model using bedrooms has the lowest RSS on test data.")

# Predict the price for a house with 2650 sqft
sqft_to_predict = 2650
predicted_price_sqft = predict_output(sqft_to_predict, slope_sqft, intercept_sqft)
print("Predicted Price for a House with 2650 sqft (using sqft model):", predicted_price_sqft)

# Predict the price for a house with a certain number of bedrooms
bedrooms_to_predict = 3  # Specify the number of bedrooms you want to predict
predicted_price_bedrooms = predict_output(bedrooms_to_predict, slope_bedrooms, intercept_bedrooms)
print("Predicted Price for a House with", bedrooms_to_predict, "bedrooms (using bedrooms model):", predicted_price_bedrooms)

Slope for square feet model: -53472.56642647893
Intercept for square feet model: 285.4778860892352
Slope for bedrooms model: 131805.68353624095
Intercept for bedrooms model: 121256.50021708787
Model using bedrooms has the lowest RSS on test data.
Predicted Price for a House with 2650 sqft (using sqft model): -141702015.55228308
Predicted Price for a House with 3 bedrooms (using bedrooms model): 516673.5508258107


In [8]:

def simple_linear_regression(input_feature, output):
    input_array = np.array(input_feature)
    output_array = np.array(output)
    
    # Calculate necessary sums
    n = len(input_array)
    sum_x = np.sum(input_array)
    sum_y = np.sum(output_array)
    sum_xy = np.sum(input_array * output_array)
    sum_x_squared = np.sum(input_array**2)
    
    # Calculate slope (beta1) using closed-form solution
    slope = (sum_xy - (sum_x * sum_y) / n) / (sum_x_squared - (sum_x**2) / n)
    
    # Calculate intercept (beta0)
    intercept = (sum_y - slope * sum_x) / n
    
    return intercept, slope
    

# Given regression parameters
slope = -53472.56642647893150
intercept = 285.477886089235210000

# Given house price
house_price = 800000

# Calculate estimated square footage using the inverse regression function
estimated_sqft = (house_price - intercept) / slope

print("Estimated Square Feet for a House Costing $800,000:", estimated_sqft)

Estimated Square Feet for a House Costing $800,000: -14.955603883600066
