In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

#### Link to the Dataset: https://www.kaggle.com/datasets/prokshitha/home-value-insights/data

# Load the dataset

In [88]:
# Read the house-price CSV (expected in the notebook's working directory).
df = pd.read_csv("house_price_regression_dataset.csv")

# Understand the dataset

In [89]:
# Dataset dimensions as a (rows, columns) tuple.
df.shape

In [90]:
# Preview the first rows to check column names and value formats.
df.head()

In [91]:
# Print the column dtypes and non-null counts.
df.info()

In [92]:
# Summary statistics for the dataset's columns.
df.describe()

In [93]:
# Number of missing values in each column.
df.isna().sum()

In [94]:
# Count the rows that exactly repeat an earlier row.
df.duplicated().sum()

In [95]:
# Standardise the feature columns with StandardScaler.
# The feature frame is built from `df` right here because `X` is only assigned
# in a later cell; referencing it at this point raises NameError on a fresh
# kernel (Restart & Run All).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(
    df[[col for col in df.columns if col != "House_Price"]]
)

# NOTE(review): `X_scaled` is not used by the cells visible in this notebook
# (the model is fitted on the unscaled `X`), and the scaler is fitted on all
# rows before any train/test split -- confirm whether scaling is still wanted.

# Default experiment settings.
train_sizes = [0.5, 0.75]
test_size = 0.3
random_seed = 42

# Train Test Split

In [96]:
# Column to predict; every other column of `df` is treated as a feature.
target = "House_Price"
features = [name for name in df.columns if name != target]

In [97]:
# Feature matrix (all feature columns) and target vector.
X = df.loc[:, features]
y = df.loc[:, target]

In [98]:
def evaluate_model(X, y, test_size, random_seed, dataset_fraction):
    """Fit a linear regression on a random subsample and score it on held-out rows.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` / ``LinearRegression``.
    y : target values aligned with ``X``.
    test_size : share (or count) of the subsample held out for testing;
        forwarded to ``train_test_split``.
    random_seed : ``random_state`` used for both the subsampling step and the
        train/test split.
    dataset_fraction : portion of the data kept before splitting. A float of
        exactly ``1.0`` keeps every row; any other value is forwarded to
        ``train_test_split`` as ``train_size``.

    Returns
    -------
    tuple
        ``(mse, r2)`` computed on the test split.
    """
    # train_test_split only accepts a float train_size strictly between 0 and 1,
    # so "use the whole dataset" has to bypass the subsampling step instead of
    # being passed through (which raises an error).
    if isinstance(dataset_fraction, float) and dataset_fraction == 1.0:
        X_sample, y_sample = X, y
    else:
        X_sample, _, y_sample, _ = train_test_split(
            X, y, train_size=dataset_fraction, random_state=random_seed
        )

    # Split the (sub)sample into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=test_size, random_state=random_seed
    )

    # Train the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predictions on the held-out rows
    y_pred = model.predict(X_test)

    # Evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print evaluation results; the first figure is the share of the dataset
    # that was sampled, not the training share of the split.
    print(f"\nDataset Size: {dataset_fraction * 100}% | Test Size: {test_size * 100}% | Random Seed: {random_seed}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R² Score: {r2:.4f}")

    return mse, r2

# Evaluate the model

In [99]:
# Experiment grid: every combination of these values is evaluated.
train_test_splits = [0.3, 0.2, 0.1]      # held-out share passed as test_size
random_seeds = [42, 7, 21]               # random_state values
dataset_sizes = [0.25, 0.5, 0.75, 1.0]   # share of the dataset sampled before splitting

In [100]:
# One (size, test_size, seed, mse, r2) record per evaluated configuration.
results = []

for test_size in train_test_splits:
    for seed in random_seeds:
        for size in dataset_sizes:
            mse, r2 = evaluate_model(
                X,
                y,
                test_size=test_size,
                random_seed=seed,
                dataset_fraction=size,
            )
            results.append((size, test_size, seed, mse, r2))

Run the last two cells separately to get the results.

In [None]:
# Collect the per-run records into a table, one row per configuration.
results_df = pd.DataFrame(
    data=results,
    columns=[
        'Dataset Size',
        'Test Size',
        'Random Seed',
        'MSE',
        'R2 Score',
    ],
)
print("\nFinal Results Summary:\n", results_df)


In [None]:
# R² score against the sampled dataset share, coloured by test size.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=results_df,
    x="Dataset Size",
    y="R2 Score",
    hue="Test Size",
    marker="o",
    ax=ax,
)
ax.set_title("Effect of Dataset Size on Model Performance")
ax.set_xlabel("Dataset Size")
ax.set_ylabel("R² Score")
ax.legend(title="Test Size")
ax.grid(True)
plt.show()