In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
from sklearn.metrics import r2_score, mean_squared_error

# 1. Load the dataset

In [6]:
try:
    data = pd.read_csv('data.csv')
    print("Dataset loaded successfully.")
    print("First 5 rows of the dataset:")
    print(data.head())
    print("\nDataset Info:")
    data.info()
except FileNotFoundError:
    print("Error: '4) house Prediction Data Set.csv' not found. Please ensure the file is correctly uploaded.")
    exit()

Dataset loaded successfully.
First 5 rows of the dataset:
  0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00
0   0.02731   0.00   7.070  0  0.4690  6.4210  78...                                             
1   0.02729   0.00   7.070  0  0.4690  7.1850  61...                                             
2   0.03237   0.00   2.180  0  0.4580  6.9980  45...                                             
3   0.06905   0.00   2.180  0  0.4580  7.1470  54...                                             
4   0.02985   0.00   2.180  0  0.4580  6.4300  58...                                             

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 1 columns):
 #   Column                                                                                            Non-Null Count  Dtype 
---  ------                                                                                            ------

# 2. Preprocess the data

In [7]:
target_column = None
possible_price_columns = ['Price', 'SalePrice', 'HousePrice', 'price']
for col in possible_price_columns:
    if col in data.columns:
        target_column = col
        break

In [8]:
if target_column is None:
    print("\nCould not automatically identify the target 'Price' column.")
    print("Please specify the target column from the available columns:")
    print(data.columns.tolist())
    # As a fallback, let's try to use the last column as target if 'Price' isn't found
    # This is a heuristic and might need manual adjustment by the user.
    if data.shape[1] > 1:
        target_column = data.columns[-1]
        print(f"Using '{target_column}' as the target column as a fallback.")
    else:
        print("Dataset has only one column or no suitable target column found. Cannot proceed.")
        exit()


Could not automatically identify the target 'Price' column.
Please specify the target column from the available columns:
[' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00']
Dataset has only one column or no suitable target column found. Cannot proceed.


In [9]:
initial_rows = data.shape[0]
data.dropna(inplace=True)
rows_after_dropna = data.shape[0]
if initial_rows > rows_after_dropna:
    print(f"\nDropped {initial_rows - rows_after_dropna} rows with missing values.")

In [10]:
features = [col for col in data.columns if col != target_column]

In [11]:
numeric_features = data[features].select_dtypes(include=np.number).columns.tolist()
if len(numeric_features) != len(features):
    print("\nWarning: Some non-numeric features were excluded. Only numeric features will be used.")
    print(f"Original features: {features}")
    print(f"Numeric features used: {numeric_features}")


Original features: [' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00']
Numeric features used: []


In [12]:
features = numeric_features

In [13]:
if not features:
    print("Error: No numeric features found to train the model. Please check your dataset.")
    exit()

Error: No numeric features found to train the model. Please check your dataset.


In [None]:
X = data[features]
y = data[target_column]

KeyError: None

: 

In [None]:
print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(f"\nTraining data size: {X_train.shape[0]} samples")
print(f"Testing data size: {X_test.shape[0]} samples")

# 3. Train a linear regression model

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
print("\nLinear Regression model trained successfully.")

# 4. Interpret the model coefficients

In [None]:
print("\n--- Model Coefficients ---")
print(f"Intercept: {model.intercept_:.2f}")

In [None]:
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print(coefficients)
print("\nInterpretation of Coefficients:")
print("For each unit increase in a feature, holding other features constant,")
print("the target variable (Price) is expected to change by the value of its coefficient.")

# 5. Evaluate the model

In [None]:
y_pred = model.predict(X_test)

r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse

In [None]:
print("\n--- Model Evaluation ---")
print(f"R-squared (R²): {r_squared:.4f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

In [None]:
print("\nInterpretation of Evaluation Metrics:")
print(f"R-squared (R²): {r_squared:.4f} indicates the proportion of the variance in the dependent variable (Price)")
print("that is predictable from the independent variables (features). A higher R² value (closer to 1) indicates a better fit.")
print(f"Mean Squared Error (MSE): {mse:.2f} is the average of the squared differences between the predicted and actual values.")
print("It measures the average magnitude of the errors. Lower MSE indicates better accuracy.")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} is the square root of MSE and is in the same units as the target variable,")
print("making it more interpretable than MSE.")