In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
from sklearn.metrics import r2_score, mean_squared_error

# 1. Load the dataset

In [6]:
# Load the housing data and attach column names.
#
# The file is read with no header row and a whitespace delimiter. When it has
# exactly 14 columns the Boston Housing names are applied; otherwise generic
# names are generated, with the last column treated as the target ('price').
data_path = 'data.csv'

try:
    # Raw string: '\s' inside a plain string literal is an invalid escape
    # sequence and makes recent Python versions emit a warning.
    data = pd.read_csv(data_path, sep=r'\s+', header=None)
except FileNotFoundError:
    # Report the path that was actually requested, then re-raise so the
    # traceback is kept and execution stops at this cell. exit() is an
    # interactive-shell helper and is not meant for aborting program flow.
    print(f"Error: '{data_path}' not found. Please ensure the file is correctly uploaded.")
    raise
except Exception as e:
    print(f"An error occurred during file loading: {e}")
    print("Please check the file format and delimiter. It should be space-separated.")
    raise

column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
    'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'
]

if data.shape[1] == len(column_names):
    data.columns = column_names
    print("Dataset loaded and Boston Housing column names assigned.")
else:
    # Fallback naming: every column but the last is a feature, the last one
    # is the target.
    num_columns = data.shape[1]
    generic_column_names = [f'feature_{i}' for i in range(num_columns - 1)] + ['price']
    data.columns = generic_column_names
    print(f"Warning: Number of columns ({num_columns}) does not match typical Boston Housing dataset (14).")
    print("Assigned generic column names as a fallback.")

print("\nFirst 5 rows of the dataset:")
print(data.head())
print("\nDataset Info (initial load):")
data.info()

Dataset loaded and Boston Housing column names assigned.

First 5 rows of the dataset:
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

Dataset Info (initial load):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non

  data = pd.read_csv('data.csv', sep='\s+', header=None)


# 2. Preprocess the data

In [7]:
# Convert every column to a numeric dtype in one pass. With errors='coerce',
# entries that cannot be parsed become NaN instead of raising.
data = data.apply(pd.to_numeric, errors='coerce')

In [8]:
# Show dtypes and non-null counts again now that the columns were coerced.
info_heading = "\nDataset Info (after numeric conversion):"
print(info_heading)
data.info()


Dataset Info (after numeric conversion):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [9]:
# Capture the row count on both sides of the NaN filter so the removal can be
# reported afterwards.
initial_rows = len(data)
data = data.dropna()
rows_after_dropna = len(data)

In [10]:
# Report whether the NaN filter removed anything.
no_rows_lost = rows_after_dropna >= initial_rows
if no_rows_lost:
    print("\nNo missing values found after numeric conversion.")
else:
    print(f"\nDropped {initial_rows - rows_after_dropna} rows with missing values.")


No missing values found after numeric conversion.


In [11]:
# Abort when cleaning left no rows: there is nothing to train on.
# An exception is raised instead of calling exit(), which is an
# interactive-shell helper; raising stops execution at this cell with a
# traceback that carries the reason.
if data.empty:
    raise ValueError(
        "Error: DataFrame is empty after dropping missing values. Cannot proceed with model training."
    )

In [12]:
# Choose the regression target. 'MEDV' is preferred; 'price' is the name used
# by the generic fallback naming. The first candidate present wins.
target_candidates = ('MEDV', 'price')
target_column = next(
    (candidate for candidate in target_candidates if candidate in data.columns),
    None,
)

# Raise rather than call exit(): exit() is an interactive-shell helper, while
# an exception stops execution at this cell and explains why.
if target_column is None:
    raise ValueError(
        "Error: Could not determine target column. No 'MEDV' or 'price' column found after loading."
    )

In [13]:
# Echo which column will be predicted.
selected_target_message = f"\nSelected target column: '{target_column}'"
print(selected_target_message)


Selected target column: 'MEDV'


In [14]:
# Every column except the target is used as a model input.
features = data.columns.drop(target_column).tolist()

In [15]:
# Abort when the target is the only column: a model needs at least one input.
# An exception is raised instead of calling exit(), which is an
# interactive-shell helper; raising stops execution at this cell with the
# reason attached.
if not features:
    raise ValueError(
        "Error: No features found to train the model after preprocessing. Cannot proceed."
    )

In [16]:
# Separate the inputs (X) from the column being predicted (y).
X, y = data[features], data[target_column]

In [17]:
# Sanity-check the dimensions of the inputs and the target.
print(
    f"\nFeatures (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    sep="\n",
)


Features (X) shape: (506, 13)
Target (y) shape: (506,)


In [18]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [19]:
# Report how many rows landed in each partition.
split_size_lines = (
    f"\nTraining data size: {X_train.shape[0]} samples",
    f"Testing data size: {X_test.shape[0]} samples",
)
for split_size_line in split_size_lines:
    print(split_size_line)


Training data size: 404 samples
Testing data size: 102 samples


# 3. Train a linear regression model

In [20]:
# Fit ordinary least squares on the training partition. fit() returns the
# estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
print("\nLinear Regression model trained successfully.")


Linear Regression model trained successfully.


# 4. Interpret the model coefficients

In [21]:
# Show the fitted intercept under a section banner.
print(
    "\n--- Model Coefficients ---",
    f"Intercept: {model.intercept_:.2f}",
    sep="\n",
)


--- Model Coefficients ---
Intercept: 30.25


In [22]:
# Pair each input column name with its fitted weight and display the table.
coefficient_columns = {
    'Feature': X.columns,
    'Coefficient': model.coef_,
}
coefficients = pd.DataFrame(coefficient_columns)
print(coefficients)

# Short reading guide for the table above.
interpretation_lines = (
    "\nInterpretation of Coefficients:",
    "For each unit increase in a feature, holding other features constant,",
    "the target variable (Price) is expected to change by the value of its coefficient.",
)
print(*interpretation_lines, sep="\n")

    Feature  Coefficient
0      CRIM    -0.113056
1        ZN     0.030110
2     INDUS     0.040381
3      CHAS     2.784438
4       NOX   -17.202633
5        RM     4.438835
6       AGE    -0.006296
7       DIS    -1.447865
8       RAD     0.262430
9       TAX    -0.010647
10  PTRATIO    -0.915456
11        B     0.012351
12    LSTAT    -0.508571

Interpretation of Coefficients:
For each unit increase in a feature, holding other features constant,
the target variable (Price) is expected to change by the value of its coefficient.


# 5. Evaluate the model

In [None]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X=X_test)

In [None]:
# Compare predictions against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
r_squared = r2_score(y_true=y_test, y_pred=y_pred)

SyntaxError: incomplete input (1037571122.py, line 5)

In [None]:
# Print the three evaluation metrics under a section banner.
evaluation_lines = [
    "\n--- Model Evaluation ---",
    f"R-squared (R²): {r_squared:.4f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
]
print("\n".join(evaluation_lines))

In [None]:
# Plain-language explanation of each metric, with the computed values inlined.
metric_explanations = (
    "\nInterpretation of Evaluation Metrics:",
    f"R-squared (R²): {r_squared:.4f} indicates the proportion of the variance in the dependent variable ({target_column})",
    "that is predictable from the independent variables (features). A higher R² value (closer to 1) indicates a better fit.",
    f"Mean Squared Error (MSE): {mse:.2f} is the average of the squared differences between the predicted and actual values.",
    "It measures the average magnitude of the errors. Lower MSE indicates better accuracy.",
    f"Root Mean Squared Error (RMSE): {rmse:.2f} is the square root of MSE and is in the same units as the target variable,",
    f"making it more interpretable than MSE (which is in units of {target_column} squared).",
)
for explanation_line in metric_explanations:
    print(explanation_line)