In [11]:
#  Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

#  Load the dataset
df = pd.read_csv("house price data.csv")

#  Remove duplicate rows (compared across every original column)
df = df.drop_duplicates()

#  Remove duplicate columns: keep only the first column for each repeated label
df = df.loc[:, ~df.columns.duplicated()]

#  Drop non-numeric and non-useful columns (keep only numeric).
#  This happens BEFORE any NaN handling so that a gap in a column we discard
#  here does not cost us an otherwise complete numeric row.
df = df.select_dtypes(include=[np.number])

#  Print column names to confirm
print(" Numeric Columns Used for Prediction:")
print(df.columns.tolist())

#  Check basic info and missing values while the gaps are still present,
#  so the report below reflects the data as loaded rather than all zeros.
print("\n Dataset Info:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

print("\n Missing Values:")
print(df.isnull().sum())

#  Drop rows with missing values in the remaining (numeric) columns.
#  One pass is sufficient; nothing after this point introduces NaNs.
df = df.dropna()

#  Choose the regression target and fail fast if df does not contain it
# Replace 'price' with the actual target column if different
target_column = 'price'
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in dataset.")

#  Target vector first, then every other column as the feature matrix
y = df[target_column]
X = df.drop(target_column, axis=1)

#  Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#  Fit a linear regression on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

#  Predict on the held-out rows and score the predictions
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

#  Single print call; sep="\n" yields the same three lines as separate prints
print(
    "\n Model Evaluation:",
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared: {r2:.2f}",
    sep="\n",
)


 Numeric Columns Used for Prediction:
['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']

 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          4600 non-null   float64
 1   bedrooms       4600 non-null   float64
 2   bathrooms      4600 non-null   float64
 3   sqft_living    4600 non-null   int64  
 4   sqft_lot       4600 non-null   int64  
 5   floors         4600 non-null   float64
 6   waterfront     4600 non-null   int64  
 7   view           4600 non-null   int64  
 8   condition      4600 non-null   int64  
 9   sqft_above     4600 non-null   int64  
 10  sqft_basement  4600 non-null   int64  
 11  yr_built       4600 non-null   int64  
 12  yr_renovated   4600 non-null   int64  
dtypes: float