<h3>Import the requirements</h3>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


<p>Load the data</p>
<p>Source <a href="https://www.kaggle.com/datasets/altavish/boston-housing-dataset">https://www.kaggle.com/datasets/altavish/boston-housing-dataset</a> </p>


In [2]:
# Read the housing CSV (looked up relative to the notebook's working directory)
# into the notebook-level frame `data`, then preview its first five rows.
data = pd.read_csv('HousingData.csv')
data.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [3]:
# Count the missing entries in each column before choosing an imputation strategy.
data.isna().sum()


CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [4]:
# Mean imputation
# Replace every missing value with the mean of its own column. The result is
# assigned back to `data` instead of calling `data[column].fillna(..., inplace=True)`:
# that chained in-place form operates on a temporary Series, emits a warning on
# recent pandas 2.x releases, and no longer updates the frame once Copy-on-Write
# is active (the default from pandas 3.0).
# NOTE(review): CHAS appears to be an indicator column (only 0.0 shows in the
# preview) - a mean fill gives it fractional values; confirm that is acceptable.
data = data.fillna(data.mean(numeric_only=True))

# Every per-column count below should now be 0.
data.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [5]:
# Preview the frame again after imputation; row 4's LSTAT was blank in the first preview.
data.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,12.715432,36.2


In [6]:
# Rank every column by its correlation with the target MEDV,
# from the most positive down to the most negative.
(
    data.corr()
    .loc[:, 'MEDV']
    .sort_values(ascending=False)
)

MEDV       1.000000
RM         0.695360
ZN         0.365943
B          0.333461
DIS        0.249929
CHAS       0.179882
CRIM      -0.379695
AGE       -0.380223
RAD       -0.381626
NOX       -0.427321
TAX       -0.468536
INDUS     -0.478657
PTRATIO   -0.507787
LSTAT     -0.721975
Name: MEDV, dtype: float64

In [7]:
def get_significant_cols(frame=None, target='MEDV', threshold=0.5):
    """Return the columns whose correlation with ``target`` is at least ``threshold`` in magnitude.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table to analyse. When omitted, the notebook-level ``data`` frame is
        used, so the original zero-argument call keeps working.
    target : default 'MEDV'
        Label of the column every other column is correlated against.
    threshold : float, default 0.5
        Minimum absolute correlation a column needs in order to be kept.

    Returns
    -------
    list
        Selected column labels, ordered from the most positive to the most
        negative correlation. ``target`` itself is never included. The list
        is also printed before it is returned.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the correlation matrix.
    """
    if frame is None:
        frame = data  # fall back to the notebook's global DataFrame
    correlation_with_target = frame.corr()[target].sort_values(ascending=False)
    strong = correlation_with_target[correlation_with_target.abs() >= threshold]
    # Filter the target out rather than calling list.remove(): remove() raises
    # ValueError when the target was not selected in the first place (for
    # example a NaN self-correlation on a constant column, or a threshold > 1).
    significant_columns = [label for label in strong.index if label != target]
    print(significant_columns)
    return significant_columns

In [9]:
# Split the selected feature columns and the MEDV target into train and test
# parts: 20% of the rows are held out, with a fixed random_state for repeatability.
# NOTE(review): get_significant_cols() picks the features from correlations over
# all rows of `data`, including the rows that end up in the test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[get_significant_cols()],
    data['MEDV'],
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training part and predict the held-out rows.
model = LinearRegression()
model.fit(X_train, Y_train)
prediction = model.predict(X_test)

# Score the predictions: mean squared error and coefficient of determination (R^2).
mse = mean_squared_error(Y_test, prediction)
print("Mean Squared Error:", mse)
r2 = r2_score(Y_test, prediction)
print("R-squared:", r2)

['RM', 'PTRATIO', 'LSTAT']
Mean Squared Error: 27.533016964657907
R-squared: 0.6245520716630016
