In [None]:
import pandas as pd

# Read the CSV into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='HPI_master.csv')

# Preview the top of the table (head() returns the first 5 rows by default)
df.head(n=5)

In [None]:
# Per-column count of missing entries (`isna` is the same check as `isnull`)
df.isna().sum()

In [None]:
# Fill missing values with the mean of the column.
#
# The filled Series is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on `df[col]`: that form is chained assignment on an
# intermediate Series, which recent pandas warns about and which leaves `df`
# unchanged once Copy-on-Write is active.
# NOTE(review): the mean is taken over every row of the column and before the
# train/test split performed later in this notebook — confirm that a single
# global mean is the intended imputation.
for column in ('index_nsa', 'index_sa'):
    df[column] = df[column].fillna(df[column].mean())

# Check again for missing values
df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the `index_nsa` column, drawn through the explicit
# Figure/Axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['index_nsa'], bins=30, color='skyblue', edgecolor='black')
ax.set_title('Distribution of House Prices')
ax.set_xlabel('House Price Index')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Calculate the correlation between the numerical features and the target
# variable, `index_nsa`.
#
# The frame is restricted to numeric dtypes before calling `corr()`: since
# pandas 2.0 `DataFrame.corr()` defaults to `numeric_only=False` and raises if
# the frame holds a column it cannot convert to float, whereas earlier versions
# silently skipped such columns. Selecting numeric columns explicitly works on
# both sides of that change.
numeric_df = df.select_dtypes(include='number')
correlation = numeric_df.corr()['index_nsa'].sort_values(ascending=False)

# Display the correlation, largest value first
correlation

In [None]:
from sklearn.model_selection import train_test_split

# Column names for the model inputs and for the value being predicted
feature_columns = ['yr', 'index_sa']
target_column = 'index_nsa'

X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible between runs.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Shapes in the order (X_train, X_test, y_train, y_test)
tuple(part.shape for part in splits)

In [None]:
from sklearn.linear_model import LinearRegression

# Build an ordinary least-squares regressor and fit it on the training split
# in one step (`fit` returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output
model

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# R^2 and MSE are computed independently; RMSE is the square root of MSE
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# Show the metrics in the order (MSE, RMSE, R^2)
mse, rmse, r2