## Load the cleaned dataset into a pandas DataFrame.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

# Location of the pre-cleaned modelling dataset (relative to the notebook's
# working directory).
data_path = 'cleaned_data_for_modelling.csv'

# Read the CSV into a DataFrame; every later cell works from this frame.
cleaned_data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top five rows as a quick sanity check on the load.
cleaned_data.head(n=5)

## Feature Selection
Separate the features (independent variables) from the target variable (dependent variable).

- `id`: dropped because it is a unique identifier and does not contribute to prediction.
- `date`: dropped because it would need to be transformed before it could be used meaningfully.
- `price`: dropped from the features because it is the target variable.

In [None]:
# The target is the sale price column.
target = cleaned_data['price']

# Everything except the identifier, the raw date and the target itself is
# kept as a predictor.
features = cleaned_data.drop(['id', 'date', 'price'], axis='columns')

## Identify Categorical Features
Identify the feature columns with `object` dtype; these are treated as categorical.

In [None]:
# Columns stored with object dtype are treated as categorical features.
object_typed = features.select_dtypes(include=['object'])
categorical_features = object_typed.columns
categorical_features

## Encode Categorical Features
Convert categorical variables into numeric format using one-hot encoding.

In [None]:
# Expand each categorical column into indicator (dummy) columns.
# drop_first=True omits the first level of every categorical column, so each
# one contributes (levels - 1) indicators.
features_encoded = pd.get_dummies(
    features,
    columns=categorical_features,
    drop_first=True,
)

## Split Data
Split the dataset into training and testing sets.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded,
    target,
    test_size=0.2,
    random_state=42,
)


## Train Linear Regression Model
Initialize and train the Linear Regression model using the training data.

In [None]:
# Build an ordinary least-squares model and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be
# chained into a single expression.
linear_regressor = LinearRegression().fit(X_train, y_train)

## Cross-Validation
Perform 5-fold cross-validation on the full encoded dataset (training and test rows together), scoring each fold by mean absolute error.

In [None]:
# Five shuffled folds; the seed makes the fold assignment reproducible.
kfold = KFold(5, shuffle=True, random_state=42)

In [None]:
# Define the scoring metric.
# MAE is a loss (lower is better), so the scorer must be declared with
# greater_is_better=False. scikit-learn then sign-flips the metric so that
# "higher score is better" still holds if this scorer is ever handed to a
# model-selection tool; without the flag such a tool would favour the model
# with the LARGEST error.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Perform cross-validation on the full encoded dataset.
# The scorer yields negated MAE per fold, so negate once more to report the
# usual positive error values.
cv_scores = -cross_val_score(linear_regressor, features_encoded, target, cv=kfold, scoring=scorer)

# Print cross-validation results (per-fold MAE, then its mean and spread)
print("Cross-Validation MAE Scores:", cv_scores)
print("Mean CV MAE:", cv_scores.mean())
print("Standard Deviation CV MAE:", cv_scores.std())

## Make Predictions
Use the trained model to make predictions on the test data.

In [None]:
# Predict prices for the held-out test rows with the model fitted above.
y_pred = linear_regressor.predict(X_test)
# Bare expression so the notebook displays the predictions.
y_pred

## Evaluate Model
Evaluate the performance of the model using various metrics.

In [None]:
# Score the test-set predictions with each metric in turn.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# RMSE is the square root of MSE, which puts the error back in the target's
# own units.
rmse = mse ** 0.5

# Display all four metrics together.
mae, mse, rmse, r2

'mean_absolute_error' (MAE) measures the average magnitude of the errors in a set of predictions, without considering their direction.

MAE = $134,967.31757601068

'mean_squared_error' (MSE) measures the average of the squares of the errors.

MSE = 40,847,888,304.9372 (in squared dollars, so it is not directly comparable to a price)

'root_mean_squared_error' (RMSE) is the square root of the MSE, giving an error measure in the same units as the target variable.

RMSE = $202,108.60522238334

'r2_score' (R-squared) indicates the proportion of the variance in the dependent variable that is predictable from the independent variables.

R2 = 0.684272693492245

## Model Coefficients
A DataFrame listing the fitted coefficient for each feature, i.e. the change in predicted price per unit change in that feature when the other features are held fixed.

In [None]:
# Pair every training column with the weight the fitted model assigned to it.
coefficient_columns = {
    'Feature': X_train.columns,
    'Coefficient': linear_regressor.coef_,
}
coefficients = pd.DataFrame(coefficient_columns)
coefficients

## Residuals Analysis
Plot the distribution of residuals, helping us understand if there are any patterns in the errors.

In [None]:
# Residual = actual - predicted, so a positive residual means the model
# under-predicted the price.
residuals = y_test - y_pred

# Histogram of the residuals with a kernel-density curve overlaid, drawn on
# an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Distribution of Residuals')
ax.set_xlabel('Residual')
ax.set_ylabel('Frequency')
plt.show()

## Actual vs. Predicted Values
This scatter plot shows how closely the predicted values match the actual values. The red dashed line represents the ideal fit where predicted values equal actual values.

In [None]:
# Scatter the predictions against the true prices on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.3)

# Reference diagonal (predicted == actual), spanning the range of the actual
# test prices.
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'r--', lw=2)

ax.set_title('Actual vs Predicted Prices')
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
plt.show()