Setting Up Your Environment

# Install the necessary libraries using pip
pip install numpy pandas matplotlib seaborn scikit-learn

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Dataset reference: https://scikit-learn.org/stable/datasets.html

# Fetch the California housing data and assemble a single frame: one column
# per feature, plus the target (median house value) stored as 'MedHouseVal'.
california = fetch_california_housing()
df = pd.DataFrame(
    california.data,
    columns=california.feature_names,
).assign(MedHouseVal=california.target)

# Preview the first rows of the assembled frame
print(df.head())


In [None]:
# Exploratory Analysis

import matplotlib.pyplot as plt
import seaborn as sns

# Descriptive statistics for every column
print(df.describe())

# Count of missing entries per column
print(df.isnull().sum())

# Histogram (with a KDE overlay) of the target variable
target_ax = sns.histplot(df['MedHouseVal'], kde=True)
target_ax.set_title('Distribution of Median House Value')
plt.show()

# Annotated heatmap of the pairwise correlations between all columns
corr_matrix = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
## Data Processing before the model

from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns
y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build the model 

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Construct the linear regression and fit it on the training split in one step
# (fit() returns the estimator itself, so the two calls can be chained)
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Score those predictions with mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


In [None]:
# Model Evaluation

# Scatter of predicted against actual target values on the test split
eval_fig, eval_ax = plt.subplots()
eval_ax.scatter(y_test, y_pred)
eval_ax.set(
    xlabel="True Values",
    ylabel="Predictions",
    title="True Values vs Predictions",
)
plt.show()


In [None]:
## Cross Validation

from sklearn.model_selection import cross_val_score

# Fresh, unfitted estimator for cross-validation
model = LinearRegression()

# 10-fold cross-validation over the complete X / y; the scorer reports the
# negated MSE, so the raw scores are non-positive
cv_scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=10,
)

# Flip the sign of the mean to report a positive MSE; the standard deviation
# is unaffected by the sign, so it is taken from the raw scores
cv_mean_score, cv_std_score = -cv_scores.mean(), cv_scores.std()

print(f"Mean CV MSE: {cv_mean_score:.4f}")
print(f"CV MSE Standard Deviation: {cv_std_score:.4f}")

# Fit the estimator on the training split so it can be used for prediction
model.fit(X_train, y_train)

#Calculate R Squared

from sklearn.metrics import r2_score

# Predictions for both splits from the fitted model
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Coefficient of determination (R²) for each split
r2_train, r2_test = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

print(f"R² on training data: {r2_train:.4f}")
print(f"R² on test data: {r2_test:.4f}")

#Adjusted R Squared

def adjusted_r2(r_squared, n, p):
    """Return R² adjusted for the number of predictors.

    Computes ``1 - (1 - r_squared) * (n - 1) / (n - p - 1)``.

    Args:
        r_squared: Plain R² score to adjust.
        n: Number of observations the score was computed on.
        p: Number of predictors in the model.

    Returns:
        The adjusted R² value.

    Raises:
        ValueError: If ``n - p - 1`` is not positive. The adjustment is
            undefined there: the division would be by zero or by a negative
            number of residual degrees of freedom.
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        # Fail loudly instead of dividing by zero or returning a
        # sign-flipped, meaningless value.
        raise ValueError(
            f"adjusted R² requires n > p + 1 (got n={n}, p={p})"
        )
    return 1 - (1 - r_squared) * (n - 1) / residual_dof

# Adjusted R² must use the observation count of the split the R² was computed on.
n = X_train.shape[0]  # number of observations in the training split
p = X_train.shape[1]  # number of predictors
n_test = X_test.shape[0]  # number of observations in the test split

adj_r2_train = adjusted_r2(r2_train, n, p)
# r2_test was computed on the test rows, so adjust it with the test-split size
# rather than the training-split size.
adj_r2_test = adjusted_r2(r2_test, n_test, p)

print(f"Adjusted R² on training data: {adj_r2_train:.4f}")
print(f"Adjusted R² on test data: {adj_r2_test:.4f}")


# Mean Absolute Error (MAE) and Mean Squared Error (MSE)

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Mean absolute error and mean squared error on the training split
mae_train, mse_train = (
    mean_absolute_error(y_train, y_train_pred),
    mean_squared_error(y_train, y_train_pred),
)

# The same two metrics on the test split
mae_test, mse_test = (
    mean_absolute_error(y_test, y_test_pred),
    mean_squared_error(y_test, y_test_pred),
)

print(f"Train MAE: {mae_train:.4f}")
print(f"Train MSE: {mse_train:.4f}")
print(f"Test MAE: {mae_test:.4f}")
print(f"Test MSE: {mse_test:.4f}")


## Coefficients and Variable Importance

# One row per feature with its fitted coefficient, largest coefficient first.
# NOTE(review): no feature scaling is applied in the visible cells, so the
# coefficient magnitudes depend on each feature's units — confirm before
# reading them as importance.
coefficients = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': model.coef_}
)
coefficients = coefficients.sort_values('Coefficient', ascending=False)

print(coefficients)
