In [3]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing


In [4]:
# Load the California Housing dataset
housing = fetch_california_housing()

# Convert to Pandas DataFrame for easier manipulation
df = pd.DataFrame(housing.data, columns=housing.feature_names)

# Add the target variable to the DataFrame under the name PRICE.
# (Per the scikit-learn dataset documentation the target is the median house
# value of each district, expressed in units of $100,000.)
df['PRICE'] = housing.target

# Print a short overview of the dataset
print("Dataset Overview:")
print(f"Number of samples: {df.shape[0]}")
# PRICE is the target column, so it is excluded from the feature count
print(f"Number of features: {df.shape[1] - 1}")
print("\nFeature descriptions:")
# Iterate over the names directly; no positional index is needed
for feature_name in housing.feature_names:
    print(f"- {feature_name}")

# Display the first few rows of the dataset
df.head()





In [5]:
# Display basic summary statistics for every column of the dataset
# (the features plus the PRICE target). Left as the cell's last expression
# so the notebook renders the result as a table.
df.describe()



In [6]:
# Pairwise Pearson correlation between every pair of columns in df
# (features and the PRICE target alike)
corr_matrix = df.corr(method="pearson")

# Label the output, then leave the matrix as the cell's last expression so
# the notebook renders it as a table
print("Correlation Matrix:")
corr_matrix





In [7]:
# Visualize the correlation matrix using a heatmap
fig, ax = plt.subplots(figsize=(12, 10))

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# those cells only repeat what the lower triangle already shows.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Pin the colour scale to the full correlation range [-1, 1], centred on 0.
# A narrower upper limit (such as vmax=.3) would paint every coefficient above
# it in the same saturated colour, hiding the differences between the
# strongest correlations.
sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            annot=True, fmt=".2f", ax=ax)

ax.set_title('Correlation Matrix Heatmap', fontsize=16)
fig.tight_layout()
plt.show()



### Observations from Correlation Matrix:

- A strong positive correlation (a value close to 1) indicates that as one feature increases, the other also tends to increase.
- A strong negative correlation (a value close to -1) indicates that as one feature increases, the other tends to decrease.
- A correlation close to 0 indicates little to no linear relationship between the two features; a non-linear relationship may still exist.
- Correlation measures association only; it does not by itself show that one feature causes changes in the other.

In [8]:
# Create a pair plot to visualize pairwise relationships
# Note: Using a subset of features to make the plot more readable
features_to_plot = ['MedInc', 'HouseAge', 'AveRooms', 'AveOccup', 'PRICE']

# sns.pairplot is a figure-level function: it builds its own figure and
# returns a PairGrid. No plt.figure() call is made beforehand, because that
# would only leave an unused, empty figure behind; the overall size is
# controlled by `height` (inches per panel) instead.
pair_grid = sns.pairplot(df[features_to_plot], height=2.5, diag_kind='kde',
                         plot_kws={'alpha': 0.6, 's': 15, 'edgecolor': 'k'})

# Title the grid's own figure; y slightly above 1 places the title above the
# top row of panels
pair_grid.figure.suptitle('Pair Plot of California Housing Features', y=1.02, fontsize=16)
plt.tight_layout()
plt.show()





In [None]:
# Optional: Create a more comprehensive pair plot with all features
# Warning: This may take a long time to render, because seaborn draws one
# panel for every pair of columns in df
# Note: sns.pairplot creates its own figure, so no plt.figure() call is needed;
# the figure size is controlled through `height` (inches per panel)
# Uncomment the following code to run it

# pair_grid_all = sns.pairplot(df, height=2, plot_kws={'alpha': 0.5, 's': 10, 'edgecolor': 'k'})
# pair_grid_all.figure.suptitle('Comprehensive Pair Plot of California Housing Features', y=1.02, fontsize=16)
# plt.tight_layout()
# plt.show()

### Conclusions:

1. The correlation heatmap reveals relationships between different housing features.
2. The median income (`MedInc`) has the strongest positive correlation with housing prices.
3. The pair plot shows the distribution of each feature and the relationships between pairs of features.
4. Some features show clear linear relationships while others have more complex non-linear relationships.
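5. This analysis helps identify which factors are most strongly associated with housing prices in California; because it is based on correlations, it does not by itself establish that these factors cause price differences.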