In [2]:
import pandas as pd
import numpy as np

# Step 1: Read the raw Melbourne housing CSV into a DataFrame
# NOTE(review): absolute path, presumably a Colab runtime location; adjust when running elsewhere.
file_path = '/content/melbourne_housing_raw.csv'
melbourne_data = pd.read_csv(filepath_or_buffer=file_path)

# Step 2: Absolute pairwise correlations between the numeric columns
# Non-numeric columns are left out before correlating.
numeric_data = melbourne_data.select_dtypes(include="number")

# abs() drops the sign so strong negative and strong positive correlations are treated alike.
correlation_matrix = numeric_data.corr().abs()

# to_string() renders every row and column instead of pandas' truncated repr.
print("Correlation Matrix (absolute values):", correlation_matrix.to_string(), sep="\n")

# Step 3: Identify Highly Correlated Features
# Cut-off above which two columns are treated as redundant. Defined once so the
# comparison and the printed message below cannot drift apart.
CORRELATION_THRESHOLD = 0.85

# Keep only the entries strictly above the diagonal (k=1): each pair of columns
# appears once, and the diagonal and lower triangle are masked out as NaN.
upper_triangle = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
)

# A column is flagged when it exceeds the threshold against any column that
# precedes it, so the earlier member of each correlated pair is the one kept.
# Masked (NaN) entries compare as False and therefore never flag a column.
# NOTE(review): every numeric column is a candidate here, including a possible
# modelling target such as Price; confirm that is intended.
exceeds_threshold = upper_triangle.gt(CORRELATION_THRESHOLD).any(axis=0)
high_corr_features = exceeds_threshold[exceeds_threshold].index.tolist()

# Print the highly correlated features that will be removed
print(f"\nHighly correlated features with correlation > {CORRELATION_THRESHOLD} that are removed:")
for feature in high_corr_features:
    print(feature)

# Step 4: Build the reduced dataset without the flagged columns
# drop() returns a new frame, so melbourne_data itself is left unchanged.
filtered_data = melbourne_data.drop(labels=high_corr_features, axis="columns")

# Report the (rows, columns) of the reduced dataset
print(
    "\nShape of the dataset after removing highly correlated features:",
    filtered_data.shape,
)


Correlation Matrix (absolute values):
                  Rooms     Price  Distance  Postcode  Bedroom2  Bathroom       Car  Landsize  BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount
Rooms          1.000000  0.465238  0.271511  0.085890  0.946755  0.611826  0.393878  0.037402      0.156229   0.012749   0.004872    0.103235       0.071677
Price          0.465238  1.000000  0.211384  0.044950  0.430275  0.429878  0.201803  0.032748      0.100754   0.333306   0.215607    0.197874       0.059017
Distance       0.271511  0.211384  1.000000  0.481566  0.269524  0.126201  0.241835  0.060862      0.076301   0.323059   0.100417    0.200946       0.018140
Postcode       0.085890  0.044950  0.481566  1.000000  0.089292  0.120080  0.067886  0.040664      0.042437   0.089805   0.231027    0.362895       0.017108
Bedroom2       0.946755  0.430275  0.269524  0.089292  1.000000  0.614892  0.388491  0.037019      0.154157   0.002022   0.003447    0.106164       0.053451
Bathroom       0.611