In [None]:
# eda.py

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# === STEP 1: LOAD CLEANED DATA ===
# Read the cleaned dataset; the path is relative to the current working directory.
input_path = "../data/cleaned_data_mvg.csv"
df = pd.read_csv(input_path)
print(f"✅ Data loaded — Rows: {len(df)}, Columns: {df.shape[1]}")

# Every plotting step below writes a PNG into ../figures. savefig does not
# create missing directories, so make sure the folder exists before plotting.
os.makedirs("../figures", exist_ok=True)

# === STEP 2: BASIC INFO ===
# Quick structural overview: column dtypes, per-column null counts,
# number of fully duplicated rows, and a preview of the first rows.
column_types = df.dtypes
print("\n📌 Data Types:", column_types, sep="\n")

nulls_per_column = df.isna().sum()
print("\n📌 Missing Values:", nulls_per_column, sep="\n")

duplicate_count = df.duplicated().sum()
print(f"\n📌 Duplicate Rows: {duplicate_count}")

preview = df.head(5)
print("\n📌 First 5 rows:", preview, sep="\n")

# === STEP 3: DISTRIBUTION OF PRICE ===
# Histogram (with KDE overlay) of listing prices strictly below 1.5M;
# rows at or above that threshold are excluded from this plot only.
below_cap = df['price'] < 1_500_000
capped_prices = df.loc[below_cap, 'price']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(capped_prices, bins=50, kde=True, color='steelblue', ax=ax)
ax.set_title("Distribution of Property Prices (< €1.5M)")
ax.set_xlabel("Price (€)")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig("../figures/eda_price_distribution.png")
plt.show()

# === STEP 4: PRICE PER M² BY REGION ===
# One box per value of the 'region' column, showing the spread of
# 'price_square_meter' within that region.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='region', y='price_square_meter', data=df, ax=ax)
ax.set_title("Price per m² by Region")
ax.set_ylabel("€/m²")
fig.tight_layout()
fig.savefig("../figures/eda_price_per_m2_by_region.png")
plt.show()

# === STEP 5: CORRELATION HEATMAP (NUMERIC FEATURES) ===
# Restrict to float64/int64 columns, then compute their pairwise correlations.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
numeric_features = numeric_df.columns
corr = numeric_df.corr()

# Colour scale is pinned to [-1, 1] so the diverging palette is centred on 0.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Heatmap (Numerical Features)")
fig.tight_layout()
fig.savefig("../figures/eda_correlation_heatmap.png")
plt.show()

# === STEP 6: PRICE PER M² BY PROPERTY SUBTYPE ===
# Mean 'price_square_meter' per 'subtype', ordered from lowest to highest
# so the bars rise left to right.
mean_by_subtype = df.groupby('subtype')['price_square_meter'].mean()
subtype_avg = mean_by_subtype.sort_values()

plt.figure(figsize=(12, 6))
sns.barplot(x=subtype_avg.index, y=subtype_avg.values)
plt.title("Average Price per m² by Subtype")
plt.ylabel("Average €/m²")
# Rotate the category labels vertically.
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig("../figures/eda_price_per_m2_by_subtype.png")
plt.show()

# === STEP 7: APARTMENT VS HOUSE COUNT ===
# Pie chart of how many rows fall under each value of the 'type' column.
plt.figure(figsize=(6, 6))
type_counts = df['type'].value_counts()
# value_counts() orders categories by descending frequency, so the slice
# labels must come from its index; a hard-coded label list would mislabel the
# slices whenever the frequency order differs from the assumed one.
type_labels = [str(value) for value in type_counts.index]
type_counts.plot.pie(autopct='%1.1f%%', labels=type_labels, colors=['gold', 'skyblue'])
plt.title("Distribution of Property Types")
# Suppress the default y-axis label that pandas adds to pie charts.
plt.ylabel("")
plt.tight_layout()
plt.savefig("../figures/eda_type_distribution.png")
plt.show()

print("\n📊 EDA complete. Visuals saved to ../figures/")


: 