# Goal & setup
**Goal:** Load data, check quality, inspect distributions/outliers, and persist clean artifacts (interim data + plots).

# 0. Kernel & imports

In [None]:
%matplotlib inline
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler


# project helpers
from config import RAW_DATA_DIR, INTERIM_DATA_DIR  # keep your current layout
from dataset import (
    load_competition_raw,
    load_original_raw,
    build_competition_tables,
    build_original_table
)

# Plot theme and DataFrame display width
sns.set(style="whitegrid")
pd.options.display.max_columns = 120

# Figures go to ../reports/figures, resolved against the current working
# directory; the folder (and any missing parents) is created if needed.
FIG_DIR = Path("..") / "reports" / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)


# 1. Load datasets

In [None]:
# Competition data: the loader's three return values are unpacked as
# train features, train labels and test features (in that order).
df_comp_train, df_comp_labels, df_comp_test = load_competition_raw(RAW_DATA_DIR)

# Original auxiliary tables (optional for cross-checking or enrichment).
# NOTE(review): df_og_damage is loaded here but not passed to
# build_original_table below — confirm that this is intentional.
df_og_damage, df_og_struct, df_og_owner = load_original_raw(RAW_DATA_DIR)

# Build one analysis table from train + labels. df_comp_test is rebound to
# whatever build_competition_tables returns for the test set.
df_comp, df_comp_test = build_competition_tables(df_comp_train, df_comp_labels, df_comp_test)

# Build the combined original dataset from the structure and ownership tables
df_og = build_original_table(df_og_struct, df_og_owner)

# Show the first rows of the combined competition table as the cell output
df_comp.head()

# 3. Compare columns of original and competition datasets

### 3.1 Column overview

In [None]:
# Print the column names of each dataset under its own heading
for heading, frame in (
    ("Original dataset – columns:", df_og),
    ("\nCompetition dataset – columns:", df_comp_train),
):
    print(heading)
    print(frame.columns.tolist())

### 3.2 Preview of the first rows


In [None]:
# Show the first rows of each dataset under its own heading
for heading, frame in (
    ("\nPreview of original data:", df_og),
    ("\nPreview of competition data (train):", df_comp_train),
):
    print(heading)
    display(frame.head())

### 3.3 Column comparison


In [None]:
# Column names of each dataset as sets
original_columns = set(df_og.columns)
competition_columns = set(df_comp_train.columns)

# Set differences in both directions
missing_in_competition = original_columns.difference(competition_columns)
missing_in_original = competition_columns.difference(original_columns)

# Report each difference alphabetically, or "None" when it is empty
for caption, only_here in (
    ("\nColumns only in the original dataset:", missing_in_competition),
    ("\nColumns only in the competition dataset:", missing_in_original),
):
    print(caption)
    print(sorted(only_here) if only_here else "None")

### 3.4 Dataset shape comparison

In [None]:

print("\nDataset shapes:")
# One line per dataset: "<label>: <rows> rows, <columns> columns"
for label, frame in (
    ("Original", df_og),
    ("Competition (train)", df_comp_train),
    ("Competition (test)", df_comp_test),
):
    n_rows, n_cols = frame.shape
    print(f"{label}: {n_rows} rows, {n_cols} columns")

# 4. Check for Missing Values


In [None]:
def check_missing_values(df: pd.DataFrame, name: str) -> pd.Series:
    """
    Report and return the number of missing values per column.

    Prints a one-line message for the dataset and, when at least one column
    has missing values, displays the per-column counts.

    Args:
        df (pd.DataFrame): Dataset to check.
        name (str): Name of the dataset (used in the printed messages).

    Returns:
        pd.Series: Missing-value counts indexed by column name, limited to
        columns with at least one missing value and sorted in descending
        order. The Series is empty when nothing is missing.
    """
    missing = df.isnull().sum()
    # Keep only the columns that actually contain missing values
    missing = missing[missing > 0].sort_values(ascending=False)

    if missing.empty:
        print(f"No missing values found in {name}.")
    else:
        print(f"Missing values in {name}:")
        display(missing)

    # Return the counts so the declared return type holds and callers can
    # reuse the result instead of only reading the printed report.
    return missing


In [None]:
# Run the missing-value report for both datasets
for frame, label in (
    (df_og, "Original dataset"),
    (df_comp_train, "Competition train dataset"),
):
    check_missing_values(frame, label)

# 5. Check for Duplicates

### 5.1 Competition dataset

In [None]:
# Flag rows that repeat an earlier row, then report size and duplicate count
comp_dup_mask = df_comp_train.duplicated()
print("Before:", df_comp_train.shape)
print("Number of duplicates:", comp_dup_mask.sum())

# Keep only the first occurrence of each row
df_comp_train = df_comp_train[~comp_dup_mask]

print("After:", df_comp_train.shape)

### 5.2 Original Dataset

In [None]:
# Flag rows that repeat an earlier row, then report size and duplicate count
og_dup_mask = df_og.duplicated()
print("Before:", df_og.shape)
print("Number of duplicates:", og_dup_mask.sum())

# Keep only the first occurrence of each row
df_og = df_og[~og_dup_mask]

print("After:", df_og.shape)

# 6. Outlier Check

### 6.1 Check outliers in numeric features

In [None]:
# Columns included in the outlier overview
numeric_columns = [
    "geo_level_1_id",
    "geo_level_2_id",
    "geo_level_3_id",
    "age",
    "area_percentage",
    "height_percentage",
    "count_floors_pre_eq",
    "count_families",
]

# Min-max scale the selected columns so all boxes share one axis
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_comp_train[numeric_columns])
df_normalized = pd.DataFrame(scaled_values, columns=numeric_columns)

# Horizontal boxplot, one box per feature
sns.set(style="whitegrid")
plt.figure(figsize=(14, 6))
ax = sns.boxplot(data=df_normalized, orient="h")
ax.set_title("Normalized Numeric Attributes — Outlier Check", fontsize=14)
ax.set_xlabel("Normalized Value")
ax.set_ylabel("Features")
plt.tight_layout()

# Write the figure to disk before showing it
fig_path = FIG_DIR / "boxplot_numeric_features.png"
plt.savefig(fig_path, dpi=300)
plt.show()


### 6.2 Check outliers in specific columns

In [None]:
# Single-column boxplot of the raw (unscaled) building age
df_comp_train["age"].plot(kind="box", title="Age of Building")

# Save figure
fig_path = FIG_DIR / "boxplot_age.png"
plt.savefig(fig_path, dpi=300)

# plt.show has to be called; a bare `plt.show` only references the function
# and makes the cell echo the function object instead of showing the figure.
plt.show()

# The same single-column boxplot can be drawn for geo_level_1_id,
# geo_level_2_id, geo_level_3_id, count_floors_pre_eq (number of floors),
# area_percentage (footprint area) and height_percentage (height) by swapping
# the column name and title above.

### 6.3 Check outliers in age column

In [None]:
# Mean and standard deviation of the age column
age_series = df_comp_train["age"]
mean_age = age_series.mean()
std_age = age_series.std()

# Upper cut-off: three standard deviations above the mean
outlier_threshold = mean_age + 3 * std_age

# Rows whose age lies above the cut-off
outliers = df_comp_train[age_series > outlier_threshold]

# Scatter all ages by row index, overlay the outliers and draw the cut-off
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_comp_train.index, age_series, label="Data", alpha=0.6)
ax.scatter(outliers.index, outliers["age"], color="r", label="Outliers", alpha=0.8)
ax.axhline(y=outlier_threshold, color="g", linestyle="--", label="Outlier threshold")
ax.set_title("Outlier Detection — Age", fontsize=14)
ax.set_xlabel("Index")
ax.set_ylabel("Building Age")
ax.legend()
plt.tight_layout()

# Write the figure to disk before showing it
fig_path = FIG_DIR / "outlier_age.png"
plt.savefig(fig_path, dpi=300)
plt.show()

In [None]:
# Rows whose age reaches the extreme value (e.g., 995 years)
extreme_value = 995
is_extreme = df_comp_train["age"].ge(extreme_value)
extreme_cases = df_comp_train.loc[is_extreme]

# Summary: counts above the threshold from the previous cell and at/above
# the extreme value
print(f"Number of outliers above {outlier_threshold:.2f}: {len(outliers)}")
print(f"Number of extreme values (>= {extreme_value}): {len(extreme_cases)}")
extreme_cases.head()

# 7. Descriptive Statistics

In [None]:
# Summary statistics of the competition training features, produced by
# pandas' describe() with default settings and shown as the cell output.
df_comp_train.describe()

# 8. Histogram of features

In [None]:
# One histogram per column that DataFrame.hist can plot, 20 bins each
df_comp_train.hist(bins=20, figsize=(24, 16))

# Write the figure to disk before showing it
fig_path = FIG_DIR.joinpath("histogram_features.png")
plt.savefig(fig_path, dpi=300)
plt.show()


# 9. Correlation matrix of numeric features

In [None]:
# Restrict to float64 / int64 columns
numeric_columns = df_comp_train.select_dtypes(include=["float64", "int64"]).columns

# Pairwise correlations between those columns
corr_matrix = df_comp_train.loc[:, numeric_columns].corr()

# Annotated heatmap of the correlation matrix
heatmap_options = {
    "annot": True,
    "cmap": "coolwarm",
    "fmt": ".2f",
    "linewidths": 0.5,
    "cbar_kws": {"shrink": 0.8},
}
plt.figure(figsize=(18, 14))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix — Numeric Features", fontsize=16, pad=15)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()

# Write the figure to disk before showing it
fig_path = FIG_DIR / "correlation_matrix.png"
plt.savefig(fig_path, dpi=300)
plt.show()


# 10. Distribution of target variable

In [None]:
# Frequency of each damage grade, ordered by grade, as a two-column table
grade_counts = df_comp_labels["damage_grade"].value_counts().sort_index()
df_damage_count = grade_counts.rename_axis("damage_grade").reset_index(name="count")
print(df_damage_count)

# Bar chart with one bar per damage grade
barplot_options = {
    "x": "damage_grade",
    "y": "count",
    "hue": "damage_grade",
    "legend": False,
    "palette": "Blues_d",
}
plt.figure(figsize=(8, 5))
sns.barplot(data=df_damage_count, **barplot_options)

# Title and axis labels
plt.title("Distribution of Damage Grades", fontsize=14, pad=10)
plt.xlabel("Damage Grade")
plt.ylabel("Count")
plt.tight_layout()

# Write the figure to disk before showing it
fig_path = FIG_DIR / "dist_target_var.png"
plt.savefig(fig_path, dpi=300)
plt.show()

# 11. Unique values for each column


In [None]:
def unique_values_per_attribute(df: pd.DataFrame, max_values: int = 10) -> dict:
    """
    Get unique values for each column in a DataFrame (truncated for readability).

    Args:
        df (pd.DataFrame): Input DataFrame.
        max_values (int): Maximum number of values to display per column.

    Returns:
        dict: Dictionary mapping column names to their unique values (or a preview of them).
    """
    unique_dict = {}
    for col in df.columns:
        values = df[col].unique()
        # Truncate long value lists for cleaner output
        if len(values) > max_values:
            preview = values[:max_values]
            unique_dict[col] = f"{preview} ... ({len(values)} unique values)"
        else:
            unique_dict[col] = values
    return unique_dict

# Summarise the distinct values of the competition training features
unique_values = unique_values_per_attribute(df_comp_train)

# One "<column>: <values>" line per column
for column_name, distinct in unique_values.items():
    print(f"{column_name}: {distinct}")