In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Make the repository root importable so `config` and the project package
# resolve when the notebook is executed from its own directory.
# The root is taken to be two levels above the current working directory.
project_root = Path.cwd().parents[1]
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from config import DATA_DIR_GOLD, LISTINGS_GOLD, NUMERIC_FEATURES, CATEGORICAL_FEATURES, TARGET_VARIABLE
from elferspot_listings.utils.helpers import load_data

# Apply seaborn's 'whitegrid' theme globally so the plots in this notebook share one look.
sns.set_theme(style='whitegrid')

In [None]:
# Locate the gold dataset: take the lexicographically last `listings_gold*.xlsx`
# export in DATA_DIR_GOLD, falling back to the configured LISTINGS_GOLD path
# when no such export exists.
gold_files = sorted(DATA_DIR_GOLD.glob('listings_gold*.xlsx'))
if gold_files:
    gold_path = gold_files[-1]
else:
    gold_path = LISTINGS_GOLD

# Fail early with a clear message instead of a loader-specific error.
if not gold_path.exists():
    raise FileNotFoundError(f'Gold dataset missing at {gold_path}')

df_gold = load_data(gold_path)
print(f'Loaded {len(df_gold):,} rows from {gold_path.name}')
df_gold.head()

In [None]:
# Build the numeric design matrix used by the multicollinearity checks below.
# Only configured features that are actually present in the gold data are kept.
numeric_cols = [col for col in NUMERIC_FEATURES if col in df_gold.columns]
categorical_cols = [col for col in CATEGORICAL_FEATURES if col in df_gold.columns]
feature_cols = numeric_cols + categorical_cols

df_features = df_gold[feature_cols].copy()
for col in numeric_cols:
    # Coerce first, then impute: the median must be computed on the coerced
    # values. Calling .median() on the raw column fails for object-dtype
    # columns that contain non-numeric strings, which is exactly the case
    # errors='coerce' is meant to handle.
    coerced = pd.to_numeric(df_features[col], errors='coerce')
    df_features[col] = coerced.fillna(coerced.median())

if categorical_cols:
    # dtype=float keeps the matrix purely numeric. Recent pandas versions emit
    # bool dummies by default, which turns the combined matrix into an object
    # array and breaks the regression-based VIF computation.
    # drop_first=True removes one level per feature to avoid the dummy trap.
    df_dummies = pd.get_dummies(df_features[categorical_cols], drop_first=True, dtype=float)
else:
    df_dummies = pd.DataFrame(index=df_features.index)

# Rows that still contain NaN (e.g. a numeric column with no valid values at
# all, whose median is NaN) are dropped.
design_matrix = pd.concat([df_features[numeric_cols], df_dummies], axis=1).dropna()
print(f'Design matrix shape: {design_matrix.shape}')
design_matrix.head()

In [None]:
# Variance inflation factor per feature, sorted from most to least collinear.
# Explicit float cast: guards against bool/object columns in the design matrix,
# which the underlying least-squares fit cannot handle.
values = design_matrix.to_numpy(dtype=float)

# variance_inflation_factor regresses each column on all the others and does
# NOT add an intercept itself. Without a constant column the result is the
# uncentered VIF, which is inflated for any feature with a non-zero mean.
# Prepend a column of ones and skip it when reporting.
exog = np.column_stack([np.ones(len(values)), values])

vif_data = [
    {'feature': column, 'vif': variance_inflation_factor(exog, idx)}
    # start=1 because index 0 of `exog` is the intercept column.
    for idx, column in enumerate(design_matrix.columns, start=1)
]

# Explicit columns keep sort_values working even when there are no features.
vif_df = pd.DataFrame(vif_data, columns=['feature', 'vif']).sort_values('vif', ascending=False)
vif_df.head(20)

In [None]:
# Pairwise correlation between all design-matrix columns, drawn as a heatmap
# with a diverging palette centred on zero.
corr_matrix = design_matrix.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    center=0,
    square=False,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap')
plt.show()