# Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis (EDA) on the synthetic urine metabolomics dataset.

⚠️ **ETHICS NOTE**: This notebook uses synthetic data for demonstration. Real patient data must never be uploaded to public repositories.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress every warning category so library notices do not clutter cell output.
warnings.simplefilter('ignore')

# Default seaborn theme and figure size for plots created after this cell.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})


In [None]:
# Load data (the path is resolved relative to the current working directory)
data_path = Path('../data/synthetic/synthetic_urine_metabolomics.csv')
df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")

# Show only the leading columns and summarise the rest as a count. The count is
# derived from the actual preview length so that a frame with 15 or fewer
# columns never reports a negative or zero "remaining" figure.
preview_limit = 15
preview_columns = list(df.columns[:preview_limit])
remaining_columns = len(df.columns) - len(preview_columns)
if remaining_columns > 0:
    print(f"\nColumns: {preview_columns}... (and {remaining_columns} metabolite columns)")
else:
    print(f"\nColumns: {preview_columns}")


In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n" + "="*50)
print("\nFirst few rows:")
# Left as the last expression of the cell so the notebook renders the preview.
df.head()


In [None]:
# Diagnosis label distribution: bar chart of counts next to a pie chart of shares
diagnosis_counts = df['diagnosis_label'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: absolute number of rows per diagnosis label.
bar_ax.bar(diagnosis_counts.index, diagnosis_counts.values)
bar_ax.set(
    title='Diagnosis Label Distribution',
    xlabel='Diagnosis',
    ylabel='Count',
)
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: the same counts as percentages of the total.
diagnosis_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%')
pie_ax.set(title='Diagnosis Label Proportions', ylabel='')

fig.tight_layout()
plt.show()

print(diagnosis_counts)
