# Endometrial Cancer Survival Analysis - Exploratory Data Analysis

This notebook explores the endometrial cancer dataset to understand:
- Data structure and quality
- Feature distributions
- Missing-value patterns
- Potential predictors for survival analysis


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set style
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to the figures
# created below and make 'husl' the default seaborn colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Display settings
# Allow pandas to show up to 50 columns and 100 rows before truncating output.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

# Import our modules
import sys
sys.path.insert(0, '..')
from src.data_loader import load_excel_data, get_data_summary


## 1. Load Data


In [None]:
# Read the merged workbook located one directory above the notebook and
# report how many rows (patients) and columns (features) it contains.
DATA_PATH = Path('..', 'IQ_Cancer_Endometrio_merged_NMSP.xlsx')
df = load_excel_data(DATA_PATH)

n_patients, n_features = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of patients: {n_patients}")
print(f"Number of features: {n_features}")


In [None]:
# Preview the data
# The bare expression is the cell's last statement, so the notebook renders
# the rows returned by head() as the cell output.
df.head()


In [None]:
# Print every column label preceded by its 1-based position,
# right-aligned in a three-character field.
print("All columns:")
for position, column in enumerate(df.columns, start=1):
    print(f"{position:3d}. {column}")


## 2. Data Types and Basic Statistics


In [None]:
# Count how many columns there are of each dtype
print("Data types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)
print("\n")

# Split the column labels into numeric and categorical (object/category) groups;
# both lists are reused by later cells.
numeric_frame = df.select_dtypes(include=[np.number])
categorical_frame = df.select_dtypes(include=['object', 'category'])
numeric_cols = numeric_frame.columns.tolist()
categorical_cols = categorical_frame.columns.tolist()

n_numeric = len(numeric_cols)
n_categorical = len(categorical_cols)
print(f"Numeric columns: {n_numeric}")
print(f"Categorical columns: {n_categorical}")


In [None]:
# Basic statistics for numeric columns
# describe() is called with its defaults; .T transposes the result so each
# summarised column becomes a row, which is easier to scan for wide data.
df.describe().T


## 3. Missing Values Analysis


In [None]:
# Per-column count of null cells and the same figure as a percentage of all
# rows, rounded to two decimals.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)

# One row per column, ordered from the most to the least incomplete
missing_df = (
    missing.to_frame('Missing Count')
    .assign(**{'Missing %': missing_pct})
    .sort_values('Missing %', ascending=False)
)

# Cell output: only the columns that actually have missing entries
missing_df.loc[missing_df['Missing Count'] > 0]


In [None]:
# Two side-by-side panels: the most incomplete columns (left) and the overall
# spread of missing percentages (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
bar_ax, hist_ax = axes

# Left panel: up to 20 columns with the highest missing percentage
has_missing = missing_df['Missing Count'] > 0
top_missing = missing_df[has_missing].head(20)
if not top_missing.empty:
    bar_ax.barh(top_missing.index, top_missing['Missing %'])
    bar_ax.set(xlabel='Missing %', title='Top 20 Columns with Missing Values')
    # Flip the axis so the first (most incomplete) column is drawn at the top
    bar_ax.invert_yaxis()

# Right panel: histogram over the columns that have any missing values
nonzero_pct = missing_pct[missing_pct > 0]
hist_ax.hist(nonzero_pct, bins=20, edgecolor='black')
hist_ax.set(
    xlabel='Missing %',
    ylabel='Number of Columns',
    title='Distribution of Missing Values',
)

plt.tight_layout()
plt.show()


## 4. Target Variables (Outcomes)


In [None]:
# Look for potential outcome/event and follow-up time columns by matching
# keywords (English and Spanish) against the column labels.
outcome_keywords = ['recidiva', 'death', 'survival', 'status', 'event', 'muerte', 'fallec', 'exitus']
time_keywords = ['time', 'tiempo', 'survival', 'follow', 'seguimiento', 'meses', 'dias']


def _matches_any(column, keywords):
    """Return True when any keyword occurs in the lower-cased column label."""
    # str() keeps the search working for non-string labels (e.g. integers)
    label = str(column).lower()
    return any(kw in label for kw in keywords)


print("Potential outcome columns:")
for col in df.columns:
    if _matches_any(col, outcome_keywords):
        # Show at most the first 10 distinct values as a quick preview
        print(f"  - {col}: {df[col].unique()[:10]}")

print("\nPotential time columns:")
for col in df.columns:
    if not _matches_any(col, time_keywords):
        continue
    series = df[col]
    if pd.api.types.is_numeric_dtype(series):
        print(f"  - {col}: min={series.min()}, max={series.max()}, mean={series.mean():.2f}")
    else:
        # A name match does not guarantee numbers: date or text columns would
        # make min/max/mean or the ':.2f' format raise, so preview them instead.
        print(f"  - {col}: non-numeric ({series.dtype}), sample={series.dropna().unique()[:5]}")


In [None]:
# Analyze 'recidiva' (recurrence) if it exists
if 'recidiva' in df.columns:
    # value_counts() orders by frequency, so fixed ['No Recurrence', 'Recurrence']
    # labels would be swapped whenever recurrences outnumber non-recurrences.
    # Sorting by class value and deriving labels/colours from the index keeps
    # every bar paired with its own class.
    recidiva_counts = df['recidiva'].value_counts().sort_index()

    # assumes 0 = no recurrence and 1 = recurrence, as the mean()-based rate
    # below already does — TODO confirm against the data dictionary
    class_labels = {0: 'No Recurrence', 1: 'Recurrence'}
    class_colors = {0: '#2ecc71', 1: '#e74c3c'}
    fallback_color = '#95a5a6'  # neutral grey for any unexpected category

    print("Recidiva (Recurrence) distribution:")
    print(recidiva_counts)
    print(f"\nRecurrence rate: {df['recidiva'].mean()*100:.1f}%")

    bar_colors = [class_colors.get(value, fallback_color) for value in recidiva_counts.index]
    # Unknown categories are labelled with their raw value instead of raising
    # on a tick/label count mismatch.
    tick_labels = [class_labels.get(value, str(value)) for value in recidiva_counts.index]

    fig, ax = plt.subplots(figsize=(6, 4))
    recidiva_counts.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_xlabel('Recidiva')
    ax.set_ylabel('Count')
    ax.set_title('Recurrence Distribution')
    ax.set_xticklabels(tick_labels, rotation=0)
    plt.tight_layout()
    plt.show()


## 5. Feature Distributions


In [None]:
# Numeric columns worth plotting: more than two distinct values and more than
# 50 non-null observations.
viz_cols = [
    col
    for col in numeric_cols
    if df[col].nunique() > 2 and df[col].notna().sum() > 50
]

# Histogram grid (3 x 4) for at most the first 12 selected features
plotted_cols = viz_cols[:12]
n_plots = len(plotted_cols)
fig, axes = plt.subplots(3, 4, figsize=(16, 10))
axes = axes.flatten()

for panel, col in zip(axes, plotted_cols):
    observed = df[col].dropna()
    panel.hist(observed, bins=30, edgecolor='black', alpha=0.7)
    # Titles are cut to 25 characters
    panel.set_title(col[:25], fontsize=10)
    panel.tick_params(labelsize=8)

# Hide the grid cells that received no histogram
for panel in axes[n_plots:]:
    panel.set_visible(False)

fig.suptitle('Distribution of Numeric Features', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()


## 6. Correlation Analysis


In [None]:
# Restrict the correlation analysis to numeric columns whose missing share is
# below 30%, keeping the ordering of missing_df.
below_threshold = missing_df['Missing %'] < 30
low_missing_cols = missing_df.loc[below_threshold].index.tolist()
corr_cols = [col for col in low_missing_cols if col in numeric_cols]

n_corr_cols = len(corr_cols)
print(f"Analyzing correlations for {n_corr_cols} columns with <30% missing")


In [None]:
# Rank numeric features by absolute correlation with recidiva (if it exists)
if 'recidiva' in df.columns and len(corr_cols) > 0:
    # Exclude the outcome from its own ranking: corr(recidiva, recidiva) is
    # trivially 1.0 and would always take the top slot.
    feature_cols = [col for col in corr_cols if col != 'recidiva']

    correlations = (
        df[feature_cols]
        .corrwith(df['recidiva'])
        .abs()
        # Drop features whose correlation came out as NaN; they carry no
        # ranking information and would show up as blank bars.
        .dropna()
        .sort_values(ascending=False)
    )

    print("Top 20 features correlated with Recidiva:")
    print(correlations.head(20))

    # Plot top correlations
    fig, ax = plt.subplots(figsize=(10, 8))
    top_corr = correlations.head(20)
    ax.barh(range(len(top_corr)), top_corr.values)
    ax.set_yticks(range(len(top_corr)))
    ax.set_yticklabels(top_corr.index)
    ax.set_xlabel('Absolute Correlation')
    ax.set_title('Top 20 Features Correlated with Recidiva')
    # Strongest correlation at the top of the chart
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()


In [None]:
# Pairwise correlation heatmap of the 15 highest-ranked entries in
# `correlations`; the guard repeats the condition under which that ranking
# was computed.
if 'recidiva' in df.columns and len(corr_cols) > 0:
    top_features = correlations.index[:15].tolist()
    corr_matrix = df[top_features].corr()

    # Diverging colour map centred on zero, with each cell annotated to 2 decimals
    heatmap_options = {
        'annot': True,
        'fmt': '.2f',
        'cmap': 'RdBu_r',
        'center': 0,
    }
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
    ax.set_title('Correlation Heatmap - Top Features')
    plt.tight_layout()
    plt.show()


## 7. Summary and Recommendations


In [None]:
# Print a plain-text recap of the dataset, its missingness, the outcome
# (when present) and the suggested next steps.
rule = "=" * 60
missing_share = missing_df['Missing %']

print(rule)
print("DATASET SUMMARY")
print(rule)
print(f"\nTotal patients: {len(df)}")
print(f"Total features: {len(df.columns)}")
print(f"  - Numeric: {len(numeric_cols)}")
print(f"  - Categorical: {len(categorical_cols)}")

print("\nMissing values:")
print(f"  - Columns with >50% missing: {int((missing_share > 50).sum())}")
print(f"  - Columns with 0% missing: {int((missing_share == 0).sum())}")

if 'recidiva' in df.columns:
    outcome = df['recidiva']
    # mean() skips missing outcomes, so the event count must be reported over
    # the same non-missing patients; dividing by len(df) would contradict the
    # printed rate whenever some outcomes are unknown.
    n_observed = int(outcome.notna().sum())
    n_unknown = len(df) - n_observed

    print("\nOutcome (Recidiva):")
    print(f"  - Recurrence rate: {outcome.mean()*100:.1f}%")
    print(f"  - Events: {int(outcome.sum())} / {n_observed}")
    if n_unknown > 0:
        print(f"  - Missing outcome: {n_unknown}")

print("\n" + rule)
print("RECOMMENDATIONS")
print(rule)
print("\n1. Consider removing columns with >50% missing values")
print("2. Identify the survival time column for Cox PH analysis")
print("3. Focus on features with highest correlation to outcome")
print("4. Handle categorical variables with encoding")
