# WiDS Datathon 2026 EDA: Kaylee

## Basic Dataset

In [11]:
import pandas as pd

# Load the competition CSVs and report the train/test shapes.
# DATA_DIR is the single place to edit if the data folder is moved or renamed.
DATA_DIR = 'WiDSWorldWide_GlobalDathon26'

print("=" * 80)
print("LOADING DATA & GETTING DATASET SHAPES")
print("=" * 80)
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
# The metadata table's 'column' and 'category' fields are used later in the
# notebook to group the feature columns by category.
metadata = pd.read_csv(f'{DATA_DIR}/metaData.csv')

print(f"\nTrain shape: {train.shape}")
print(f"Test shape: {test.shape}")

LOADING DATA & GETTING DATASET SHAPES

Train shape: (221, 37)
Test shape: (95, 35)


In [13]:
# Summarise the training set: feature count, event balance, time-to-hit
# statistics, and a missing-value check over the feature columns.

# Column roles: everything that is not the id or one of the two targets
# is treated as a model feature.
target = 'event'
time_target = 'time_to_hit_hours'
id_col = 'event_id'
non_feature_cols = {id_col, target, time_target}
feature_cols = [name for name in train.columns if name not in non_feature_cols]

# Boolean masks over the event indicator (0 = censored, 1 = hit).
is_censored = train[target] == 0
is_hit = train[target] == 1

print(f"\n{'='*80}")
print("DATA SUMMARY")
print(f"{'='*80}")
print(f"Total features: {len(feature_cols)}")
print(f"Target distribution:")
print(f"  - Event=0 (Censored): {is_censored.sum()} ({is_censored.mean()*100:.1f}%)")
print(f"  - Event=1 (Hit):      {is_hit.sum()} ({is_hit.mean()*100:.1f}%)")

# The overall range covers every row; mean/median are restricted to hits.
all_times = train[time_target]
hits_only = train[is_hit]
hit_times = hits_only[time_target]

print(f"\nTime to hit statistics:")
print(f"  - Overall range: [{all_times.min():.1f}, {all_times.max():.1f}] hours")
print(f"  - For hits (event=1): Mean={hit_times.mean():.1f}h, Median={hit_times.median():.1f}h")

# Per-feature null counts; any positive count means that feature has gaps.
missing = train[feature_cols].isnull().sum()
features_with_gaps = (missing > 0).sum()
if features_with_gaps:
    print(f"\n Missing values found in {features_with_gaps} features")
else:
    print(f"\n No missing values detected!")



DATA SUMMARY
Total features: 34
Target distribution:
  - Event=0 (Censored): 152 (68.8%)
  - Event=1 (Hit):      69 (31.2%)

Time to hit statistics:
  - Overall range: [0.0, 67.0] hours
  - For hits (event=1): Mean=10.0h, Median=3.5h

 No missing values detected!


About 69% of the events in the training set are censored, and about 31% are events where the fire hits. The stated observation window is 72 hours, but no recorded time in the training set exceeds 67 hours. For events where the fire hits, the mean time to hit is 10.0 hours after the first observation, while the median is much shorter at 3.5 hours, so the distribution of hit times is right-skewed. No missing values were found in the training feature columns.

## Feature Exploration

In [14]:
# Group the feature columns by the category the metadata table assigns to
# them, then print a truncated listing of each group.
MAX_COLS_SHOWN = 5  # column names listed per category before truncating

category_map = dict(zip(metadata['column'], metadata['category']))
feature_categories = {}
for col in feature_cols:
    cat = category_map.get(col, 'unknown')
    # pandas reads an empty CSV cell as NaN (a float) by default. File any
    # non-string category under 'unknown' so that sorted() and .upper()
    # below only ever see strings.
    if not isinstance(cat, str):
        cat = 'unknown'
    feature_categories.setdefault(cat, []).append(col)

print(f"\n{'='*80}")
print("FEATURES BY CATEGORY")
print(f"{'='*80}")
for cat, cols in sorted(feature_categories.items()):
    print(f"\n{cat.upper()} ({len(cols)} features):")
    for col in cols[:MAX_COLS_SHOWN]:
        print(f"  - {col}")
    # Report how many names were left out of the listing, if any.
    hidden_count = len(cols) - MAX_COLS_SHOWN
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more")


FEATURES BY CATEGORY

CENTROID_KINEMATICS (5 features):
  - centroid_displacement_m
  - centroid_speed_m_per_h
  - spread_bearing_deg
  - spread_bearing_sin
  - spread_bearing_cos

DIRECTIONALITY (4 features):
  - alignment_cos
  - alignment_abs
  - cross_track_component
  - along_track_speed

DISTANCE (9 features):
  - dist_min_ci_0_5h
  - dist_std_ci_0_5h
  - dist_change_ci_0_5h
  - dist_slope_ci_0_5h
  - closing_speed_m_per_h
  ... and 4 more

GROWTH (10 features):
  - area_first_ha
  - area_growth_abs_0_5h
  - area_growth_rel_0_5h
  - area_growth_rate_ha_per_h
  - log1p_area_first
  ... and 5 more

TEMPORAL_COVERAGE (3 features):
  - num_perimeters_0_5h
  - dt_first_last_0_5h
  - low_temporal_resolution_0_5h

TEMPORAL_METADATA (3 features):
  - event_start_hour
  - event_start_dayofweek
  - event_start_month
