# WiDS Datathon 2026 EDA: Kaylee

## Basic Dataset

In [16]:
import pandas as pd

# Read the three competition CSV files and report the size of the train/test tables.
banner = "=" * 80
print(banner, "LOADING DATA & GETTING DATASET SHAPES", banner, sep="\n")

# Paths are relative to the directory the notebook is run from.
train = pd.read_csv('WiDSWorldWide_GlobalDathon26/train.csv')
test = pd.read_csv('WiDSWorldWide_GlobalDathon26/test.csv')
metadata = pd.read_csv('WiDSWorldWide_GlobalDathon26/metaData.csv')

# Train shape, a preview of the first training rows, then the test shape.
print(f"\nTrain shape: {train.shape}")
print("\nTraining data first five rows:")
print(train.head())
print(f"Test shape: {test.shape}")

LOADING DATA & GETTING DATASET SHAPES

Train shape: (221, 37)

Training data first five rows:
   event_id  num_perimeters_0_5h  dt_first_last_0_5h  \
0  10892457                    3            4.265188   
1  11757157                    2            1.169918   
2  11945086                    4            4.777526   
3  12044083                    1            0.000000   
4  12052347                    2            4.975273   

   low_temporal_resolution_0_5h  area_first_ha  area_growth_abs_0_5h  \
0                             0      79.696304              2.875935   
1                             0       8.946749              0.000000   
2                             0     106.482638              0.000000   
3                             1      67.631125              0.000000   
4                             0      35.632874              0.000000   

   area_growth_rel_0_5h  area_growth_rate_ha_per_h  log1p_area_first  \
0              0.036086                   0.674281          4.39

In [13]:
# Column roles: everything that is not the id, the event flag, or the time target
# is treated as a model feature.
id_col = 'event_id'
target = 'event'
time_target = 'time_to_hit_hours'
non_feature_cols = {id_col, target, time_target}
feature_cols = [name for name in train.columns if name not in non_feature_cols]

rule = '=' * 80
print(f"\n{rule}")
print("DATA SUMMARY")
print(rule)
print(f"Total features: {len(feature_cols)}")

# Class balance of the event flag (0 is labelled "Censored", 1 is labelled "Hit").
is_censored = train[target] == 0
is_hit = train[target] == 1
print("Target distribution:")
print(f"  - Event=0 (Censored): {is_censored.sum()} ({is_censored.mean()*100:.1f}%)")
print(f"  - Event=1 (Hit):      {is_hit.sum()} ({is_hit.mean()*100:.1f}%)")

# Time-target range over all rows, then mean/median restricted to rows with event == 1.
all_times = train[time_target]
print("\nTime to hit statistics:")
print(f"  - Overall range: [{all_times.min():.1f}, {all_times.max():.1f}] hours")
hits_only = train[is_hit]
hit_times = hits_only[time_target]
print(f"  - For hits (event=1): Mean={hit_times.mean():.1f}h, Median={hit_times.median():.1f}h")

# Null counts per feature column (id/target/time columns are not part of this check).
missing = train[feature_cols].isnull().sum()
features_with_missing = (missing > 0).sum()
if features_with_missing == 0:
    print("\n No missing values detected!")
else:
    print(f"\n Missing values found in {features_with_missing} features")



DATA SUMMARY
Total features: 34
Target distribution:
  - Event=0 (Censored): 152 (68.8%)
  - Event=1 (Hit):      69 (31.2%)

Time to hit statistics:
  - Overall range: [0.0, 67.0] hours
  - For hits (event=1): Mean=10.0h, Median=3.5h

 No missing values detected!


About 69% of the events in the training set are censored, and about 31% are events where the fire hits. The stated time range for the data is 72 hours, but no event in the training set has a recorded time beyond 67 hours. For events where the fire hits, the mean time to hit is 10 hours after first observation, while the median is much shorter at 3.5 hours, which suggests a right-skewed distribution. No missing values were detected in the feature columns.

## Feature Exploration

In [None]:
# Group the feature columns by the category listed for them in the metadata table.
category_map = dict(zip(metadata['column'], metadata['category']))
feature_categories = {}
for feature in feature_cols:
    # Features with no metadata entry are collected under 'unknown'.
    feature_categories.setdefault(category_map.get(feature, 'unknown'), []).append(feature)

divider = '=' * 80
print(f"\n{divider}")
print("FEATURES BY CATEGORY")
print(divider)
for category in sorted(feature_categories):
    members = feature_categories[category]
    print(f"\n{category.upper()} ({len(members)} features):")
    # List at most five names per category and summarise the remainder.
    for feature in members[:5]:
        print(f"  - {feature}")
    hidden = len(members) - 5
    if hidden > 0:
        print(f"  ... and {hidden} more")


FEATURES BY CATEGORY

CENTROID_KINEMATICS (5 features):
  - centroid_displacement_m
  - centroid_speed_m_per_h
  - spread_bearing_deg
  - spread_bearing_sin
  - spread_bearing_cos

DIRECTIONALITY (4 features):
  - alignment_cos
  - alignment_abs
  - cross_track_component
  - along_track_speed

DISTANCE (9 features):
  - dist_min_ci_0_5h
  - dist_std_ci_0_5h
  - dist_change_ci_0_5h
  - dist_slope_ci_0_5h
  - closing_speed_m_per_h
  ... and 4 more

GROWTH (10 features):
  - area_first_ha
  - area_growth_abs_0_5h
  - area_growth_rel_0_5h
  - area_growth_rate_ha_per_h
  - log1p_area_first
  ... and 5 more

TEMPORAL_COVERAGE (3 features):
  - num_perimeters_0_5h
  - dt_first_last_0_5h
  - low_temporal_resolution_0_5h

TEMPORAL_METADATA (3 features):
  - event_start_hour
  - event_start_dayofweek
  - event_start_month


## Statistical Summary 

In [18]:
# Per-feature summary statistics, extended with the percentage of exact zeros and
# the skewness, followed by a report of the features dominated by zeros.
SPARSITY_THRESHOLD_PCT = 80  # a feature is "high sparsity" above this % of zeros
MAX_SPARSE_ROWS_SHOWN = 10   # cap on the number of table rows printed

stats_summary = train[feature_cols].describe().T
stats_summary['zeros_pct'] = (train[feature_cols] == 0).sum() / len(train) * 100
stats_summary['skewness'] = train[feature_cols].skew()

print(f"\n{'='*80}")
print(f"HIGH SPARSITY FEATURES (>{SPARSITY_THRESHOLD_PCT}% zeros)")
print(f"{'='*80}")
high_zeros = (
    stats_summary[stats_summary['zeros_pct'] > SPARSITY_THRESHOLD_PCT]
    .sort_values('zeros_pct', ascending=False)
)
if len(high_zeros) > 0:
    # The table below is truncated, so state the real total first; otherwise the
    # number of printed rows can be mistaken for the number of sparse features.
    n_shown = min(len(high_zeros), MAX_SPARSE_ROWS_SHOWN)
    print(f"{len(high_zeros)} features exceed the threshold (showing top {n_shown}):")
    print(high_zeros[['mean', 'std', 'zeros_pct']].head(MAX_SPARSE_ROWS_SHOWN))
else:
    print("None found")


HIGH SPARSITY FEATURES (>80% zeros)
                                mean         std  zeros_pct
projected_advance_m        10.286955  128.652678  91.855204
closing_speed_abs_m_per_h   3.661135   26.690409  91.855204
closing_speed_m_per_h       2.021403   26.865184  91.855204
dist_change_ci_0_5h       -10.286955  128.652678  91.855204
dist_fit_r2_0_5h            0.046000    0.171690  91.402715
dist_std_ci_0_5h            8.079022   63.184352  91.402715
log1p_growth                0.389346    1.340348  89.140271
area_growth_abs_0_5h       26.332398  187.437018  88.687783
spread_bearing_sin          0.053662    0.285193  88.687783
cross_track_component       1.617188   37.789199  88.687783


At least 10 features in the dataset have high sparsity (more than 80% zero values); the table above is capped at the top 10 rows, so the full count may be higher. Further exploration is needed to understand what these zeros indicate.

## Correlation Analysis

In [19]:
# Correlation of every feature with the event flag, ranked by absolute value.
correlations = train[feature_cols].corrwith(train[target])
correlations_sorted = correlations.abs().sort_values(ascending=False)

rule = '=' * 80
print(f"\n{rule}")
print("TOP 10 FEATURES CORRELATED WITH TARGET")
print(rule)
for feat in correlations_sorted.index[:10]:
    # Print the signed value even though the ranking uses the magnitude.
    print(f"{feat:40s}: {correlations[feat]:+.4f}")

# Feature-to-feature correlations: collect every pair above the 0.8 magnitude cut-off.
print("\nSearching for highly correlated feature pairs (|r| > 0.8)...")
corr_matrix = train[feature_cols].corr()
names = list(corr_matrix.columns)
high_corr_pairs = [
    {
        'feature_1': names[row],
        'feature_2': names[col],
        'correlation': corr_matrix.iloc[row, col],
    }
    # Only the upper triangle (col > row) is visited, so each pair appears once
    # and the diagonal is skipped.
    for row in range(len(names))
    for col in range(row + 1, len(names))
    if abs(corr_matrix.iloc[row, col]) > 0.8
]

print(f"Found {len(high_corr_pairs)} highly correlated pairs")
if high_corr_pairs:
    print("\nTop 5 correlated pairs:")
    strongest_first = sorted(high_corr_pairs, key=lambda entry: abs(entry['correlation']), reverse=True)
    for pair in strongest_first[:5]:
        print(f"  {pair['feature_1']:30s} <-> {pair['feature_2']:30s}: {pair['correlation']:.3f}")


TOP 10 FEATURES CORRELATED WITH TARGET
dist_min_ci_0_5h                        : -0.4814
low_temporal_resolution_0_5h            : -0.3791
num_perimeters_0_5h                     : +0.3705
dt_first_last_0_5h                      : +0.3530
alignment_abs                           : +0.3491
spread_bearing_cos                      : -0.3232
log1p_growth                            : +0.2927
spread_bearing_deg                      : +0.2810
log_area_ratio_0_5h                     : +0.2293
radial_growth_rate_m_per_h              : +0.2150

Searching for highly correlated feature pairs (|r| > 0.8)...
Found 65 highly correlated pairs

Top 5 correlated pairs:
  area_growth_rel_0_5h           <-> relative_growth_0_5h          : 1.000
  dist_change_ci_0_5h            <-> projected_advance_m           : -1.000
  dist_change_ci_0_5h            <-> closing_speed_m_per_h         : -0.998
  closing_speed_m_per_h          <-> projected_advance_m           : 0.998
  dist_std_ci_0_5h               <-> c