In [1]:
import pandas as pd
from typing import Tuple, Optional

## Step 1: Load and clean tower data

In [2]:
GLOBAL_PATH="/home/jose/DATA_WEATHER_ORNL/data"

# Specify towers and variables of interest
towers_of_interest = ['TOWA', 'TOWB', 'TOWD', 'TOWF', 'TOWS', 'TOWY']
vars = ['TempC',
        'RelHum', 'AbsHum', 
        'WSpdMph', 'PkWSpdMph', 'VSSpdMph',
        'SolarRadWm2', 
        'BarPresMb',
        'Sigma', 'SigPhi',
        'WDir',
        'PrecipIn']


tower_dfs_15m_clean = []
for tower in towers_of_interest:

    df = pd.read_csv(f'{GLOBAL_PATH}/{tower}_2017-2022_final-qc.csv', 
                     header=0, skipfooter=1, na_values=[-999, '-999'], 
                     engine='python', parse_dates=True)
    
    df['timestampUTC'] = pd.to_datetime(df['timestampUTC'], format='%Y%m%d%H%M%S').dt.tz_localize('UTC')
    df = df.set_index('timestampUTC', drop=True)

    print(f"{tower}: {df.shape[0]} rows × {df.shape[1]} columns")  # <-- print dataset size

    tower_dfs_15m_clean.append(df)



TOWA: 210287 rows × 18 columns
TOWB: 210287 rows × 18 columns
TOWD: 210287 rows × 28 columns
TOWF: 210287 rows × 12 columns
TOWS: 210287 rows × 7 columns
TOWY: 210287 rows × 18 columns


In [3]:
tower_dfs_15m_clean

[                           TempC_015m  TempC_030m  RelHum_015m  AbsHum_015m  \
 timestampUTC                                                                  
 2017-01-01 05:00:00+00:00         3.6         3.6         97.4          6.0   
 2017-01-01 05:15:00+00:00         3.6         3.6         97.3          6.0   
 2017-01-01 05:30:00+00:00         3.6         3.7         97.4          6.0   
 2017-01-01 05:45:00+00:00         3.7         3.7         97.4          6.0   
 2017-01-01 06:00:00+00:00         3.8         3.8         97.3          6.0   
 ...                               ...         ...          ...          ...   
 2023-01-01 03:30:00+00:00        10.1        12.3         98.5          9.2   
 2023-01-01 03:45:00+00:00        10.2        11.8         98.6          9.3   
 2023-01-01 04:00:00+00:00        10.4        11.7         98.6          9.4   
 2023-01-01 04:15:00+00:00        10.2        11.7         98.6          9.3   
 2023-01-01 04:30:00+00:00        10.0  

## Step 2: Detect extreme events per tower

In [4]:
# Event thresholds and required minimum duration (hours)
EVENT_SPECS = {
    # Event name: (required variables, condition function, minimum duration in hours)
    # "E1_TempMoistHaz": (
    #     ["TempC", "AbsHum"],
    #     lambda df, col: (df[col["TempC"]] > 24.0) & (df[col["AbsHum"]] > 20.0),
    #     2.0
    # ),
    # "E2_WindChill": (
    #     ["TempC", "PkWSpdMph"],
    #     lambda df, col: (df[col["TempC"]] <= 4.8) & (df[col["PkWSpdMph"]] >= 3.0),
    #     2.0
    # ),
    "E3_LowTemp_lt0":   (["TempC"], lambda df, col: df[col["TempC"]] < 0.0,   2.0),
    # "E3_LowTemp_lt-5":  (["TempC"], lambda df, col: df[col["TempC"]] < -5.0,  2.0),
    # "E3_LowTemp_lt-10": (["TempC"], lambda df, col: df[col["TempC"]] < -10.0, 2.0),

    "E4_HighWind_Peak_gt25": (["PkWSpdMph"], lambda df, col: df[col["PkWSpdMph"]] > 25.0, 1.0),

    "E5_LowWind_lt2":   (["PkWSpdMph"], lambda df, col: df[col["PkWSpdMph"]] < 2.0,  3.0), # IF not enoguh cases NOT 5.0

    # "E6_HighTemp_gt24":   (["TempC"], lambda df, col: df[col["TempC"]] > 38.0,  2.0), # HIGHER THAN 24 FOR 2 HORUS24.0


    # SELECTED LOW WIND SPEED, HIG WIND, LOW TEMP!

    # "E5_LowWind_lt1":   (["PkWSpdMph"], lambda df, col: df[col["PkWSpdMph"]] < 1.0,  3.0),
    # "E5_LowWind_lt0_5": (["PkWSpdMph"], lambda df, col: df[col["PkWSpdMph"]] < 0.5, 3.0),

    # 35C high temp
}

# Sensor columns used for each tower 
colmap_per_tower = {
    "TOWA": {"TempC":"TempC_030m","AbsHum":"AbsHum_015m","PkWSpdMph":"PkWSpdMph_030m"},
    "TOWB": {"TempC":"TempC_030m","AbsHum":None,          "PkWSpdMph":"PkWSpdMph_030m"},
    "TOWD": {"TempC":"TempC_035m","AbsHum":"AbsHum_015m","PkWSpdMph":"PkWSpdMph_035m"},
    "TOWF": {"TempC":"TempC_010m","AbsHum":"AbsHum_010m","PkWSpdMph":"PkWSpdMph_010m"},
    "TOWS": {"TempC":"TempC_025m","AbsHum":None,          "PkWSpdMph":"PkWSpdMph_025m"},
    "TOWY": {"TempC":"TempC_033m","AbsHum":None,          "PkWSpdMph":"PkWSpdMph_033m"},
}



In [5]:
# infer the median step size of the time index (minutes)
def infer_step_minutes(index: pd.DatetimeIndex, fallback=15.0) -> float:
    if len(index) < 2:
        return float(fallback)
    diffs = pd.Series(index).diff().dropna().dt.total_seconds() / 60.0
    return float(diffs.median()) if not diffs.empty else float(fallback)
    

# convert a boolean time series into continuous event segments
def boolean_runs_to_segments(mask: pd.Series, min_duration_min: float) -> list:
    """
    MODIFIED: Keep ALL events but add duration metadata
    """
    if mask.empty:
        return []
    # Identify transition points (True blocks)
    m = mask.astype(bool).copy()
    change = m.ne(m.shift(1, fill_value=False))
    starts = m & change
    ends   = (~m) & change
    start_times = list(mask.index[starts])
    end_times   = list(mask.index[ends])

    # If the series ends with True, close the last block at the end
    if len(start_times) > len(end_times):
        end_times.append(mask.index[-1])

    segs = []
    for st, et in zip(start_times, end_times):
        dur = (et - st).total_seconds()/60.0
        meets_min_duration = dur >= min_duration_min
        # Keep ALL events, but add metadata about duration threshold
        segs.append((st, et, dur, meets_min_duration))
    return segs

# Main function: classify events for one tower 
def classify_events_for_tower(df: pd.DataFrame, tower_name: str) -> pd.DataFrame:
    """
    MODIFIED: Keep all events but record if they meet minimum duration
    """
    cmap = colmap_per_tower.get(tower_name, {})
    idx = pd.to_datetime(df.index)
    step_min = infer_step_minutes(idx, fallback=15.0)

    out = []
    for ev, (needed_keys, cond_fn, min_hours) in EVENT_SPECS.items():
        # Skip events if the tower is missing required variables
        ok = True
        for k in needed_keys:
            colname = cmap.get(k)
            if (colname is None) or (colname not in df.columns):
                ok = False
                break
        if not ok:
            continue

        # Evaluate the condition; any row with NaN is treated as False
        sub = df[[cmap[k] for k in needed_keys]].copy()
        mask = cond_fn(df, cmap) & (~sub.isna().any(axis=1))
        mask = pd.Series(mask.values, index=idx)

        # Convert True blocks to event segments (keep ALL events)
        min_duration_min = min_hours * 60.0
        segs = boolean_runs_to_segments(mask, min_duration_min=min_duration_min)
        
        for st, et, dur, meets_min_duration in segs:
            out.append({
                "tower": tower_name,
                "event": ev,
                "start": st,
                "end": et,
                "duration_minutes": float(dur),
                "meets_min_duration": meets_min_duration,
                "min_duration_required_min": min_duration_min
            })

    return pd.DataFrame(out)

# Batch processing across all towers 
def classify_all_towers(tower_dfs_15m_clean: list, towers_of_interest: list) -> pd.DataFrame:
    """
    Input: list of tower DataFrames (same order as towers_of_interest)
    Output: combined DataFrame with event segments across towers
    """
    all_rows = []
    for tower, df in zip(towers_of_interest, tower_dfs_15m_clean):
        seg = classify_events_for_tower(df, tower)
        all_rows.append(seg)
    if not all_rows:
        return pd.DataFrame(columns=["tower","event","start","end","duration_minutes","meets_min_duration","min_duration_required_min"])
    out = pd.concat(all_rows, ignore_index=True).sort_values(["tower","event","start"])
    return out

# NEW FUNCTION: Create fully labeled dataset with event flags for every timestamp
def create_fully_labeled_dataset(tower_dfs_15m_clean: list, towers_of_interest: list) -> pd.DataFrame:
    """
    Create a DataFrame where EVERY original data point is labeled with event information
    Returns: Combined DataFrame with all original data + event labels
    Structure: TOWER | TIMESTAMP | Original Variables | Event Flags
    """
    
    all_labeled_dfs = []
    
    for tower, df in zip(towers_of_interest, tower_dfs_15m_clean):
        print(f"Labeling data for {tower}...")
        
        # Create a copy of the original data
        labeled_df = df.copy()
        labeled_df['tower'] = tower
        labeled_df['timestamp'] = labeled_df.index  # Explicit timestamp column
        
        # Get events for this specific tower
        tower_events = classify_events_for_tower(df, tower)
        
        # Initialize ALL event columns (even if no events detected)
        all_event_types = list(EVENT_SPECS.keys())
        for event in all_event_types:
            labeled_df[f'event_{event}'] = False
            labeled_df[f'event_{event}_duration_min'] = 0.0
            labeled_df[f'event_{event}_meets_duration'] = False
        
        # Label each timestamp with event information
        if not tower_events.empty:
            for _, event_row in tower_events.iterrows():
                event_name = event_row['event']
                start_time = event_row['start']
                end_time = event_row['end']
                duration = event_row['duration_minutes']
                meets_duration = event_row['meets_min_duration']
                
                # Mark timestamps within this event period
                event_mask = (labeled_df.index >= start_time) & (labeled_df.index <= end_time)
                labeled_df.loc[event_mask, f'event_{event_name}'] = True
                labeled_df.loc[event_mask, f'event_{event_name}_duration_min'] = duration
                labeled_df.loc[event_mask, f'event_{event_name}_meets_duration'] = meets_duration
        
        # Create summary columns
        def count_active_events(row):
            count = 0
            for event in all_event_types:
                if row[f'event_{event}']:
                    count += 1
            return count
        
        def get_active_event_list(row):
            active_events = []
            for event in all_event_types:
                if row[f'event_{event}']:
                    meets_dur = row[f'event_{event}_meets_duration']
                    status = event if meets_dur else f"{event}*"  # * for short events
                    active_events.append(status)
            return ','.join(active_events) if active_events else 'none'
        
        def get_event_durations(row):
            durations = []
            for event in all_event_types:
                if row[f'event_{event}']:
                    dur = row[f'event_{event}_duration_min']
                    durations.append(f"{event}:{dur:.1f}min")
            return ','.join(durations) if durations else 'none'
        
        labeled_df['event_count'] = labeled_df.apply(count_active_events, axis=1)
        labeled_df['active_events'] = labeled_df.apply(get_active_event_list, axis=1)
        labeled_df['event_durations'] = labeled_df.apply(get_event_durations, axis=1)
        labeled_df['has_any_event'] = labeled_df['active_events'] != 'none'
        
        all_labeled_dfs.append(labeled_df)
    
    # Combine all towers into one big labeled dataset
    if all_labeled_dfs:
        full_labeled_dataset = pd.concat(all_labeled_dfs, axis=0)
        full_labeled_dataset = full_labeled_dataset.sort_values(['timestamp', 'tower'])
        return full_labeled_dataset
    else:
        return pd.DataFrame()

# Run the complete pipeline
print("Step 1: Loading and preprocessing data...")
print(f"Loaded {len(tower_dfs_15m_clean)} towers")

print("\nStep 2: Creating fully labeled dataset...")
labeled_data = create_fully_labeled_dataset(tower_dfs_15m_clean, towers_of_interest)

print(f"Labeled dataset shape: {labeled_data.shape}")
print(f"Total data points: {len(labeled_data):,}")
print(f"Towers: {labeled_data['tower'].unique().tolist()}")

# Analysis of labeling results
print("\n=== LABELING RESULTS ===")
print(f"Total timestamps: {len(labeled_data):,}")
print(f"Timestamps with ANY event: {labeled_data['has_any_event'].sum():,} ({labeled_data['has_any_event'].mean()*100:.2f}%)")

# Event frequency analysis
event_cols = [col for col in labeled_data.columns if col.startswith('event_E') and not any(x in col for x in ['duration', 'meets'])]
print(f"\nEvent frequencies:")
for event_col in event_cols:
    event_name = event_col.replace('event_', '')
    count = labeled_data[event_col].sum()
    if count > 0:
        percentage = (count / len(labeled_data)) * 100
        print(f"  {event_name}: {count:,} timestamps ({percentage:.2f}%)")

# Show sample of the final structure
print("\n=== SAMPLE OF LABELED DATA ===")
sample_columns = ['tower', 'timestamp', 'event_count', 'active_events', 'has_any_event']
event_sample_cols = [col for col in labeled_data.columns if col.startswith('event_E') and not any(x in col for x in ['duration', 'meets'])]
sample_columns.extend(event_sample_cols[:3])  # Show first 3 event types

print(labeled_data[sample_columns].head(15))

# Save the fully labeled dataset
print("\nStep 3: Saving results...")
labeled_data.to_csv("fully_labeled_weather_data_with_events.csv", index=False)


Step 1: Loading and preprocessing data...
Loaded 6 towers

Step 2: Creating fully labeled dataset...
Labeling data for TOWA...
Labeling data for TOWB...
Labeling data for TOWD...
Labeling data for TOWF...
Labeling data for TOWS...
Labeling data for TOWY...
Labeled dataset shape: (1261722, 78)
Total data points: 1,261,722
Towers: ['TOWA', 'TOWB', 'TOWD', 'TOWF', 'TOWS', 'TOWY']

=== LABELING RESULTS ===
Total timestamps: 1,261,722
Timestamps with ANY event: 197,082 (15.62%)

Event frequencies:
  E3_LowTemp_lt0: 61,463 timestamps (4.87%)
  E4_HighWind_Peak_gt25: 28,607 timestamps (2.27%)
  E5_LowWind_lt2: 114,583 timestamps (9.08%)

=== SAMPLE OF LABELED DATA ===
                          tower                 timestamp  event_count  \
timestampUTC                                                             
2017-01-01 05:00:00+00:00  TOWA 2017-01-01 05:00:00+00:00            0   
2017-01-01 05:00:00+00:00  TOWB 2017-01-01 05:00:00+00:00            0   
2017-01-01 05:00:00+00:00  TOWD 20