In [4]:

import math, os, sys, time, json, random
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from typing import Tuple
import pandas as pd

# Reproducibility
def set_seed(s: int = 42):
    random.seed(s); np.random.seed(s); torch.manual_seed(s)

dfr = pd.read_csv('data/raw/sav_0927_v2.csv')
dfr = dfr[dfr['REPORT_TYPE'] == 'FM-15']
dfr.head(21)

  dfr = pd.read_csv('data/raw/sav_0927_v2.csv')


Unnamed: 0,STATION,NAME,DATE,SOURCE,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,AA1,AA2,AA3,...,TMP,UA1,UG1,UG2,VIS,WA1,WD1,WG1,WJ1,WND
3,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T00:53:00,7,FM-15,KSAV,V030,1000095,,,...,2335,,,,"016093,5,N,5",,,,,"030,5,N,0041,5"
5,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T01:53:00,7,FM-15,KSAV,V030,1000095,,,...,2285,,,,"016093,5,N,5",,,,,"020,5,N,0031,5"
8,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T02:53:00,7,FM-15,KSAV,V030,1000095,,,...,2225,,,,"016093,5,N,5",,,,,"030,5,N,0036,5"
10,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T03:53:00,7,FM-15,KSAV,V030,1000095,,,...,2225,,,,"016093,5,N,5",,,,,"020,5,N,0036,5"
11,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T04:53:00,7,FM-15,KSAV,V030,1000095,,,...,2225,,,,"016093,5,N,5",,,,,"020,5,N,0031,5"
16,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T05:53:00,7,FM-15,KSAV,V030,1000095,,,...,2225,,,,"016093,5,N,5",,,,,"360,5,N,0031,5"
19,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T06:53:00,7,FM-15,KSAV,V030,1000095,,,...,2225,,,,"016093,5,N,5",,,,,"020,5,N,0036,5"
21,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T07:53:00,7,FM-15,KSAV,V030,1000095,,,...,2175,,,,"016093,5,N,5",,,,,"030,5,N,0036,5"
24,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T08:53:00,7,FM-15,KSAV,V030,1000095,,,...,2175,,,,"016093,5,N,5",,,,,"020,5,N,0031,5"
27,72207003822,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T09:53:00,7,FM-15,KSAV,V030,1000025,,,...,2115,,,,"009656,5,N,5",,,,,"020,5,N,0041,5"


In [10]:
import numpy as np
import pandas as pd

def dt_treatment(df, date_col='DATE', expected_minute=53, tolerance_min=15):
    df = (df.copy()
            .assign(DATE_raw=lambda x: x[date_col],
                    _dt=lambda x: pd.to_datetime(x[date_col], errors='coerce')))
    floor = df['_dt'].dt.floor('h')
    dev   = (df['_dt'] - floor - pd.Timedelta(minutes=expected_minute)).abs()
    tol   = pd.Timedelta(minutes=tolerance_min)

    df = df.assign(
        within_tol=dev <= tol,
        dt_nominal=floor + pd.Timedelta(hours=1),
        minutes_from_expected=dev.dt.total_seconds()/60,
        was_offschedule=df['_dt'].notna() & (dev > tol)
    )

    quarantine = df[df['was_offschedule'] | df['_dt'].isna()].copy()
    elig = df[df['within_tol'] & df['_dt'].notna()]
    dup_sizes = elig['dt_nominal'].value_counts()

    chosen = (elig.assign(_dev=dev)
                 .sort_values(['dt_nominal', '_dev', '_dt'])
                 .drop_duplicates('dt_nominal')
                 .assign(dup_count_before=lambda x: x['dt_nominal'].map(dup_sizes).fillna(0).astype(int),
                         was_snapped=True))

    spine = pd.date_range(df['_dt'].min().round('h'), df['_dt'].max().round('h'), freq='h')
    aligned = (chosen.set_index('dt_nominal')
                    .reindex(spine)
                    .rename_axis('t_utc')
                    .assign(was_snapped=lambda x: x['was_snapped'].astype(bool).fillna(False),
                            was_offschedule=lambda x: x['was_offschedule'].astype(bool).fillna(False),
                            dup_count_before=lambda x: x['dup_count_before'].fillna(0).astype(int)))

    hod = aligned.index.hour + aligned.index.minute/60
    doy = aligned.index.dayofyear + hod/24
    aligned = aligned.assign(hod_sin=np.sin(2*np.pi*hod/24),  hod_cos=np.cos(2*np.pi*hod/24),
                             doy_sin=np.sin(2*np.pi*doy/365), doy_cos=np.cos(2*np.pi*doy/365))

    row_count, index_hours = len(chosen), len(spine)
    qa = {
        'rows_total'    : len(df),
        'unparsable'    : int(df['_dt'].isna().sum()),
        'offschedule'   : int(df['was_offschedule'].sum()),
        'duplicates'    : int((dup_sizes > 1).sum()),
        'index_hours'   : index_hours,
        'missing_hours' : max(index_hours - row_count, 0),
        'coverage_pct'  : (len(df) / index_hours),
    }
    return aligned, qa, quarantine

result = dt_treatment(dfr)
df = result[0]
print("*-"*12 + " - QA REPORT - " + "*-"*12)
print(f"start : {df.index.min()} | end : {df.index.max()}")
print(result[1])
print("*-"*32)
df.head(25)

*-*-*-*-*-*-*-*-*-*-*-*- - QA REPORT - *-*-*-*-*-*-*-*-*-*-*-*-
start : 2015-09-27 01:00:00 | end : 2025-08-27 04:00:00
{'rows_total': 86813, 'unparsable': 0, 'offschedule': 4, 'duplicates': 4, 'index_hours': 86932, 'missing_hours': 127, 'coverage_pct': 0.9986311139741407}
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-


Unnamed: 0_level_0,STATION,NAME,DATE,SOURCE,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,AA1,AA2,AA3,...,within_tol,minutes_from_expected,was_offschedule,_dev,dup_count_before,was_snapped,hod_sin,hod_cos,doy_sin,doy_cos
t_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-27 01:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T00:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,0.258819,0.9659258,-0.997963,-0.063793
2015-09-27 02:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T01:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,0.5,0.8660254,-0.998009,-0.063077
2015-09-27 03:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T02:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,0.7071068,0.7071068,-0.998054,-0.062361
2015-09-27 04:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T03:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,0.8660254,0.5,-0.998098,-0.061645
2015-09-27 05:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T04:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,0.9659258,0.258819,-0.998142,-0.060929
2015-09-27 06:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T05:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,1.0,6.123234000000001e-17,-0.998186,-0.060213
2015-09-27 07:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T06:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,0.9659258,-0.258819,-0.998228,-0.059497
2015-09-27 08:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T07:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,0.8660254,-0.5,-0.998271,-0.058781
2015-09-27 09:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T08:53:00,7,FM-15,KSAV,V030,1000095,,,...,True,0.0,False,0 days,1,True,0.7071068,-0.7071068,-0.998313,-0.058065
2015-09-27 10:00:00,72207000000.0,"SAVANNAH INTERNATIONAL AIRPORT, GA US",2015-09-27T09:53:00,7,FM-15,KSAV,V030,1000025,,,...,True,0.0,False,0 days,1,True,0.5,-0.8660254,-0.998354,-0.057349


In [77]:
nans = df[df['TMP'].isna()]
print(len(nans))
nans

127


Unnamed: 0_level_0,STATION,NAME,DATE,SOURCE,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,AA1,AA2,AA3,...,WA1,WD1,WG1,WJ1,WND,DATE_raw,minutes_from_expected,was_offschedule,dup_count_before,was_snapped
t_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-12-04 10:00:00,,,,,,,,,,,...,,,,,,,,True,0,True
2015-12-09 18:00:00,,,,,,,,,,,...,,,,,,,,True,0,True
2015-12-10 00:00:00,,,,,,,,,,,...,,,,,,,,True,0,True
2015-12-16 06:00:00,,,,,,,,,,,...,,,,,,,,True,0,True
2016-02-03 18:00:00,,,,,,,,,,,...,,,,,,,,True,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-01 12:00:00,,,,,,,,,,,...,,,,,,,,True,0,True
2025-05-30 13:00:00,,,,,,,,,,,...,,,,,,,,True,0,True
2025-05-30 14:00:00,,,,,,,,,,,...,,,,,,,,True,0,True
2025-07-14 03:00:00,,,,,,,,,,,...,,,,,,,,True,0,True


In [11]:
outliers_domain['temp_C'].value_counts()

temp_C
999.9    80
Name: count, dtype: int64

168


Unnamed: 0,time_est,sin_doy,cos_doy,sin_hod,cos_hod,temp_C,temp_C_flag,C_temp_C
1641,2015-12-04 04:53:00,,,,,8.900000,False,8.900000
1769,2015-12-09 12:53:00,,,,,20.850000,False,20.850000
1775,2015-12-09 18:53:00,,,,,15.250000,False,15.250000
1925,2015-12-16 00:53:00,,,,,10.550000,False,10.550000
3113,2016-02-03 12:53:00,,,,,26.400000,False,26.400000
...,...,...,...,...,...,...,...,...
84107,2025-05-01 06:53:00,,,,,18.600000,False,18.600000
84804,2025-05-30 07:53:00,,,,,23.166667,False,23.166667
84805,2025-05-30 08:53:00,,,,,,False,
85874,2025-07-13 21:53:00,,,,,26.650000,False,26.650000
