In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import sklearn
import geopandas as gpd
import matplotlib.pyplot as plt
import subprocess
import sys
import seaborn as sns

import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os 
import pandas as pd


SEED = 42


def manual_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and CUDA), then
    configures cuDNN: it is disabled, its benchmark mode is switched off and
    its deterministic flag is switched on.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        # The two CUDA calls only matter if you are using a GPU.
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_generator in seeders:
        seed_generator(seed)

    cudnn = torch.backends.cudnn
    cudnn.enabled = False
    cudnn.benchmark = False
    cudnn.deterministic = True


manual_seed(SEED)



# Load Data & Preprocess

In [None]:
# Read the merged input table; `preprocess_data` below expects (at least) the
# columns 'time', 'ID', 'pm25', 'lat', 'lon' plus the weather columns it uses.
data = pd.read_csv('../input/btlaionkk/data_onkk_merged.csv')
# Last expression of the cell: preview the first rows.
data.head()

In [None]:
import math
from tqdm import tqdm

# Columns that receive previous-day / next-day companion features.
_NEIGHBOR_DAY_FEATURES = (
    'CO_column_number_density', 'Cloud', 'NO2_column_number_density',
    'O3_column_number_density', 'absorbing_aerosol_index',
)

# Month -> season code (1: Mar-May, 2: Jun-Aug, 3: Sep-Nov, 4: Dec-Feb).
_SEASON_BY_MONTH = {
    12: 4, 1: 4, 2: 4,
    3: 1, 4: 1, 5: 1,
    6: 2, 7: 2, 8: 2,
    9: 3, 10: 3, 11: 3,
}


def _dew_point(temp, rh):
    """Vectorised dew point from temperature and relative humidity.

    Uses the constants a=17.27, b=237.7 (a Magnus-type approximation);
    `rh` is divided by 100, so it is presumably a percentage -- confirm
    against the data source.
    """
    a = 17.27
    b = 237.7
    gamma = np.log(rh / 100.0) + (a * temp) / (b + temp)
    return (b * gamma) / (a - gamma)


def _haversine_km(lat, lon, ref_lat, ref_lon):
    """Vectorised great-circle distance (sphere radius 6371) to a reference point."""
    lat1, lon1 = np.radians(lat), np.radians(lon)
    lat2, lon2 = np.radians(ref_lat), np.radians(ref_lon)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1]; clip so arcsin stays defined.
    c = 2 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))
    r = 6371
    return c * r


def _add_neighbor_day_features(station_frame):
    """Add `<col>_prev1` / `<col>_next1` (shift by +1 / -1 row) for the neighbor-day columns."""
    # Snapshot the column list: new columns are added while iterating.
    for ft_name in list(station_frame.columns):
        if ft_name not in _NEIGHBOR_DAY_FEATURES:
            continue
        if not pd.api.types.is_numeric_dtype(station_frame[ft_name]):
            continue
        station_frame[f'{ft_name}_prev1'] = station_frame[ft_name].shift(1)
        station_frame[f'{ft_name}_next1'] = station_frame[ft_name].shift(-1)
    return station_frame


def preprocess_data(df):
    """Re-grid every station to a daily calendar and derive model features.

    Steps:
      1. Parse 'time' and build one daily date range spanning the whole frame.
      2. For each station ID, reindex its rows onto that range (missing days
         become NaN rows), add 'pm25_lag1' (day-over-day pm25 difference) and
         fill 'lat', 'lon', 'ID' with the station's NaN-ignoring mean.
      3. Add calendar, wind, temperature/humidity and distance features.
      4. For each station, add previous/next-day copies of the columns listed
         in `_NEIGHBOR_DAY_FEATURES` that are present and numeric.

    Args:
        df: DataFrame with columns 'time', 'ID', 'pm25', 'lat', 'lon', 'WDIR',
            'WSPD', 'TX', 'TN', 'TMP', 'RH', 'PRES2M'. The input is not modified.

    Returns:
        A new DataFrame, stations concatenated one after another. The index is
        a per-station 0..n-1 range, so index labels repeat across stations.
    """
    df = df.copy()
    # Parse before taking min/max: on raw strings these would be lexicographic.
    df['time'] = pd.to_datetime(df['time'])
    full_dates = pd.date_range(start=df['time'].min(), end=df['time'].max(), freq='D')

    station_frames = []
    for station_id in tqdm(df.ID.unique()):
        station_data = df[df.ID == station_id]
        if len(station_data) == 0:
            continue
        station_daily = (
            station_data.set_index('time')
            .reindex(full_dates)
            .rename_axis('time')
            .reset_index()
        )

        ### Preprocess time-dependent features
        station_daily['pm25_lag1'] = station_daily.pm25 - station_daily.pm25.shift(1)

        # Days introduced by the reindex have no station metadata; fill every
        # row with the station's mean value.
        station_daily['lat'] = np.nanmean(station_daily['lat'].values)
        station_daily['lon'] = np.nanmean(station_daily['lon'].values)
        station_daily['ID'] = np.nanmean(station_daily['ID'].values)

        ### Gather station data
        station_frames.append(station_daily)

    df = pd.concat(station_frames, axis=0)

    ### Preprocess time-independent features
    wdir_rad = np.radians(df['WDIR'])
    df['WDIR_x'] = np.cos(wdir_rad)
    df['WDIR_y'] = np.sin(wdir_rad)
    df["day_of_year"] = df["time"].dt.dayofyear
    df["sin_day"] = np.sin(2 * np.pi * df["day_of_year"] / 365)
    df["cos_day"] = np.cos(2 * np.pi * df["day_of_year"] / 365)
    df['wind_u'] = df['WSPD'] * df['WDIR_x']
    df['wind_v'] = df['WSPD'] * df['WDIR_y']
    df['temp_range'] = df['TX'] - df['TN']

    df['day_of_week'] = df['time'].dt.dayofweek
    df['month'] = df['time'].dt.month
    df['season'] = df['month'].map(_SEASON_BY_MONTH).astype(int)
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['heat_index'] = df['TMP'] * df['RH']
    df['dew_point'] = _dew_point(df['TMP'], df['RH'])

    hanoi_lat, hanoi_lon = 21.0278, 105.8342
    df['distance_to_hanoi'] = _haversine_km(df['lat'], df['lon'], hanoi_lat, hanoi_lon)

    # df['inversion_strength'] = df['TX'] - df['TN']
    df['temp_wind'] = df['TMP'] * df['WSPD']
    df['rh_pressure'] = df['RH'] * df['PRES2M']
    df['wspd_squared'] = df['WSPD'] ** 2

    ### Preprocess time-dependent features that need the derived frame
    station_frames = []
    for station_id in tqdm(df.ID.unique()):
        station_daily = df[df.ID == station_id].copy()
        if len(station_daily) == 0:
            continue
        station_daily = station_daily.set_index('time').rename_axis('time').reset_index()
        station_frames.append(_add_neighbor_day_features(station_daily))

    return pd.concat(station_frames, axis=0)

def create_timeseries_data(df, window_size=16, show_tqdm=True):
    """Build sliding-window (X, y) samples per station on a daily calendar.

    Relies on two notebook globals that must already be set up: `features`
    (list of column names, containing 'pm25') and `scaler` (an object whose
    `transform` scales an array with one column per entry of `features`).

    Each station is reindexed onto the full daily range of `df`; every run of
    `window_size` consecutive days is an input window and the following day is
    its target. The target's columns are reordered so that pm25 comes first.
    Windows or targets containing NaN are skipped.

    Args:
        df: DataFrame with 'time', 'ID' and every column listed in `features`.
        window_size: Number of consecutive days per input window.
        show_tqdm: Wrap the station loop in a tqdm progress bar.

    Returns:
        Tuple `(X, y)` with `X` of shape (samples, window_size, len(features))
        and `y` of shape (samples, len(features)).

    Raises:
        ValueError: If no complete (NaN-free) window exists.
    """
    times = pd.to_datetime(df['time'])
    full_dates = pd.date_range(start=times.min(), end=times.max(), freq='D')

    # Loop-invariant: target column order with pm25 moved to the front.
    pm25_idx = features.index('pm25')
    target_order = [pm25_idx] + [j for j in range(len(features)) if j != pm25_idx]

    Xs = []
    ys = []
    station_ids = df.ID.unique()
    if show_tqdm:
        station_ids = tqdm(station_ids)
    for station_id in station_ids:
        station_data = df[df.ID == station_id].copy()
        if len(station_data) == 0:
            continue
        station_data['time'] = pd.to_datetime(station_data['time'])
        station_daily = station_data.set_index('time').reindex(full_dates)
        if len(station_daily) <= window_size:
            continue

        # Scale the station once and slice windows out of the result, instead
        # of re-running the scaler for every window.
        scaled = scaler.transform(station_daily[features].values)

        for start in range(len(scaled) - window_size):
            currX = scaled[start:start + window_size]
            curry = scaled[start + window_size][target_order]

            if np.isnan(np.sum(currX)) or np.isnan(np.sum(curry)):
                continue
            Xs.append(currX)
            ys.append(curry)

    if not Xs:
        raise ValueError(f"no complete window of size {window_size} found in the data")
    X = np.stack(Xs)
    y = np.stack(ys)
    return X, y

def create_timeseries_data_missing(df, window_size=8, show_tqdm=True):
    """Stack every station's scaled feature matrix into one array, NaNs kept.

    Relies on two notebook globals that must already be set up: `features`
    (list of column names) and `scaler` (object with a `transform` method).
    No rows are dropped and no windows are cut here.

    Args:
        df: DataFrame with 'time', 'ID' and every column listed in `features`.
            Every station must have the same number of rows, otherwise the
            final concatenation fails.
        window_size: Unused; kept so existing calls keep working.
        show_tqdm: Wrap the station loop in a tqdm progress bar.

    Returns:
        Array of shape (rows_per_station, stations, len(features)), stations in
        the order their IDs first appear in `df`.
    """
    per_station = []
    # Iterate the stations of the frame that was passed in, not of the global `data`.
    station_ids = df.ID.unique()
    if show_tqdm:
        station_ids = tqdm(station_ids)
    for station_id in station_ids:
        station_data = df[df.ID == station_id].copy()
        if len(station_data) == 0:
            continue
        station_data['time'] = pd.to_datetime(station_data['time'])
        station_frame = station_data.set_index('time').rename_axis('time').reset_index()

        scaled = scaler.transform(station_frame[features].values)
        # Insert a station axis so the per-station blocks can be joined on axis 1.
        per_station.append(scaled[:, None])
    return np.concatenate(per_station, axis=1)

In [None]:
# Run the full preprocessing (daily re-gridding + feature engineering) on the raw table.
data_processed = preprocess_data(data)
# Last expression of the cell: (rows, columns) of the processed frame.
data_processed.shape

In [None]:
# Chronological split on the 'time' column:
#   train: time < val_date
#   val:   val_date <= time < test_date
#   test:  time >= test_date
val_date = '2021-06-01'
test_date = '2021-08-01'
train = data_processed[data_processed['time'] < val_date]
val = data_processed[(data_processed['time'] >= val_date) & (data_processed['time'] < test_date)]
test = data_processed[data_processed['time'] >= test_date]
#train = train.drop('ID',axis=1)
#train = train.drop('time',axis=1)


In [None]:
# Inspect the available columns before choosing the model features below.
train.columns

In [None]:
# Columns fed to the scaler and the window builders. 'pm25' must stay first:
# `eval_rnn` un-scales predictions with `scaler.min_[0]` / `scaler.scale_[0]`.
# Commented-out entries are features that are currently switched off.
features = [
    # 'time', 'ID',
    'pm25',
    'lat', 'lon',
    'sin_day', 'cos_day',
    'SQRT_SEA_DEM_LAT', 'WSPD', 'WDIR',
    'TMP', 'TX', 'TN', 'TP', 'RH', 'PRES2M',
    # 'pm25_lag1',
    # 'WDIR_x', 'WDIR_y',
    # 'day_of_year',
    'wind_u', 'wind_v',
    # 'temp_range',
    # 'day_of_week', 'month', 'season', 'is_weekend',
    'heat_index', 'dew_point', 'distance_to_hanoi', 'temp_wind', 'rh_pressure',
    # 'wspd_squared',
    'CO_column_number_density', 'Cloud', 'NO2_column_number_density',
    'O3_column_number_density', 'absorbing_aerosol_index',
]


### Fit a shared scaler on training data
# Fitted on the training split only; the same scaler is reused for val/test.
scaler = MinMaxScaler()
scaler.fit(train[features].values)

In [None]:
# Subset of `features` that the LSTM actually consumes and predicts.
selected_features = [
    "pm25",

    "SQRT_SEA_DEM_LAT",

    "TN", "dew_point", "heat_index", "TMP", "sin_day", "PRES2M",
    "distance_to_hanoi", "temp_wind", "cos_day", "TP", "TX", "wind_u",
    "rh_pressure",

    'CO_column_number_density', 'Cloud', 'NO2_column_number_density',
    'O3_column_number_density', 'absorbing_aerosol_index',
]
# Column positions inside `features`; they follow the order of `features`,
# not the order in which the names are listed above.
selected_features_indices = [
    position for position, name in enumerate(features) if name in selected_features
]
print(selected_features_indices)

# PyTorch Dataset

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import random

class TimeSeriesDataset(Dataset):
    """Pre-computed sliding-window samples for several window sizes.

    Indexing does not address a fixed sample: every `__getitem__` call draws a
    random window size and a random batch of samples for it, so one "item" is
    already a whole batch (arrays with a leading batch axis).
    """

    def __init__(self, df=None, X_np=None, window_sizes=[16, 32, 64], batch_size=16):
        """
        Creates a dataset that precomputes timeseries data for multiple window sizes.

        Args:
            df (pd.DataFrame): Input dataframe; windows are built with
                `create_timeseries_data`. Takes precedence over `X_np`.
            X_np (np.ndarray): Array unpacked as (T, S, N); windows run along
                the first axis, separately for each index of the second axis.
                Windows or targets containing NaN are skipped and the last
                axis is reduced to `selected_features_indices`.
            window_sizes (list): A list of integers specifying window sizes.
            batch_size (int): Number of samples returned per `__getitem__` call.

        Raises:
            ValueError: If neither `df` nor `X_np` is given, or if a window
                size yields no complete sample from `X_np`.
        """
        if df is None and X_np is None:
            raise ValueError("either `df` or `X_np` must be provided")

        # Copy so neither the shared default list nor the caller's list is aliased.
        self.window_sizes = list(window_sizes)
        self.batch_size = batch_size
        self.data_dict = {}
        if df is not None:
            self.df = df

            # Precompute timeseries data for each window size and store in a dictionary.
            for ws in tqdm(self.window_sizes):
                X, y = create_timeseries_data(df, window_size=ws, show_tqdm=False)
                self.data_dict[ws] = {'X': X, 'y': y}
        else:
            T, S, N = X_np.shape

            for ws in tqdm(self.window_sizes):
                Xs = []
                ys = []
                for s in range(S):
                    # A window needs `ws` rows plus one target row after it.
                    for t in range(min(T - 1, T - ws)):
                        window = X_np[t:t + ws, s]
                        target = X_np[t + ws, s, :]
                        if np.isnan(window).any() or np.isnan(target).any():
                            continue
                        Xs.append(window)
                        ys.append(target)
                if not Xs:
                    raise ValueError(f"no complete window of size {ws} found in X_np")
                Xs = np.stack(Xs)[:, :, selected_features_indices]
                ys = np.stack(ys)[:, selected_features_indices]
                self.data_dict[ws] = {'X': Xs, 'y': ys}

    def __len__(self):
        """Number of batches per pass: all samples of all window sizes // batch_size."""
        total = sum(len(self.data_dict[ws]['X']) for ws in self.window_sizes)
        return total // self.batch_size

    def __getitem__(self, dummy_index):
        """Return one random batch; `dummy_index` only bounds the iteration.

        Raises:
            IndexError: If `dummy_index >= len(self)`. Plain `for` loops over
                the dataset rely on this to stop after exactly `len(self)` batches.
        """
        if dummy_index >= len(self):
            raise IndexError(f"batch index {dummy_index} out of range")
        ws = random.choice(self.window_sizes)
        data = self.data_dict[ws]
        n_samples = len(data['X'])
        # Sample without replacement; shrink the batch if this window size has
        # fewer samples than `batch_size` instead of failing.
        sample_idx = random.sample(range(n_samples), k=min(self.batch_size, n_samples))
        sample = {
            'X': data['X'][sample_idx],
            'y': data['y'][sample_idx],
        }
        return sample



# Implement LSTM

In [None]:
import torch
import torch.nn as nn

class MyRNN(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.0, ft_weight=1.0):
        """
        Args:
            input_dim: Number of features per time step.
            hidden_dim: LSTM hidden size.
            output_dim: Size of the prediction vector.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout between LSTM layers; forced to 0.0 for a single layer.
            ft_weight: Weight of the all-columns MSE term in `compute_loss`.
        """
        super().__init__()

        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.ft_weight = ft_weight

    def forward(self, x):
        """Map x of shape [batch, window_size, input_dim] to [batch, output_dim]."""
        # hidden_states: [batch, window_size, hidden_dim]
        hidden_states, _ = self.lstm(x)
        # Only the hidden state of the final time step feeds the linear head.
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

    def compute_loss(self, x, y):
        """MSE on column 0 plus `ft_weight` times the MSE over all columns."""
        y_pred = self.forward(x)
        first_column_mse = F.mse_loss(y_pred[:, 0], y[:, 0])
        all_columns_mse = F.mse_loss(y_pred, y)
        return first_column_mse + self.ft_weight * all_columns_mse

# Training Scripts

In [None]:
from sklearn.metrics import *
from copy import deepcopy
import time
def eval_rnn(model, dataset, detailed=False):
    """Score `model` on every window size of `dataset`, in original pm25 units.

    Column 0 of predictions and targets is mapped back through the inverse of
    the global `scaler` (using `min_[0]` / `scale_[0]`) before MSE, MAE and
    R^2 are computed. Uses the notebook globals `args` (for the device) and
    `scaler`.

    The model is switched to eval mode for the duration of the call (so
    dropout is inactive) and its previous train/eval mode is restored
    afterwards, even if scoring fails.

    Args:
        model: Module mapping a [batch, window, features] tensor to [batch, outputs].
        dataset: Object exposing `data_dict` ({window_size: {'X', 'y'}}) and `batch_size`.
        detailed: If True, return the per-window-size metrics dict.

    Returns:
        `{window_size: {'mse', 'mae', 'r2'}}` when `detailed` is True, otherwise
        the negated mean MSE over window sizes (higher is better).
    """
    pm25_min, pm25_scale = scaler.min_[0], scaler.scale_[0]
    batch_size = dataset.batch_size
    fin_loss_dict = {}

    was_training = model.training
    model.eval()
    try:
        for ws, split in dataset.data_dict.items():
            X_full = split['X']
            y_full = split['y']
            y_preds_all = []
            y_trues_all = []

            with torch.no_grad():
                for start in range(0, len(X_full), batch_size):
                    X_batch = torch.tensor(X_full[start:start + batch_size], device=args.device, dtype=torch.float32)
                    y_batch = torch.tensor(y_full[start:start + batch_size], device=args.device, dtype=torch.float32)
                    y_pred = model(X_batch).cpu().numpy()
                    y_true = y_batch.cpu().numpy()
                    # Invert the min-max scaling of column 0 (pm25).
                    y_preds_all.extend(((y_pred[:, 0] - pm25_min) / pm25_scale).tolist())
                    y_trues_all.extend(((y_true[:, 0] - pm25_min) / pm25_scale).tolist())

            fin_loss_dict[ws] = {
                'mse': mean_squared_error(y_trues_all, y_preds_all),
                'mae': mean_absolute_error(y_trues_all, y_preds_all),
                'r2': r2_score(y_trues_all, y_preds_all),
            }
    finally:
        model.train(was_training)

    if detailed:
        return fin_loss_dict
    return -np.mean([fin_loss_dict[ws]['mse'] for ws in fin_loss_dict])

def train_rnn(model, train_dataset, val_dataset, test_dataset,
                num_epochs=10):
    """Train `model` with AdamW and keep the weights that score best on validation.

    Uses the notebook globals `args` (device, lr, weight_decay), `SEED`,
    `manual_seed` and `eval_rnn`. Note that the run is re-seeded with `SEED`,
    not with `args.seed`.

    Validation runs about five times per pass over `train_dataset` (at least
    every iteration when the dataset has fewer than five batches). The best
    state dict seen so far is restored before returning.

    Args:
        model: Module exposing `compute_loss(x, y)`.
        train_dataset: Iterable of dicts with array entries 'X' and 'y'; must support `len`.
        val_dataset: Dataset passed to `eval_rnn` for model selection.
        test_dataset: Unused; kept so existing calls keep working.
        num_epochs: Number of passes over `train_dataset`.

    Returns:
        Tuple `(model, best_eval_score)`; the score is `eval_rnn`'s negated
        mean MSE, or -9999.0 if validation never ran.
    """
    torch.cuda.empty_cache()
    manual_seed(SEED)

    optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    prev_eval_score = -9999.0
    best_eval_score = -9999.0
    iteration = 0
    mean_loss = None
    best_ckpt_state_dict = deepcopy(model.state_dict())
    # Guard against a zero modulus when the dataset has fewer than 5 batches.
    eval_every = max(1, len(train_dataset) // 5)

    model.train().to(args.device)
    for epoch in range(num_epochs):
        print(f"Epoch {epoch}")
        pbar = tqdm(train_dataset, position=0, leave=True)
        for sample in pbar:
            X_tensor = torch.tensor(sample['X'], device=args.device, dtype=torch.float32)
            y_tensor = torch.tensor(sample['y'], device=args.device, dtype=torch.float32)

            optimizer.zero_grad()
            loss = model.compute_loss(X_tensor, y_tensor)
            loss.backward()
            optimizer.step()

            # Exponential moving average of the loss, for the progress bar only.
            if mean_loss is None:
                mean_loss = loss.item()
            else:
                mean_loss = 0.9 * mean_loss + 0.1 * loss.item()

            if (iteration + 1) % 20 == 0:
                if best_eval_score > -9999:
                    pbar.set_description_str(
                    f"Loss: {mean_loss:.4f} | Best Val Score: {best_eval_score:.4f} | Val Score: {prev_eval_score:.4f}\t")
                else:
                    pbar.set_description_str(
                    f"Loss: {mean_loss:.4f}\t")
            if (iteration + 1) % eval_every == 0:
                # Evaluate on validation data and checkpoint on improvement.
                eval_score = eval_rnn(model, val_dataset)
                prev_eval_score = eval_score
                if eval_score > best_eval_score:
                    best_eval_score = eval_score
                    best_ckpt_state_dict = deepcopy(model.state_dict())
            iteration += 1
    model.load_state_dict(best_ckpt_state_dict)
    return model, best_eval_score

### Training args

In [None]:
from argparse import Namespace
import numpy as np

# Hyper-parameters and runtime options read by the training / evaluation helpers.
args = Namespace()
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# With the GPU turned off, a run takes about 15 minutes :v
args.seed = 1902
args.window_sizes = [1, 2, 3, 4, 5, 7, 14, 28]
# window_sizes = [1,2,3,4,5,7],
args.batch_size = 32
args.num_layers = 2
args.dropout = 0.0
args.hidden_dim = 100
args.num_epochs = 20
args.weight_decay = 1e-6
args.lr = 1e-3
args.ft_weight = 0.0

# `manual_seed` is already defined in the first cell of the notebook; the
# identical second definition that used to live here is dropped so there is a
# single source of truth. Re-seed every RNG with the configured training seed.
manual_seed(args.seed)

### Setup datasets and train model

In [None]:
# Load pre-computed, imputed feature arrays for each split. They are passed to
# `TimeSeriesDataset(X_np=...)`, which unpacks their shape as (T, S, N).
# NOTE(review): the train file is the "_dilate" variant while val/test are
# not -- presumably intentional; confirm with whoever produced the files.
imputed_X_train_np = np.load('./imputed_data/imputed_X_train_np_dilate.npy')
imputed_X_val_np = np.load('./imputed_data/imputed_X_val_np.npy')
imputed_X_test_np = np.load('./imputed_data/imputed_X_test_np.npy')

In [None]:
# Build window datasets for every split with the same window sizes and batch size.
train_dataset = TimeSeriesDataset(X_np=imputed_X_train_np, window_sizes=args.window_sizes, batch_size=args.batch_size)
val_dataset = TimeSeriesDataset(X_np=imputed_X_val_np, window_sizes=args.window_sizes, batch_size=args.batch_size)
test_dataset  = TimeSeriesDataset(X_np=imputed_X_test_np, window_sizes=args.window_sizes, batch_size=args.batch_size)

In [None]:
# Feature count = size of the last axis of the stored windows (same for every window size).
num_features = train_dataset.data_dict[args.window_sizes[0]]['X'].shape[-1]

In [None]:
# The model predicts the full feature vector of the next day, hence
# input_dim == output_dim == num_features.
rnn_model = MyRNN(num_features, args.hidden_dim, num_features,
                  num_layers=args.num_layers, dropout=args.dropout, ft_weight=args.ft_weight)
# `train_rnn` returns the model loaded with its best-on-validation weights.
rnn_model, best_eval_score = train_rnn(rnn_model, train_dataset, val_dataset, test_dataset,
                num_epochs=args.num_epochs)

In [None]:
import json

# Per-window-size validation metrics, one row per window size.
val_results = eval_rnn(rnn_model, val_dataset, detailed=True)
# JSON copy kept for logging / export.
val_results_json = json.dumps(val_results, indent=2)
# Build the table straight from the dict: feeding a literal JSON string to
# `pd.read_json` is deprecated in recent pandas versions.
pd.DataFrame.from_dict(val_results, orient='index')

In [None]:
import json

# Per-window-size test metrics, one row per window size.
test_results = eval_rnn(rnn_model, test_dataset, detailed=True)
# JSON copy kept for logging / export.
test_results_json = json.dumps(test_results, indent=2)
# Build the table straight from the dict: feeding a literal JSON string to
# `pd.read_json` is deprecated in recent pandas versions.
pd.DataFrame.from_dict(test_results, orient='index')