In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split, TensorDataset

import datetime
import os

import optuna
import torch.optim as optim

In [2]:
# build the gradient boosting regression model with PyTorch
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out
    
class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()
        
    def forward(self, outputs, labels):
        # Define your custom loss function here
        # This is a simple example, replace it with your own function
        custom_loss = 3*(torch.abs(labels - outputs))*(torch.abs(labels - 1/3) + torch.abs(labels -2/3))
        return custom_loss
    
def objective(trial):
    # Sample hyperparameters
    input_size = 8
    hidden_size = trial.suggest_int('hidden_size', 3, 10000, log=True)  # Single hidden size for all layers
    output_size = 1
    learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e1, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.3)

    # Instantiate the model with sampled hyperparameters
    model = Net(input_size, hidden_size, output_size, dropout_rate)

    # Define the loss function and optimizer
    criterion = CustomLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(100):
        training_loss = 0.0
        eval_loss = 0.0

        # Iterate over the DataLoader for training data
        for i, data in enumerate(train_loader):
            inputs, labels = data
            inputs = inputs.float()
            labels = labels.float()
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.reshape(-1)
            loss = criterion(outputs, labels).sum()
            loss.backward()
            optimizer.step()
            training_loss += loss.item()

        model.eval()
        # Iterate over the DataLoader for test data
        with torch.no_grad():
            for i, data in enumerate(test_loader):
                inputs, labels = data
                inputs = inputs.float()
                labels = labels.float()
                outputs = model(inputs).reshape(-1)
                loss = criterion(outputs, labels).sum()
                eval_loss += loss

        # Optuna logs the running loss for each epoch
        trial.report(eval_loss, epoch)

        # Handle pruning based on the intermediate value
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # Return the running loss as the objective value to minimize
    return eval_loss


class station_model():
    def __init__(self, station):
        self.station = station

    def get_data_loader(self):
        df = pd.read_parquet(f'parquets/{self.station}.parquet')
        TOT = df['tot'].iloc[0]
        df['sbi'] = df['sbi']/TOT
        df['time'] = df['time']/1440

        # x is dataset without 'sbi', y is 'sbi'
        X = df.drop(['tot', 'sbi','bemp' ,'act', 'tot', 'station'], axis=1)
        y = df['sbi']

        X = X.to_numpy()
        y = y.to_numpy()

        X = torch.from_numpy(X)
        y = torch.from_numpy(y)
        dataset = TensorDataset(X, y)

        # get train, test loader
        self.all_data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

        # split train, test
        train_size = int(0.8 * len(dataset))
        test_size = len(dataset) - train_size
        train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

        # get train, test loader
        self.train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
        self.test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)
        print(f'get data loader for {self.station}')

    def objective(self,trial):
        # Sample hyperparameters
        input_size = 8
        hidden_size = trial.suggest_int('hidden_size', 3, 5000, log=True)  # Single hidden size for all layers
        output_size = 1
        learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e1, log=True)
        dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)

        # Instantiate the model with sampled hyperparameters
        model = Net(input_size, hidden_size, output_size, dropout_rate)

        # Define the loss function and optimizer
        criterion = CustomLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Training loop
        for epoch in range(100):
            training_loss = 0.0
            eval_loss = 0.0
            # Iterate over the DataLoader for training data
            for i, data in enumerate(self.train_loader):
                inputs, labels = data
                inputs = inputs.float()
                labels = labels.float()
                optimizer.zero_grad()
                outputs = model(inputs)
                outputs = outputs.reshape(-1)
                loss = criterion(outputs, labels).sum()
                loss.backward()
                optimizer.step()
                training_loss += loss.item()

            model.eval()
            # Iterate over the DataLoader for test data
            with torch.no_grad():
                for i, data in enumerate(self.test_loader):
                    inputs, labels = data
                    inputs = inputs.float()
                    labels = labels.float()
                    outputs = model(inputs).reshape(-1)
                    loss = criterion(outputs, labels).sum()
                    eval_loss += loss

            # Optuna logs the running loss for each epoch
            trial.report(eval_loss, epoch)
            # Handle pruning based on the intermediate value
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
        # Return the running loss as the objective value to minimize
        return eval_loss      


    def get_best_param(self):
        # Create an Optuna Study
        study = optuna.create_study(direction='minimize', storage='sqlite:///db.sqlite3', study_name=f'{self.station}', load_if_exists=True)

        # Run the optimization process
        study.optimize(self.objective, n_trials=10)

        print(f'{self.station} eval_loss: {study.best_trial.value}')

        # Access the best hyperparameters
        best_params = study.best_trial.params

        self.best_params = best_params

        print(self.station,' Best hyperparameters:', best_params)

    def train(self):
        # Instantiate the final model with the best hyperparameters
        final_model = Net(8, self.best_params['hidden_size'], 1, self.best_params['dropout_rate'])

        # ... rest of your training code for the final model
        criterion = CustomLoss()
        optimizer = optim.Adam(final_model.parameters(), lr=self.best_params['learning_rate'])

        # train with whole dataset
        running_loss = 0.0
        for epoch in range(10):
            for i, data in enumerate(self.all_data_loader):
                inputs, labels = data
                inputs = inputs.float()
                labels = labels.float()
                optimizer.zero_grad()
                outputs = final_model(inputs)
                outputs = outputs.squeeze()
                loss = criterion(outputs, labels).sum()
                loss.backward()
                optimizer.step()
                running_loss += loss.item()/128
            print(f'epoch: {epoch+1}, loss: {running_loss/self.all_data_loader.__len__()}')
            running_loss = 0.0

        final_model.eval()
        # save model
        torch.save(final_model.state_dict(), f'models/model_{self.station}.pt')

In [4]:
import torch

folder_path = 'models/'  # replace with your actual folder path
trained_models = os.listdir(folder_path)

# read models
for model_file in trained_models:
        
    # Load the study
    station = model_file
    study = optuna.load_study(study_name=station, storage="sqlite:///db.sqlite3")

    df = pd.read_parquet(f'parquets/{station}.parquet')
    TOT = df['tot'].iloc[0]
    df['sbi'] = df['sbi']/TOT
    df['time'] = df['time']/1440

    # x is dataset without 'sbi', y is 'sbi'
    X = df.drop(['tot', 'sbi','bemp' ,'act', 'tot', 'station'], axis=1)
    y = df['sbi']

    X = X.to_numpy()
    y = y.to_numpy()

    X = torch.from_numpy(X)
    y = torch.from_numpy(y)
    dataset = TensorDataset(X, y)

    # get train, test loader
    all_data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

    # split train, test
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    # get train, test loader
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)


    # Get the best parameters
    best_params = study.best_params

    hidden_size = best_params["hidden_size"]
    dropout_rate = best_params["dropout_rate"]
    model = Net(8, hidden_size, 1, dropout_rate)
    print(dropout_rate, hidden_size, model_file)
    
    # Load the model
    model_path = os.path.join(folder_path, model_file)
    model.load_state_dict(torch.load(model_path))
    model.eval()


    criterion = CustomLoss()
    eval_loss = 0.0

    # Make predictions
    for i, data in enumerate(test_loader):
        inputs, labels = data
        inputs = inputs.float()
        labels = labels.float()
        outputs = model(inputs).reshape(-1)
        loss = criterion(outputs, labels).sum()
        eval_loss += loss/128
    print(f'{station} eval_loss: {eval_loss/ test_loader.__len__()}')
    model.eval()
    torch.save(model, f'model_true/{station}')


    

0.2550613376388636 112 500101002


TypeError: Expected state_dict to be dict-like, got <class '__main__.Net'>.

In [None]:
model = torch.load('model_true/500101001.pth')
model.eval()
print(model)

# Load the study
df = pd.read_parquet(f'parquets/500101001.parquet')
TOT = df['tot'].iloc[0]
df['sbi'] = df['sbi']/TOT
df['time'] = df['time']/1440

# x is dataset without 'sbi', y is 'sbi'
X = df.drop(['tot', 'sbi','bemp' ,'act', 'tot', 'station'], axis=1)
print(X.columns)
y = df['sbi']

X = X.to_numpy()
y = y.to_numpy()

X = torch.from_numpy(X)
y = torch.from_numpy(y)
dataset = TensorDataset(X, y)

# get train, test loader
all_data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

# split train, test
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# get train, test loader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)

eval_loss = 0.0
for i, data in enumerate(test_loader):
    inputs, labels = data
    inputs = inputs.float()
    labels = labels.float()
    outputs = model(inputs).reshape(-1)
    loss = criterion(outputs, labels).sum()
    eval_loss += loss

print(f'{station} eval_loss: {eval_loss/test_size}')

Net(
  (fc1): Linear(in_features=8, out_features=510, bias=True)
  (dropout): Dropout(p=0.3103772369753493, inplace=False)
  (fc2): Linear(in_features=510, out_features=1, bias=True)
  (relu): ReLU()
)
Index(['time', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'], dtype='object')
500101014 eval_loss: 0.2925603985786438


In [None]:
import numpy as np

available_station = np.loadtxt('html.2023.final.data/sno_test_set.txt', dtype='str')
# available_station = available_station[:35]
folder_path = 'models/'  # replace with your actual folder path
trained_models = os.listdir(folder_path)
available_station = np.setdiff1d(available_station, trained_models)

# reverse available_station
available_station = available_station[::-1]
available_station

array(['500119091', '500119090', '500119089', '500119088', '500119087',
       '500119086', '500119085', '500119084', '500119083', '500119082',
       '500119081', '500119080', '500119079', '500119078', '500119077',
       '500119076', '500119075', '500119074', '500119072', '500119071',
       '500119070', '500119069', '500119068', '500119067', '500119066',
       '500119065', '500119064', '500119063', '500119062', '500119061',
       '500119060', '500119059', '500119058', '500119057', '500119056',
       '500119055', '500119054', '500119053', '500119052', '500119051',
       '500119050', '500119049', '500119048', '500119047', '500119046',
       '500119045', '500119044', '500119043', '500106004', '500106003',
       '500106002', '500105066', '500101219', '500101216', '500101209',
       '500101199', '500101193', '500101191', '500101190', '500101189',
       '500101188', '500101185', '500101184', '500101181', '500101176',
       '500101175', '500101166', '500101123', '500101115', '5001