In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split, TensorDataset

import datetime
import os

import optuna
import torch.optim as optim

import json

In [2]:
# build the gradient boosting regression model with PyTorch
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out
    
class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()
        
    def forward(self, outputs, labels):
        # Define your custom loss function here
        # This is a simple example, replace it with your own function
        custom_loss = 3*(torch.abs(labels - outputs))*(torch.abs(labels - 1/3) + torch.abs(labels -2/3))
        return custom_loss
    
def objective(trial):
    # Sample hyperparameters
    input_size = 8
    hidden_size = trial.suggest_int('hidden_size', 3, 10000, log=True)  # Single hidden size for all layers
    output_size = 1
    learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e1, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.3)

    # Instantiate the model with sampled hyperparameters
    model = Net(input_size, hidden_size, output_size, dropout_rate)

    # Define the loss function and optimizer
    criterion = CustomLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(100):
        training_loss = 0.0
        eval_loss = 0.0

        # Iterate over the DataLoader for training data
        for i, data in enumerate(train_loader):
            inputs, labels = data
            inputs = inputs.float()
            labels = labels.float()
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.reshape(-1)
            loss = criterion(outputs, labels).sum()
            loss.backward()
            optimizer.step()
            training_loss += loss.item()

        model.eval()
        # Iterate over the DataLoader for test data
        with torch.no_grad():
            for i, data in enumerate(test_loader):
                inputs, labels = data
                inputs = inputs.float()
                labels = labels.float()
                outputs = model(inputs).reshape(-1)
                loss = criterion(outputs, labels).sum()
                eval_loss += loss

        # Optuna logs the running loss for each epoch
        trial.report(eval_loss, epoch)

        # Handle pruning based on the intermediate value
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # Return the running loss as the objective value to minimize
    return eval_loss


class station_model():
    def __init__(self, station):
        self.station = station

    def get_data_loader(self):
        df = pd.read_parquet(f'parquets/{self.station}.parquet')
        TOT = df['tot'].iloc[0]
        df['sbi'] = df['sbi']/TOT
        df['time'] = df['time']/1440

        # x is dataset without 'sbi', y is 'sbi'
        X = df.drop(['tot', 'sbi','bemp' ,'act', 'tot', 'station'], axis=1)
        y = df['sbi']

        X = X.to_numpy()
        y = y.to_numpy()

        X = torch.from_numpy(X)
        y = torch.from_numpy(y)
        dataset = TensorDataset(X, y)

        # get train, test loader
        self.all_data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

        # split train, test
        train_size = int(0.8 * len(dataset))
        test_size = len(dataset) - train_size
        train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

        # get train, test loader
        self.train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
        self.test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)
        print(f'get data loader for {self.station}')

    def objective(self,trial):
        # Sample hyperparameters
        input_size = 8
        hidden_size = trial.suggest_int('hidden_size', 3, 5000, log=True)  # Single hidden size for all layers
        output_size = 1
        learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e1, log=True)
        dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)

        # Instantiate the model with sampled hyperparameters
        model = Net(input_size, hidden_size, output_size, dropout_rate)

        # Define the loss function and optimizer
        criterion = CustomLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Training loop
        for epoch in range(100):
            training_loss = 0.0
            eval_loss = 0.0
            # Iterate over the DataLoader for training data
            for i, data in enumerate(self.train_loader):
                inputs, labels = data
                inputs = inputs.float()
                labels = labels.float()
                optimizer.zero_grad()
                outputs = model(inputs)
                outputs = outputs.reshape(-1)
                loss = criterion(outputs, labels).sum()
                loss.backward()
                optimizer.step()
                training_loss += loss.item()

            model.eval()
            # Iterate over the DataLoader for test data
            with torch.no_grad():
                for i, data in enumerate(self.test_loader):
                    inputs, labels = data
                    inputs = inputs.float()
                    labels = labels.float()
                    outputs = model(inputs).reshape(-1)
                    loss = criterion(outputs, labels).sum()
                    eval_loss += loss

            # Optuna logs the running loss for each epoch
            trial.report(eval_loss, epoch)
            # Handle pruning based on the intermediate value
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
        # Return the running loss as the objective value to minimize
        return eval_loss      


    def get_best_param(self):
        # Create an Optuna Study
        study = optuna.create_study(direction='minimize', storage='sqlite:///db.sqlite3', study_name=f'{self.station}', load_if_exists=True)

        # Run the optimization process
        study.optimize(self.objective, n_trials=10)

        print(f'{self.station} eval_loss: {study.best_trial.value}')

        # Access the best hyperparameters
        best_params = study.best_trial.params

        self.best_params = best_params

        print(self.station,' Best hyperparameters:', best_params)

    def train(self):
        # Instantiate the final model with the best hyperparameters
        final_model = Net(8, self.best_params['hidden_size'], 1, self.best_params['dropout_rate'])

        # ... rest of your training code for the final model
        criterion = CustomLoss()
        optimizer = optim.Adam(final_model.parameters(), lr=self.best_params['learning_rate'])

        # train with whole dataset
        running_loss = 0.0
        for epoch in range(10):
            for i, data in enumerate(self.all_data_loader):
                inputs, labels = data
                inputs = inputs.float()
                labels = labels.float()
                optimizer.zero_grad()
                outputs = final_model(inputs)
                outputs = outputs.squeeze()
                loss = criterion(outputs, labels).sum()
                loss.backward()
                optimizer.step()
                running_loss += loss.item()/128
            print(f'epoch: {epoch+1}, loss: {running_loss/self.all_data_loader.__len__()}')
            running_loss = 0.0

        final_model.eval()
        # save model
        torch.save(final_model.state_dict(), f'models/model_{self.station}.pt')

def get_one_hot_weekday(date_str):
    # Convert the date string to a datetime object
    date = datetime.datetime.strptime(date_str, '%Y%m%d')
    
    # Get the weekday (Monday is 0, Sunday is 6)
    weekday = date.weekday()
    
    # Create a one-hot encoded list for the weekday
    one_hot_weekday = [1 if i == weekday else 0 for i in range(7)]
    
    return one_hot_weekday

def time_to_minute(time_str):
    hours, minutes = map(int, time_str.split(':'))
    total_minutes = hours * 60 + minutes
    return total_minutes



In [9]:
# create submission data

df = pd.read_csv('html.2023.final.data/sample_submission_stage1.csv')
# ['time', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
submission = pd.DataFrame(columns=['id','time', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
stationTOT = {}
rows = []
for x in df['id']:
    TOT = 1
    date = x[:8]
    station = x[9:18]
    time = x[19:]
    one_hot_weekday = get_one_hot_weekday(date)
    time = time_to_minute(time)/1440
    row = {'id': x,'time':time, 'mon':one_hot_weekday[0], 'tue':one_hot_weekday[1], 'wed':one_hot_weekday[2], 'thu':one_hot_weekday[3], 'fri':one_hot_weekday[4], 'sat':one_hot_weekday[5], 'sun':one_hot_weekday[6], f'{station}':1}
    rows.append(row)

submission = pd.DataFrame(rows)
submission = submission.fillna(0)

In [12]:
submission = submission.iloc[:,:44]
submission

Unnamed: 0,id,time,mon,tue,wed,thu,fri,sat,sun,500101001,...,500101030,500101031,500101032,500101033,500101034,500101035,500101036,500101037,500101038,500101039
0,20231204_500101001_00:00,0.000000,1,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20231204_500101001_00:20,0.013889,1,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20231204_500101001_00:40,0.027778,1,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20231204_500101001_01:00,0.041667,1,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20231204_500101001_01:20,0.055556,1,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88699,20231024_500119091_22:20,0.930556,0,1,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88700,20231024_500119091_22:40,0.944444,0,1,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88701,20231024_500119091_23:00,0.958333,0,1,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88702,20231024_500119091_23:20,0.972222,0,1,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
station_tot = {}
print(submission.columns)
for station in submission.columns[9:]:
    find_tot = pd.read_json(f'html.2023.final.data/release/20231002/{station}.json', convert_axes=False)
    find_tot = find_tot.transpose()
    for n in find_tot['tot'].to_numpy():
        try:
            m = int(n)
            TOT = n
            break
        except:
            continue
    station_tot[station] = TOT

Index(['id', 'time', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
       '500101001', '500101002', '500101003', '500101004', '500101005',
       '500101006', '500101007', '500101008', '500101009', '500101010',
       '500101013', '500101014', '500101015', '500101018', '500101019',
       '500101020', '500101021', '500101022', '500101023', '500101024',
       '500101025', '500101026', '500101027', '500101028', '500101029',
       '500101030', '500101031', '500101032', '500101033', '500101034',
       '500101035', '500101036', '500101037', '500101038', '500101039'],
      dtype='object')


In [116]:
model = torch.load('models/all')
model.eval()
#Net(
#   (dropout): Dropout(p=0.30438527094835316, inplace=False)
#   (fc1): Linear(in_features=43, out_features=538, bias=True)
#   (fc2): Linear(in_features=538, out_features=538, bias=True)
#   (fc3): Linear(in_features=538, out_features=538, bias=True)
#   (fc4): Linear(in_features=538, out_features=538, bias=True)
#   (fc5): Linear(in_features=538, out_features=1, bias=True)
#   (relu): ReLU()
# )
df_orig_predict = pd.read_csv('predict/prediction_round.csv')
row = []

i= 0
# iterate over each row in submission 
for rows in submission.iterrows():
    # get first value of row
    id = rows[1][0]
    staion = id[9:18]
    x = rows[1][1:]
    x = x.values
    x = x.astype(np.float32)
    x = torch.tensor(x)
    x = x.reshape(1, -1)
    pred = model(x)
    pred = pred.item()*station_tot[station]
    row.append([id, pred])
    # remove row in df_orig_predict with id == id
    df_orig_predict = df_orig_predict[df_orig_predict['id'] != id]
    # i += 1
    # print(i)

# append row to df_orig_predict
row = pd.DataFrame(row, columns=['id', 'sbi'])
df_orig_predict = df_orig_predict.append(row)
df_orig_predict = df_orig_predict.sort_values(by=['id'])
df_orig_predict.to_csv('submission.csv')

  id = rows[1][0]


RuntimeError: a Tensor with 36 elements cannot be converted to Scalar