In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import os
import json
from train_and_test import *


In [3]:
# Load the sample submission; its `id` column is used below to derive the station list.
df = pd.read_csv('../data/sample_submission.csv')
# Only the last expression of a cell is rendered, so the former bare `df.head()`
# was a discarded no-op; keep the tail view that the recorded output shows.
df.tail()

Unnamed: 0,id,sbi
88699,20231210_500119091_22:20,1.0
88700,20231210_500119091_22:40,1.0
88701,20231210_500119091_23:00,1.0
88702,20231210_500119091_23:20,1.0
88703,20231210_500119091_23:40,1.0


In [4]:
# Unique station ids in sorted order: the station id is the middle field of
# each "<date>_<station>_<HH:MM>" submission id.
test_station = sorted({row_id.split("_")[1] for row_id in df["id"]})

In [2]:
def get_data(start_date, end_date, data_path="../..//html.2023.final.data/release/", stations=None):
    """Collect one record per station per 20-minute slot from the daily JSON files.

    For every day in ``[start_date, end_date]`` and every station, the file
    ``<data_path>/<YYYYMMDD>/<station>.json`` is read. It is expected to map
    "HH:MM" minute labels to record dicts, where ``{}`` marks a minute with no
    data. Each 20-minute slot (00:00, 00:20, ..., 23:40) takes the first
    non-empty record found at or after the slot's minute.

    Args:
        start_date: First day to read, as a "YYYYMMDD" string.
        end_date: Last day to read (inclusive), as a "YYYYMMDD" string.
        data_path: Root folder holding one sub-folder per day. Defaults to the
            path the notebook has always used.
        stations: Iterable of station ids (file names without ".json").
            Defaults to the notebook-level ``test_station`` list.

    Returns:
        dict mapping "<YYYYMMDD>_<station>_<HH:MM>" to the selected record.

    Raises:
        ValueError: If a station file contains no non-empty record at all
            (previously this case looped forever).
        FileNotFoundError: If an expected station file does not exist.
    """
    if stations is None:
        # Resolved at call time so the notebook-level list is still the default.
        stations = test_station

    first_day = datetime.strptime(start_date, "%Y%m%d")
    last_day = datetime.strptime(end_date, "%Y%m%d")

    minutes_per_day = 24 * 60
    # "00:00" .. "23:59", built once instead of calling strftime for every probe.
    labels = [f"{m // 60:02d}:{m % 60:02d}" for m in range(minutes_per_day)]

    test_data_dic = {}
    current_date = first_day
    while current_date <= last_day:
        day = current_date.strftime("%Y%m%d")
        folder_path = os.path.join(data_path, day)
        for station in stations:
            file_path = os.path.join(folder_path, station) + ".json"
            with open(file_path, 'r') as f:
                records = json.load(f)
            for slot in range(0, minutes_per_day, 20):
                # Scan forward from the slot for the first non-empty record.
                # As before, the scan wraps past 23:59 to the start of the same
                # file; it is now bounded to one full day so an all-empty file
                # raises instead of spinning forever. A missing minute key is
                # treated like an empty record rather than raising KeyError.
                for offset in range(minutes_per_day):
                    record = records.get(labels[(slot + offset) % minutes_per_day])
                    if record is not None and record != {}:
                        break
                else:
                    raise ValueError(f"No non-empty record found in {file_path}")
                test_data_dic[day + "_" + station + "_" + labels[slot]] = record
        current_date += timedelta(days=1)
    return test_data_dic

In [3]:
def validation(start_date, end_date, submission):
    """Score a submission against the recorded data for a date range.

    The per-id error is
    ``3 * |b - b_pred| / s * (|b/s - 1/3| + |b/s - 2/3|)``
    where ``b`` is the recorded "sbi", ``s`` the recorded "tot" and ``b_pred``
    the submitted "sbi"; the function returns the mean over all ids produced
    by ``get_test_index(start_date, end_date)``.

    Args:
        start_date: First day of the scored range, "YYYYMMDD".
        end_date: Last day of the scored range, "YYYYMMDD".
        submission: Table with 'id' and 'sbi' columns.

    Returns:
        The average error over the scored ids (``np.average`` of the list).

    Raises:
        ValueError: If the submission ids are not exactly the scored ids plus
            the ids of the fixed 2023-10-21..2023-10-24 range, or if any score
            cannot be computed (missing key, zero "tot", non-numeric value...).
            The underlying error is attached as ``__cause__``.
    """
    ans = get_data(start_date, end_date)
    private_index = get_test_index(start_date, end_date)
    # The submission must also carry the ids of this fixed range, although
    # only `private_index` is scored below.
    public_index = get_test_index("20231021", "20231024")
    sub = dict(zip(submission['id'], submission['sbi']))
    if set(private_index + public_index) != set(sub.keys()):
        raise ValueError("Dictionaries do not have the same keys.")

    E_val = []
    try:
        for key in private_index:
            b_it = ans[key]["sbi"]
            b_it_pred = sub[key]
            s_i = ans[key]["tot"]
            E_val.append(3 * abs((b_it -  b_it_pred) / s_i) * (abs(b_it / s_i - 1 / 3) + abs(b_it / s_i - 2 / 3)))
    except Exception as exc:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the real cause in the traceback.
        raise ValueError("Inappropriate value detected.") from exc

    return np.average(E_val)

In [4]:
def aggregation(start_date, end_date, sub1, sub2, agg_1, agg_2):
    """Score the weighted blend ``agg_1 * sub1 + agg_2 * sub2`` on a date range.

    Both submissions are tables with 'id' and 'sbi' columns. The blended
    prediction is compared with the records returned by ``get_data`` using the
    error ``3 * |b - b_pred| / s * (|b/s - 1/3| + |b/s - 2/3|)`` (``b`` = recorded
    "sbi", ``s`` = recorded "tot"), averaged over the ids from
    ``get_test_index(start_date, end_date)``.
    """
    truth = get_data(start_date, end_date)
    scored_ids = get_test_index(start_date, end_date)
    first_pred = dict(zip(sub1['id'], sub1['sbi']))
    second_pred = dict(zip(sub2['id'], sub2['sbi']))

    def _blend_error(record_id):
        # Error of the blended prediction for a single id.
        observed = truth[record_id]["sbi"]
        blended = first_pred[record_id] * agg_1 + second_pred[record_id] * agg_2
        total = truth[record_id]["tot"]
        relative_miss = abs((observed - blended) / total)
        weight = abs(observed / total - 1 / 3) + abs(observed / total - 2 / 3)
        return 3 * relative_miss * weight

    return np.average([_blend_error(record_id) for record_id in scored_ids])

### Ryan

In [28]:
# Load the predictions stored in group_validation.csv (the "Ryan" entry) and display them.
sub_r = pd.read_csv('../test_result/group_validation.csv')
sub_r

Unnamed: 0,id,sbi
0,20231123_500101001_00:00,12.762881
1,20231123_500101001_00:20,10.162225
2,20231123_500101001_00:40,9.641389
3,20231123_500101001_01:00,7.450892
4,20231123_500101001_01:20,7.783185
...,...,...
88699,20231024_500119091_22:20,1.725183
88700,20231024_500119091_22:40,2.526397
88701,20231024_500119091_23:00,2.282500
88702,20231024_500119091_23:20,1.761423


In [16]:
# Average validation() error of sub_r over 2023-11-23 .. 2023-11-29.
validation("20231123", "20231129", sub_r)

0.3128317247535847

### Liang

In [29]:
# Load the predictions stored in submission_2nd.csv (the "Liang" entry) and display them.
sub_l = pd.read_csv('../test_result/submission_2nd.csv')
sub_l

Unnamed: 0,id,sbi
0,20231123_500119045_00:00,9.833200
1,20231123_500119045_00:20,9.833200
2,20231123_500119045_00:40,9.833200
3,20231123_500119045_01:00,9.833200
4,20231123_500119045_01:20,9.833200
...,...,...
56443,20231129_500119062_22:20,7.243104
56444,20231129_500119062_22:40,7.243104
56445,20231129_500119062_23:00,7.243104
56446,20231129_500119062_23:20,7.243104


In [22]:
# Average validation() error of sub_l over 2023-11-23 .. 2023-11-29.
validation("20231123", "20231129", sub_l)

0.7965220591020103

### MingYou

In [30]:
# Load the predictions stored in submit_mingyou.csv (the "MingYou" entry) and display them.
sub_m = pd.read_csv('../test_result/submit_mingyou.csv')
sub_m

Unnamed: 0,id,sbi
0,20231123_500101001_00:00,12.639576
1,20231123_500101001_00:20,12.639576
2,20231123_500101001_00:40,12.639576
3,20231123_500101001_01:00,13.961031
4,20231123_500101001_01:20,13.961031
...,...,...
88699,20231024_500119091_22:20,0.259925
88700,20231024_500119091_22:40,0.259925
88701,20231024_500119091_23:00,1.204378
88702,20231024_500119091_23:20,1.204378


In [26]:
# Average validation() error of sub_m over 2023-11-23 .. 2023-11-29.
validation("20231123", "20231129", sub_m)

0.42116666637559774

In [49]:
# Load the predictions stored in submit_xgb_tune.csv and display them.
# (Presumably a tuned XGBoost variant of the MingYou model, judging by the file name — TODO confirm.)
sub_m_tuned = pd.read_csv('../test_result/submit_xgb_tune.csv')
sub_m_tuned

Unnamed: 0,id,sbi
0,20231123_500101001_00:00,1.762254
1,20231123_500101001_00:20,1.762254
2,20231123_500101001_00:40,5.229442
3,20231123_500101001_01:00,5.160602
4,20231123_500101001_01:20,5.160602
...,...,...
88699,20231024_500119091_22:20,0.202084
88700,20231024_500119091_22:40,1.213326
88701,20231024_500119091_23:00,2.099670
88702,20231024_500119091_23:20,2.099670


In [50]:
# Average validation() error of sub_m_tuned over 2023-11-23 .. 2023-11-29.
validation("20231123", "20231129", sub_m_tuned)

0.4256876193016058

### Aggregation

In [43]:
# The two candidate submissions do not change between iterations, so read
# them once instead of re-parsing both CSV files for every weight.
sub1 = pd.read_csv('../submission/phase_3/agg_rf_bestfeat_phase1.csv')
sub2 = pd.read_csv('../submission/phase_3/agg_rf_oobsfeat_phase1.csv')
# Sweep the weight on sub1 in steps of 0.1 (sub2 gets 1 - weight) and print
# the blended error for 2023-12-04 .. 2023-12-10. np.arange excludes the stop
# value, so a weight of exactly 1.0 is not evaluated.
for i in np.arange(0, 1, 0.1):
    print(i, aggregation("20231204", "20231210", sub1, sub2, i, 1-i))

0.0 0.3699454694179275
0.1 0.36010526918560576
0.2 0.35171677762353926
0.30000000000000004 0.34469985152513455
0.4 0.3390128077234059
0.5 0.33456638432209035
0.6000000000000001 0.3313081929860933
0.7000000000000001 0.3291834921761402
0.8 0.3282780291070895
0.9 0.3286366291335551


### Current test score

In [20]:
def current_test_score(df, end_date, end_hr, end_minute, dir):
    """Mean absolute "sbi" error of a submission file against observed data.

    The observed rows come from ``feature_selection(df[test_stations])``
    (both names are defined outside this cell), restricted to rows whose
    "date" is at or before the cutoff. Each row is keyed by
    "<YYYYMMDD>_<station_id>_<HH:MM>" and compared with the 'sbi' value stored
    under the same 'id' in the CSV at ``dir``.

    Note: this is a plain mean absolute error, not the weighted error used by
    ``validation`` / ``aggregation``.

    Args:
        df: Training data as returned by ``get_train_data``; indexed with
            ``test_stations`` before feature selection.
        end_date: Cutoff day as a "YYYYMMDD" string.
        end_hr: Cutoff hour.
        end_minute: Cutoff minute.
        dir: Path of the submission CSV (columns 'id' and 'sbi').

    Returns:
        The average of ``|observed sbi - predicted sbi|`` over the kept rows.

    Raises:
        KeyError: If an observed id is missing from the submission file.
    """
    x_train, features = feature_selection(df[test_stations])
    df_train = pd.DataFrame(np.vstack(x_train), columns=features)
    # Build the cutoff from the full end_date. The year and month used to be
    # hard-coded to December 2023, which silently ignored those parts of
    # end_date; results are unchanged for December 2023 dates.
    cutoff = datetime.strptime(end_date, "%Y%m%d").replace(hour=end_hr, minute=end_minute)
    df_train = df_train[df_train["date"] <= cutoff].reset_index(drop = True)
    df_train['id'] = df_train['date'].dt.strftime('%Y%m%d_') + df_train['station_id'] + '_' + df_train['date'].dt.strftime('%H:%M')
    # id -> observed sbi (a later duplicate id overwrites an earlier one).
    actual = dict(zip(df_train["id"], df_train["sbi"]))
    pred = pd.read_csv(dir).set_index('id')['sbi'].to_dict()
    E_val = [abs(b_it - pred[key]) for key, b_it in actual.items()]
    return (np.average(E_val))

In [22]:
# Fetch the data for 2023-12-04 .. 2023-12-10 for the test stations.
# NOTE: this rebinds `df`, which earlier in the notebook held the sample submission.
df = get_train_data("20231204", "20231210", test_stations)

2023-12-04 00:00:00
2023-12-05 00:00:00
2023-12-06 00:00:00
2023-12-07 00:00:00
2023-12-08 00:00:00
2023-12-09 00:00:00
2023-12-10 00:00:00


In [23]:
# Mean absolute sbi error of the phase-1 final submission, cutoff 2023-12-10 23:59.
current_test_score(df, "20231210", 23, 59, '../submission/phase_1/phase1_final_submission.csv')

4.025622198355378

In [21]:
# Same score for the phase-2 final submission, cutoff 2023-12-17 23:59.
# NOTE(review): this cell's execution count (21) is lower than that of the cell
# loading `df` (22), so the recorded output may come from a different `df` — re-run to confirm.
current_test_score(df, "20231217", 23, 59, '../submission/phase_2/phase2_final_submission.csv')

3.8720727937003567

In [9]:
# Score of phase-3 candidate final_submission_82.csv, cutoff 2023-12-24 23:59.
# NOTE(review): execution count 9 precedes the cell that defines current_test_score (20),
# so the recorded output may predate the current definition — re-run to confirm.
current_test_score(df, "20231224", 23, 59, '../submission/phase_3/final_submission_82.csv')

0.45199071070358277

In [10]:
# Score of phase-3 candidate final_submission_55.csv, cutoff 2023-12-24 23:59.
# NOTE(review): execution count 10 precedes the cell that defines current_test_score (20),
# so the recorded output may predate the current definition — re-run to confirm.
current_test_score(df, "20231224", 23, 59, '../submission/phase_3/final_submission_55.csv')

0.45159971996397796

In [11]:
# Score of phase-3 candidate phase3_final_agg_weekday.csv, cutoff 2023-12-24 23:59.
# NOTE(review): execution count 11 precedes the cell that defines current_test_score (20),
# so the recorded output may predate the current definition — re-run to confirm.
current_test_score(df, "20231224", 23, 59,'../submission/phase_3/phase3_final_agg_weekday.csv')

0.4735661612433877

In [12]:
# Score of phase-3 candidate phase3_final_agg_fulemp.csv, cutoff 2023-12-24 23:59.
# NOTE(review): execution count 12 precedes the cell that defines current_test_score (20),
# so the recorded output may predate the current definition — re-run to confirm.
current_test_score(df, "20231224", 23, 59,'../submission/phase_3/phase3_final_agg_fulemp.csv')

0.4499189200814097