In [2]:
import wandb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
# Open a client for the Weights & Biases public API.
api = wandb.Api()

# Location of the runs on W&B, addressed as "<entity>/<project>".
entity = "haeri-hsn"  # Replace with your wandb entity
project = "stream_learning"  # Replace with your wandb project name

# Server-side filter applied to the run query:
#   * only runs whose state is 'finished'
#   * only runs whose tags include 'teconer_final_v3'
# Other tags that have been used with this notebook:
#   msmsa_horizon_analysis_melbourne_housing, msmsa_anchor_analysis_melbourne_housing
filters = dict(
    state='finished',
    tags={'$in': ['teconer_final_v3']},
    # Optional extra key (disabled): restrict to runs created after a given date.
    # 'CreatedAt': {'$gt': '2024-7-9'}
)

# Fetch the matching runs and report how many were found.
runs = api.runs("/".join((entity, project)), filters=filters)
print(f"Number of runs: {len(runs)}")

Number of runs: 28


In [26]:
# Build `df`, the results table used by every plotting cell below: one row per
# experiment, taken from the W&B runs fetched above plus a few locally pickled results.

# preview_window is divided by this before plotting; the plots label the result
# "Preview [min]".
SECONDS_PER_MINUTE = 60

# --- 1. Collect name, hyperparameters and final metrics of every W&B run ---------
summary_list = []
config_list = []
name_list = []
for run in runs:
    # run.summary contains the output of the training
    summary_list.append(run.summary._json_dict)

    # run.config contains the hyperparameters; keys starting with '_' are skipped
    config_list.append({k: v for k, v in run.config.items() if not k.startswith('_')})

    # run.name is the name of the run
    name_list.append(run.name)

summary_df = pd.DataFrame.from_records(summary_list)
config_df = pd.DataFrame.from_records(config_list)
name_df = pd.DataFrame({'name': name_list})

# --- 2. Combine the three parts side by side and normalise the columns -----------
df = pd.concat([name_df, config_df, summary_df], axis=1)

# Strip the "metric_" prefix BEFORE de-duplicating, so that a rename cannot
# re-introduce a duplicated column label afterwards.
df = df.rename(columns=lambda x: x.replace("metric_", ""))

# Keep only the first occurrence of every column label. This must happen before any
# single-column access: with a duplicated label, df['preview_window'] is a DataFrame
# rather than a Series.
df = df.loc[:, ~df.columns.duplicated()]

df['preview_window'] = df['preview_window'].astype('float') / SECONDS_PER_MINUTE
print(df.columns)

# Drop the runs that have no preview_window.
df = df.dropna(subset=['preview_window'])
print(df['dataset'].unique())
print(df['preview_window'].unique())

# --- 3. Append results that only exist as local pickle files ----------------------
list_of_files = ['Teconer_results/firm-elevator-1052.pkl', 'Teconer_results/jumping-gorge-1050.pkl', 'Teconer_results/vague-wind-1050.pkl', 'Teconer_results/dry-sky-1049.pkl']

# NOTE(security): unpickling can execute arbitrary code - only load files you trust.
# Rows are gathered in a list and concatenated once instead of growing the frame
# inside the loop.
new_rows = []
for file_name in list_of_files:
    # Each pickle holds a 2-tuple; only the second element (the summary mapping) is used.
    _, summary = pd.read_pickle(file_name)
    new_rows.append({
        'R2': summary['metric_R2'],
        'RMSE': summary['metric_RMSE'],
        'MAE': summary['metric_MAE'],
        'dataset': summary['dataset'],
        'preview_window': summary['preview_window'] / SECONDS_PER_MINUTE,
        # epsilon is not read from the pickle; it is fixed to 0.9 for these rows.
        # TODO confirm this matches the setting the pickled experiments were run with.
        'epsilon': .9,
    })

if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# --- 4. One row per (dataset, preview_window, epsilon) ----------------------------
# When several rows share the same key, keep the one with the highest R2.
df = df.sort_values('R2', ascending=False).drop_duplicates(['dataset', 'preview_window', 'epsilon'], keep='first')


Index(['name', 'seed', 'tags', 'dataset', 'epsilon', 'wandb_log',
       'base_learner', 'online_model', 'RMSE', '_timestamp', 'MAE', 'TMI',
       'learning_model', '_wandb', 'training_size_log', '_runtime',
       'average_records_per_trip', 'R2', '_step', 'average_preview_records',
       'average_training_size', 'preview_window'],
      dtype='object')
['teconer_helsinki_jan2018' 'teconer_helsinki_jan2018_100K']
[ 5. 10. 30.  1.]
60
600
300
1800


## MAE, RMSE, R2 Barplots

In [59]:
%matplotlib qt
plt.close('all')
# make defaul font of the plots arial
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['font.family'] = "sans-serif"
# make the font size 12
plt.rcParams.update({'font.size': 13})

# datasets = list(df['dataset'].unique())
datasets = ['teconer_helsinki_jan2018_100K']

for dataset in datasets:
    df_ = df[(df['dataset'] == dataset)]
    print(df_['epsilon'].unique())
    print(df_['preview_window'].unique())
    f, axs = plt.subplots(1, 3, figsize=(11, 5))
    for i, metric in enumerate(['R2', 'RMSE', 'MAE']):
        # plot barplot of R2 scores for different epsilon values and different preview durations
        sns.set_theme(style="whitegrid")
        # Try using .loc[row_indexer,col_indexer] = value instead
        sns.barplot(data=df_, x="epsilon", y=metric, hue="preview_window", errorbar="sd", palette="coolwarm", ax=axs[i])
        axs[i].set_xlabel("epsilon")
        axs[i].set_ylabel(metric)
        # rename legend title
        if i == 1:
            axs[i].legend(title="Preview [min]", ncol=df_['preview_window'].nunique(), loc='upper center', bbox_to_anchor=(0.5, 1.3))
        else:
            axs[i].get_legend().remove()
        
        if metric == 'R2':
            axs[i].set_ylim(0, 1)
        if metric == 'RMSE':
            axs[i].set_ylim(0, .14)
        if metric == 'MAE':
            axs[i].set_ylim(0, .09)
        # print(df_.groupby(['epsilon', 'preview_window'])[metric].mean())
        # add plot title below the axis


    # f.set_size_inches(12, 5)
    # f.suptitle(dataset)
    # increase the space between the plots
    plt.subplots_adjust(wspace=0.3, hspace=0.5, top=0.80, bottom=0.15, left=0.07, right=.98)
    # plt.tight_layout()
    # save to \plots folder
    plt.savefig(f"teconer_metrics_{dataset}.pdf")
    
plt.show()





[0.95 0.55 0.7 ]
[ 1.  5. 10. 30.]


In [55]:
%matplotlib qt
plt.close('all')


# datasets = list(df['dataset'].unique())
datasets = ['teconer_helsinki_jan2018']

for dataset in datasets:
    df_ = df[(df['dataset'] == dataset)]
    # f, axs = plt.subplots(1, 1, figsize=(5, 5))
    
    for i, metric in enumerate(['R2', 'RMSE', 'MAE']):
        plt.figure(dataset, figsize=(5, 6))
        # plot barplot of R2 scores for different epsilon values and different preview durations
        sns.set_theme(style="whitegrid")
        # Try using .loc[row_indexer,col_indexer] = value instead
        sns.barplot(data=df_, x="epsilon", y=metric, hue="preview_window", errorbar="sd", palette="coolwarm")
        plt.xlabel("epsilon")
        plt.ylabel(metric)
        # rename legend title
        # if i == 1:
        plt.legend(title="Preview [min]", ncol=df_['preview_window'].nunique(), loc='upper center', bbox_to_anchor=(0.5, 1.3))
        # else:
            # axs[i].get_legend().remove()
        
    # # f.set_size_inches(12, 5)
    # # f.suptitle(dataset)
    # # increase the space between the plots
    # plt.subplots_adjust(wspace=0.5, hspace=0.5, top=0.80, bottom=0.2)
        plt.tight_layout()
        # save to \plots folder
        plt.savefig(f"teconer_{dataset}_{metric}.svg")
    
plt.show()

In [19]:
%matplotlib qt


# print average_ahead_records for each dataset, and preview_duration in a table
df_ = df.groupby(['dataset', 'preview_window'])['average_preview_records'].mean().reset_index()
df_ = df_.pivot(index='dataset', columns='preview_window', values='average_preview_records')
df_ = df_.fillna(0)
# df_ = df_.sort_values(by='dataset', ascending=False)
# sort df_ based on the dataset names (given in the list)
# df_ = df_.reindex(['Teconer_Jan_10K', 'Teconer_Jan_100K', 'Teconer_Jan_1M','Teconer_Downtown_10K', 'Teconer_Downtown_100K', 'Teconer_Downtown_1M'])
df_ = df_.round(2)
sns.heatmap(df_, annot=True, cmap="coolwarm", fmt=".2f", cbar=False)
plt.title('Average number of queried preview points')
plt.xlabel('Preview [min]')
plt.ylabel('Dataset')
plt.tight_layout()
plt.show()


# print df 
print(df_)



preview_window                  1.0     5.0     10.0     30.0
dataset                                                      
teconer_helsinki_jan2018       52.04  244.44  458.18  1097.23
teconer_helsinki_jan2018_100K   3.77   14.26   26.18    60.28


In [147]:
# Label each row with a location derived from the dataset name: 'Downtown' when the
# name contains "downtown", otherwise 'Jan 2018'. The match ignores case because the
# dataset names used in this notebook are lowercase.
df['location'] = df['dataset'].apply(lambda x: 'Downtown' if 'downtown' in x.lower() else 'Jan 2018')

# Label each row with a size derived from the dataset name: '10K' or '100K' when that
# marker appears in the name, otherwise '1M'.
# NOTE(review): names without a size marker fall through to '1M' - presumably the
# full dataset; confirm.
df['size'] = df['dataset'].apply(lambda x: '10K' if '10K' in x else ('100K' if '100K' in x else '1M'))

# Mean `average_records_per_trip` for every location (rows) and size (columns).
df_ = df.groupby(['location', 'size'])['average_records_per_trip'].mean().reset_index()
df_ = df_.pivot(index='location', columns='size', values='average_records_per_trip')
# Combinations with no value are shown as 0.
df_ = df_.fillna(0)
df_ = df_.sort_values(by='location', ascending=False)
# pivot() orders the size labels alphabetically (100K, 10K, 1M); put them in
# increasing order so the x-axis reads as a sample count.
size_order = ['10K', '100K', '1M']
df_ = df_.loc[:, [s for s in size_order if s in df_.columns]]
df_ = df_.round(2)

# Draw on a fresh figure. Without an explicit axes, sns.heatmap draws into whatever
# axes is currently active, i.e. on top of a plot left open by an earlier cell.
fig, ax = plt.subplots()
sns.heatmap(df_, annot=True, cmap="coolwarm", fmt=".2f", cbar=False, ax=ax)
ax.set_title('Average records per trip')
ax.set_xlabel('Number of Samples in Dataset')
ax.set_ylabel('')
fig.tight_layout()
plt.show()

# Same numbers as a plain-text table.
print(df_)



size       100K    10K      1M
location                      
Jan 2018  95.88  11.25  956.94
Downtown  20.28   2.93  200.46
