In [99]:
import wandb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [100]:
# Handle to the Weights & Biases public API.
api = wandb.Api()

# Location of the runs on W&B; change both values to point at another
# entity / project.
entity = "haeri-hsn"
project = "stream_learning"
project_path = "/".join((entity, project))

# Server-side filter: finished runs that carry the 'teconer_final' tag.
# NOTE(review): the earlier inline note also listed the tags
# msmsa_horizon_analysis_melbourne_housing and
# msmsa_anchor_analysis_melbourne_housing, presumably alternatives for other
# analyses - confirm before reusing.
filters = dict(
    state='finished',
    tags={'$in': ['teconer_final']},
)

# Fetch the matching runs.
runs = api.runs(project_path, filters=filters)

# Report how many runs matched.
print(f"Number of runs: {len(runs)}")

Number of runs: 452


In [101]:
# Walk over the fetched runs once, collecting three parallel lists:
# logged summary values, hyperparameters, and run names.
summary_list, config_list, name_list = [], [], []
for run in runs:
    # Summary: the values logged by the run (metrics etc.).
    summary_list.append(run.summary._json_dict)

    # Config: hyperparameters, skipping keys that start with an underscore.
    public_config = {}
    for key, value in run.config.items():
        if key.startswith('_'):
            continue
        public_config[key] = value
    config_list.append(public_config)

    # Human-readable run name.
    name_list.append(run.name)

# One DataFrame per list, rows aligned by run order.
summary_df = pd.DataFrame.from_records(summary_list)
config_df = pd.DataFrame.from_records(config_list)
name_df = pd.DataFrame({'name': name_list})

# Place name, config and summary columns side by side.
df = pd.concat([name_df, config_df, summary_df], axis=1)

# Divide preview_duration by 60 (the plots below label it "Preview [min]").
df['preview_duration'] = df['preview_duration'].astype('float').div(60)

# Drop repeated column names, keeping the first occurrence of each.
is_first_occurrence = ~df.columns.duplicated()
df = df.loc[:, is_first_occurrence]

# Remove the "metric_" marker from column names.
df = df.rename(columns=lambda column: column.replace("metric_", ""))

# Show the resulting columns and the distinct datasets present.
print(df.columns)
print(df['dataset'].unique())





Index(['name', 'seed', 'tags', 'dataset', 'epsilon', 'wandb_log',
       'base_learner', 'online_model', 'learning_model', '_step', '_timestamp',
       'average_ahead_records', '_wandb', 'R2', 'RMSE', 'preview_duration',
       'TMI', '_runtime', 'MAE', 'online_models', 'average_records_per_trip'],
      dtype='object')
['Teconer_Downtown_10K' 'Teconer_Jan_100K' 'Teconer_Jan_1M'
 'Teconer_Downtown_100K' 'Teconer_Jan_10K' 'Teconer_Downtown_1M']


## MAE, RMSE, R2 Barplots

In [127]:
%matplotlib qt
# Start from a clean slate so figures from earlier executions do not pile up.
plt.close('all')

# Apply the seaborn theme once, BEFORE any figure exists. Style settings are
# picked up when axes are created, so calling set_theme inside the plotting
# loop (after plt.subplots) left the first figure styled differently from the
# rest on a fresh kernel.
sns.set_theme(style="whitegrid")

# One panel per metric, in this left-to-right order.
metrics = ['R2', 'RMSE', 'MAE']
datasets = list(df['dataset'].unique())

for dataset in datasets:
    # Runs belonging to the current dataset only.
    df_ = df[(df['dataset'] == dataset)]
    f, axs = plt.subplots(1, len(metrics), figsize=(11, 4))
    for i, metric in enumerate(metrics):
        # Bars: mean of the metric per epsilon, split by preview duration;
        # error bars show the standard deviation.
        sns.barplot(data=df_, x="epsilon", y=metric, hue="preview_duration", errorbar="sd", palette="coolwarm", ax=axs[i])
        axs[i].set_xlabel("epsilon")
        axs[i].set_ylabel(metric)
        if i == 1:
            # Keep a single shared legend, centred above the middle panel,
            # with one column per preview duration.
            axs[i].legend(title="Preview [min]", ncol=df_['preview_duration'].nunique(), loc='upper center', bbox_to_anchor=(0.5, 1.3))
        else:
            # The other panels would repeat the same legend; drop it if present.
            legend = axs[i].get_legend()
            if legend is not None:
                legend.remove()
    # Leave room between panels and above them for the legend.
    f.subplots_adjust(wspace=0.5, hspace=0.5, top=0.8, bottom=0.2)
    # Save through the figure handle (not pyplot's "current figure") into the
    # current working directory.
    f.savefig(f"teconer_metrics_{dataset}.svg")

plt.show()





In [134]:
%matplotlib qt


# Mean average_ahead_records per (dataset, preview_duration), laid out as a
# dataset x preview_duration table.
df_ = df.groupby(['dataset', 'preview_duration'])['average_ahead_records'].mean().reset_index()
df_ = df_.pivot(index='dataset', columns='preview_duration', values='average_ahead_records')
# Combinations with no runs are shown as 0.
df_ = df_.fillna(0)
# Fixed row order: Jan datasets first, then Downtown, each by increasing size.
df_ = df_.reindex(['Teconer_Jan_10K', 'Teconer_Jan_100K', 'Teconer_Jan_1M','Teconer_Downtown_10K', 'Teconer_Downtown_100K', 'Teconer_Downtown_1M'])
df_ = df_.round(2)

# Draw on a dedicated figure. Without an explicit `ax`, seaborn draws on the
# current axes, which with the interactive qt backend can be the last barplot
# panel of the previous cell if that window is still open.
fig, ax = plt.subplots()
sns.heatmap(df_, annot=True, cmap="coolwarm", fmt=".2f", cbar=False, ax=ax)
ax.set_title('Average number of queried preview points')
ax.set_xlabel('Preview [min]')
ax.set_ylabel('Dataset')
fig.tight_layout()
plt.show()



In [147]:
# Derive a 'location' column from the dataset name: 'Downtown' when the name
# contains it, otherwise 'Jan 2018'.
df['location'] = df['dataset'].apply(lambda x: 'Downtown' if 'Downtown' in x else 'Jan 2018')

# Derive a 'size' column ('10K', '100K' or '1M') from the dataset name; any
# name containing neither '10K' nor '100K' falls back to '1M'.
df['size'] = df['dataset'].apply(lambda x: '10K' if '10K' in x else ('100K' if '100K' in x else '1M'))

# Mean records per trip for every (location, size) pair, as a
# location x size table.
df_ = df.groupby(['location', 'size'])['average_records_per_trip'].mean().reset_index()
df_ = df_.pivot(index='location', columns='size', values='average_records_per_trip')
# pivot orders the string labels lexicographically ('100K', '10K', '1M');
# put them in increasing dataset size so the x-axis reads naturally.
size_order = ['10K', '100K', '1M']
df_ = df_[[size for size in size_order if size in df_.columns]]
df_ = df_.fillna(0)
# Rows in descending label order ('Jan 2018' above 'Downtown').
df_ = df_.sort_index(ascending=False)
df_ = df_.round(2)

# Draw on a dedicated figure so the heatmap never lands on axes left open by
# an earlier cell (seaborn uses the current axes when `ax` is omitted).
fig, ax = plt.subplots()
sns.heatmap(df_, annot=True, cmap="coolwarm", fmt=".2f", cbar=False, ax=ax)
ax.set_title('Average records per trip')
ax.set_xlabel('Number of Samples in Dataset')
ax.set_ylabel('')
fig.tight_layout()
plt.show()

# Also print the values as a plain table.
print(df_)



size       100K    10K      1M
location                      
Jan 2018  95.88  11.25  956.94
Downtown  20.28   2.93  200.46
