Debug Coverage
===


In [None]:
# Enable IPython's autoreload extension in mode 2 (reload imported modules
# before executing each cell) and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import os
import json
import sys
import pickle
from tqdm import tqdm

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
# Locate the top-level directory of the enclosing git repository.
from pathlib import Path
# IPython shell capture: runs git and binds its output lines to git_root_dir.
git_root_dir = !git rev-parse --show-toplevel
# Keep only the first output line, stripped, as a Path.
# NOTE(review): if git fails, the first line is presumably git's error
# message rather than a directory -- confirm this is only run inside a repo.
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
# Put <git root>/src on the module search path before importing the
# project's cbrec.genconfig module.
import sys
src_dir = os.path.join(git_root_dir, 'src')
sys.path.append(src_dir)
import cbrec.genconfig

In [None]:
# Instantiate the project's TestConfig; the next cell reads
# config.coverage_stats_dir from it to find the pickled coverage stats.
config = cbrec.genconfig.TestConfig()

In [None]:
# Unpickle the baseline prediction statistics stored in the configured
# coverage-stats directory, then display how many top-level entries it has.
# NOTE: pickle.load can execute arbitrary code; only load trusted files here.
baseline_predict_path = os.path.join(
    config.coverage_stats_dir,
    'baseline_predict.pkl',
)
with open(baseline_predict_path, 'rb') as infile:
    site_id_stat_maps = pickle.load(infile)
len(site_id_stat_maps)

In [None]:
# Build one DataFrame per baseline from the nested mapping loaded above
# (site_id_stat_maps[baseline] maps site_id -> stat_map), with one row per
# site and the site_id stored as a column.
baselines = site_id_stat_maps.keys()
baseline_df_dict = {}
for baseline in tqdm(baselines):
    # Tag a *copy* of each stat_map with its site_id so the loaded
    # site_id_stat_maps structure is not mutated as a side effect.
    # Assumes each stat_map is a plain mapping (dict-like) -- TODO confirm.
    stats_list = [
        {**stat_map, 'site_id': site_id}
        for site_id, stat_map in site_id_stat_maps[baseline].items()
    ]
    bdf = pd.DataFrame(stats_list)
    baseline_df_dict[baseline] = bdf
    # Report the number of rows (sites) collected for this baseline.
    print(baseline, len(bdf))
len(baseline_df_dict)

In [None]:
# Preview the first rows of the 'CosineSimilarity' baseline's DataFrame.
baseline_df_dict['CosineSimilarity'].head()

In [None]:
# Preview the first rows of the 'ClosestToStart' baseline's DataFrame.
baseline_df_dict['ClosestToStart'].head()

In [None]:
# Histogram of the per-site `n` column for each baseline, one panel per
# baseline on a 3x3 grid (zip stops at whichever runs out first).
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

for baseline, ax in zip(baselines, axes.ravel()):
    bdf = baseline_df_dict[baseline]
    x = bdf.n
    median_x = np.median(x)

    ax.set_title(baseline)
    ax.set_xlabel("Total Times Scored in Test Period")
    ax.hist(x, bins=20)
    # Dashed vertical line marks the median, which is also shown in the legend.
    ax.axvline(
        median_x,
        label=f'Median ({median_x:.1f})',
        color='black',
        linestyle='--',
        alpha=0.9,
    )
    ax.legend()

plt.show()

In [None]:
# Histogram of the per-site `mean` column for each baseline, one panel per
# baseline on a 3x3 grid (zip stops at whichever runs out first).
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

for baseline, ax in zip(baselines, axes.ravel()):
    bdf = baseline_df_dict[baseline]
    x = bdf['mean']
    median_x = np.median(x)

    ax.set_title(baseline)
    ax.set_xlabel("Mean Rank")
    ax.hist(x, bins=20)
    # Dashed vertical line marks the median, which is also shown in the legend.
    ax.axvline(
        median_x,
        label=f'Median ({median_x:.1f})',
        color='black',
        linestyle='--',
        alpha=0.9,
    )
    ax.legend()

plt.show()

In [None]:
# Histogram of the per-site `var` column for each baseline, one panel per
# baseline on a 3x3 grid (zip stops at whichever runs out first).
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

for baseline, ax in zip(baselines, axes.ravel()):
    bdf = baseline_df_dict[baseline]
    x = bdf['var']
    median_x = np.median(x)

    ax.set_title(baseline)
    ax.set_xlabel("Rank Variance")
    ax.hist(x, bins=20)
    # Dashed vertical line marks the median, which is also shown in the legend.
    ax.axvline(
        median_x,
        label=f'Median ({median_x:.1f})',
        color='black',
        linestyle='--',
        alpha=0.9,
    )
    ax.legend()

plt.show()

In [None]:
# For each baseline, draw a bar chart of how many sites have a positive
# n_top_5 / n_top_100 / n_top_1000 value, next to the number of sites with
# n > 0 ("Any"). Bars that fall short of the "Any" total are annotated with
# their share of it.
# NOTE(review): n_top_K presumably counts how often a site ranked in the
# top K -- confirm against the code that produced the pickle.
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

top_n_vals = [5, 100, 1000]
tick_labels = ["Top 5", "Top 100", "Top 1K", "Any"]

for baseline, ax in zip(baselines, axes.ravel()):
    ax.set_title(baseline)
    ax.set_xlabel("Best Site Ranks")
    ax.set_ylabel("Site Count")
    bdf = baseline_df_dict[baseline]

    # Number of sites with n > 0; used as the denominator for percentages.
    n = np.sum(bdf.n > 0)
    counts = [np.sum(bdf['n_top_' + str(n_val)] > 0) for n_val in top_n_vals]
    counts.append(n)
    counts = np.array(counts)
    xs = range(len(counts))
    ax.bar(xs, counts)
    ax.set_xticks(xs)
    ax.set_xticklabels(tick_labels)

    if n == 0:
        # No site has n > 0: a percentage of zero is undefined, so skip the
        # labels instead of dividing by zero and drawing "nan%".
        continue

    for x, count in zip(xs, counts):
        # Bars equal to the total (100%) are left unlabelled; comparing the
        # integer counts avoids a floating-point equality test.
        if count == n:
            continue
        pct = count / n
        ax.text(x, count, f"{pct*100:.2f}%", va='bottom', ha='center')

plt.show()

In [None]:
# Best rank vs. worst rank per site as a log-scaled hexbin (cells with fewer
# than 5 points are hidden), one panel per baseline, with a histogram of the
# best ranks overlaid along the bottom fifth of each panel.
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

# Mid-range viridis colour for the overlaid histogram bars.
marginal_color = matplotlib.cm.viridis(0.5)

for baseline, ax in zip(baselines, axes.ravel()):
    bdf = baseline_df_dict[baseline]
    x = bdf.best
    y = bdf.worst

    bins = ax.hexbin(
        x,
        y,
        gridsize=15,
        bins='log',
        mincnt=5,
    )
    ax.set_title(baseline)
    ax.set_xlabel("Best Rank")
    ax.set_ylabel("Worst Rank")

    # Inset spanning the full width and bottom 20% of the parent axes.
    axins = ax.inset_axes([0.0, 0.0, 1.0, 0.2], alpha=0.0)
    axins.hist(x, bins=15, color=marginal_color, alpha=0.9)
    # Hide the inset's own axes and make its background semi-transparent so
    # the hexbin underneath stays visible.
    for inset_axis in (axins.get_xaxis(), axins.get_yaxis()):
        inset_axis.set_visible(False)
    axins.patch.set_alpha(0.5)

plt.show()

In [None]:
# Plot the relationship between best rank and worst rank as a 2D histogram,
# one panel per baseline, with marginal histograms overlaid along the bottom
# (best rank) and the left edge (worst rank) of each panel.
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

for baseline, ax in zip(baselines, axes.ravel()):
    bdf = baseline_df_dict[baseline]
    x = bdf.best
    y = bdf.worst

    ax.hist2d(x, y, bins=15)
    ax.set_title(baseline)
    ax.set_xlabel("Best Rank")
    ax.set_ylabel("Worst Rank")

    # Bottom strip (full width, 20% height): distribution of best rank.
    axins = ax.inset_axes([0.0, 0.0, 1.0, 0.2], alpha=0.5)
    axins.hist(x, bins=15, color='black', alpha=0.5)
    axins.get_xaxis().set_visible(False)
    axins.get_yaxis().set_visible(False)
    axins.patch.set_alpha(0.5)

    # Left strip (10% width, full height): distribution of worst rank.
    # This marginal runs along the y axis, so it must histogram y; it
    # previously re-plotted x, showing the best-rank distribution twice.
    axins = ax.inset_axes([0.0, 0.0, 0.1, 1.0], alpha=0.5)
    axins.hist(y, bins=15, color='black', alpha=0.5, orientation='horizontal')
    axins.get_xaxis().set_visible(False)
    axins.get_yaxis().set_visible(False)
    axins.patch.set_alpha(0.5)

plt.show()