# About 
This notebook shows how to convert a feature table (including the odds column) into the desired submission format, covering:
- column header
- concatenated ID format (e.g. `1985_1116_1234`)
- (optionally) a warning if the length is not correct

***
# Preface
## Imports

In [1]:
import os

# Run from the project root so that the `src` package can be imported.
# The root defaults to the original container path, but can be overridden
# through the PROJECT_ROOT environment variable so the notebook is not tied
# to one machine's directory layout.
os.chdir(os.environ.get("PROJECT_ROOT", "/home/jovyan/work"))

from IPython.core.display import HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline

from src.transformer import HistWinLossTransformer, OddTransformer, SeedTransformer, WinLossTransformer
from src.utils import format_submission, get_table

***
# Get some data
Process some data to show how the submission formatter works. Example taken from [0.6-jgoerner-odds-seeds](./0.6-jgoerner-odds-seeds.ipynb).

In [2]:
# load the tourney results, keeping only the season and the two team id columns
df_ncaa = get_table("t_original_ncaa_tourney_compact_results")[
    ["season", "w_team_id", "l_team_id"]
]

# transformer 1: historical wins & losses, configured with the team id columns
wltrans_hist = HistWinLossTransformer("w_team_id", "l_team_id")

# transformer 2: odds, configured with the wins_hist_* / losses_hist_* column names
otrans_hist = OddTransformer(
    ["wins_hist_a"], ["losses_hist_a"],
    ["wins_hist_b"], ["losses_hist_b"],
    n_samples=100, n_experiments=100,
)

# transformer 3: seeds, configured with the team id columns
strans = SeedTransformer("w_team_id", "l_team_id")

# run the three transformers one after another
ppl = Pipeline(
    steps=[
        ("win_loss_transformer", wltrans_hist),
        ("odd_transfomer", otrans_hist),
        ("seed_transformer", strans),
    ]
)

# enriched feature table
df_ncaa_hist_odds_seed = ppl.fit_transform(df_ncaa)

# seed rank difference (b minus a): positive when side a has the numerically
# lower seed rank; the absolute value is stored alongside it
seed_gap = df_ncaa_hist_odds_seed["seed_rank_b"] - df_ncaa_hist_odds_seed["seed_rank_a"]
df_ncaa_hist_odds_seed["seed_diff"] = seed_gap
df_ncaa_hist_odds_seed["seed_diff_abs"] = seed_gap.abs()

# preview the first rows
df_ncaa_hist_odds_seed.head()

Unnamed: 0,season,w_team_id,l_team_id,wins_hist_a,wins_hist_b,losses_hist_a,losses_hist_b,odds_a,seed_rank_a,seed_region_a,seed_rank_b,seed_region_b,seed_diff,seed_diff_abs
0,1985,1116,1234,30,16,18,17,0.8887,9,X,8,X,-1,1
1,1985,1120,1345,12,28,7,24,0.741,11,Z,6,Z,-5,5
2,1985,1207,1250,33,1,21,5,0.9768,1,W,16,W,15,15
3,1985,1229,1425,2,9,4,12,0.363,9,Y,8,Y,-1,1
4,1985,1242,1325,80,3,30,5,0.9811,3,Z,14,Z,11,11


# Format the data

In [5]:
# Tell the formatter which feature-table columns hold the season, the two
# team ids and the odds value to submit.
submission_columns = {
    "col_season": "season",
    "col_team_id_w": "w_team_id",
    "col_team_id_l": "l_team_id",
    "col_odds_w": "odds_a",
}

# The optional warning is switched off for this example.
df_submission_ready = format_submission(
    X=df_ncaa_hist_odds_seed,
    warning=False,
    **submission_columns,
)

# preview the first ten formatted rows
df_submission_ready.head(10)

Unnamed: 0,id,pred
0,1985_1116_1234,0.8887
1,1985_1120_1345,0.741
2,1985_1207_1250,0.9768
3,1985_1229_1425,0.363
4,1985_1242_1325,0.9811
5,1985_1246_1449,0.9896
6,1985_1256_1338,0.3825
7,1985_1233_1260,0.0119
8,1985_1292_1314,0.003
9,1985_1323_1333,0.9755
