# About 
This notebook shows how to convert a feature table (including an odds column) into the required submission format, which involves:
- column header
- concatenated ID format (`<season>_<lower team id>_<higher team id>`)
- optional warning if the number of rows is not 9112 (as required by the Kaggle challenge)

***
# Preface
## Imports

In [1]:
import os
os.chdir("/home/jovyan/work")

from IPython.core.display import HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline

from src.transformer import HistWinLossTransformer, OddTransformer, SeedTransformer, WinLossTransformer
from src.utils import format_submission, get_table

***
# Get some data
Process some data to show how the submission formatter works. Example taken from [0.6-jgoerner-odds-seeds](./0.6-jgoerner-odds-seeds.ipynb).

In [4]:
# load the compact tournament results, keeping only the season and both team ids
df_ncaa = get_table("t_original_ncaa_tourney_compact_results")[["season", "w_team_id", "l_team_id"]]

# transformer that derives the historical wins & losses
wltrans_hist = HistWinLossTransformer("w_team_id", "l_team_id")

# transformer that derives the odds from the historical win / loss columns
otrans_hist = OddTransformer(
    ["wins_hist_a"], ["losses_hist_a"],
    ["wins_hist_b"], ["losses_hist_b"],
    n_samples=100,
    n_experiments=100,
)

# transformer that derives the seeds
strans = SeedTransformer("w_team_id", "l_team_id")

# chain the three transformers; they are applied in the order listed
ppl = Pipeline(steps=[
    ("win_loss_transformer", wltrans_hist),
    ("odd_transfomer", otrans_hist),
    ("seed_transformer", strans),
])

# run the whole chain on the raw results
df_ncaa_hist_odds_seed = ppl.fit_transform(df_ncaa)

# seed difference (team b minus team a) and its absolute value
df_ncaa_hist_odds_seed["seed_diff"] = (
    df_ncaa_hist_odds_seed["seed_rank_b"] - df_ncaa_hist_odds_seed["seed_rank_a"]
)
df_ncaa_hist_odds_seed["seed_diff_abs"] = df_ncaa_hist_odds_seed["seed_diff"].abs()

# preview the first rows of the enriched frame
df_ncaa_hist_odds_seed.head()

Unnamed: 0,season,w_team_id,l_team_id,wins_hist_a,wins_hist_b,losses_hist_a,losses_hist_b,odds_a,seed_rank_a,seed_region_a,seed_rank_b,seed_region_b,seed_diff,seed_diff_abs
0,1985,1116,1234,30,16,18,17,0.8926,9,X,8,X,-1,1
1,1985,1120,1345,12,28,7,24,0.7418,11,Z,6,Z,-5,5
2,1985,1207,1250,33,1,21,5,0.9732,1,W,16,W,15,15
3,1985,1229,1425,2,9,4,12,0.3671,9,Y,8,Y,-1,1
4,1985,1242,1325,80,3,30,5,0.9774,3,Z,14,Z,11,11


***
# Format the data

In [11]:
# map the enriched frame's columns onto the formatter's arguments;
# the row-count warning is switched off for this example
df_submission_ready = format_submission(
    X=df_ncaa_hist_odds_seed,
    col_season="season",
    col_team_id_w="w_team_id",
    col_team_id_l="l_team_id",
    col_odds_w="odds_a",
    warning=False,
)

# preview the first ten formatted rows
df_submission_ready.head(10)

Unnamed: 0,id,pred,odds,invert
0,1985_1116_1234,0.8926,0.8926,0
1,1985_1120_1345,0.7418,0.7418,0
2,1985_1207_1250,0.9732,0.9732,0
3,1985_1229_1425,0.3671,0.3671,0
4,1985_1242_1325,0.9774,0.9774,0
5,1985_1246_1449,0.9878,0.9878,0
6,1985_1256_1338,0.3747,0.3747,0
7,1985_1233_1260,0.0106,0.9894,1
8,1985_1292_1314,0.0035,0.9965,1
9,1985_1323_1333,0.9743,0.9743,0


In [10]:
def format_submission(X, col_season, col_team_id_w, col_team_id_l, col_odds_w, out=None, warning=True,
                      expected_rows=9112):
    """Convert a feature table with winner odds into the submission layout.

    Every row of ``X`` describes one game. The game id is built as
    ``<season>_<smaller team id>_<larger team id>`` and the prediction is
    expressed from the point of view of the team with the smaller id: when
    the losing team has the smaller id, the winner's odds are flipped to
    ``1 - odds``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the four columns named below.
    col_season : str
        Name of the column holding the season.
    col_team_id_w : str
        Name of the column holding the winning team's id.
    col_team_id_l : str
        Name of the column holding the losing team's id.
    col_odds_w : str
        Name of the column holding the odds of the winning team.
    out : optional
        Accepted for interface compatibility; not used by this implementation.
    warning : bool, default True
        If true, emit a warning when the number of rows differs from
        ``expected_rows``.
    expected_rows : int, default 9112
        Number of records the submission is expected to have.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``X`` with the columns ``id``, ``pred``, ``odds``
        and ``invert``. The helper columns ``odds`` and ``invert`` are kept
        in the result.
    """
    # imported locally: the notebook's import cell does not provide `warnings`,
    # so referencing it at module level raised a NameError on the warning path
    import warnings

    team_ids = X[[col_team_id_w, col_team_id_l]]

    # id: season, then the smaller team id, then the larger team id
    ids = (
        X[col_season].astype(str)
        + "_"
        + team_ids.min(axis=1).astype(str)
        + "_"
        + team_ids.max(axis=1).astype(str)
    )

    # odds of the winning team, exactly as provided
    odds = X[col_odds_w]

    # 1 where the loser has the smaller team id (prediction must be flipped), else 0
    invert = (X[col_team_id_l] < X[col_team_id_w]).astype(int)

    # |odds - 0| keeps the odds, |odds - 1| turns them into 1 - odds
    pred = (odds - invert).abs()

    # column order matches the original output: id, pred, odds, invert
    df_result = pd.DataFrame({"id": ids, "pred": pred, "odds": odds, "invert": invert})

    # Kaggle-specific sanity check on the number of records
    n_rows = df_result.shape[0]
    if warning and n_rows != expected_rows:
        warnings.warn(
            "Dataframe has {} records, but submission needs {} ".format(n_rows, expected_rows),
            stacklevel=2,
        )

    return df_result