In [None]:
# ! pip install --upgrade mlflow

In [None]:
import typing as T
import os
import datetime
import json

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
from scipy.stats import linregress, kendalltau, spearmanr
from scipy.sparse import csr
import mlflow

from power_ratings import pm_training as pmt

In [None]:
# ! pip install nb-black
# %load_ext nb_black

In [None]:
# Send everything logged from this notebook to one named MLflow experiment.
mlflow.set_experiment("feature-generation-2023b")
# Enable MLflow's scikit-learn autologging.
# NOTE(review): no scikit-learn estimator is fitted in the visible cells —
# confirm this call is still needed.
mlflow.sklearn.autolog()

In [None]:
# Season selection used when reading game data, team names, and the
# previously-calculated Elo scores (kept for comparison). The value is passed
# unchanged to `generate_all_season_features` for both leagues.
# NOTE(review): `None` presumably means "every available season" — confirm
# against pmt.read_in_data. To restrict the run, use a list instead, e.g.:
# seasons = [2020, 2021, 2022, 2023]
seasons = None


def generate_all_season_features(
    prefix: str,
    seasons: T.Optional[T.List[int]],
    starting_daynum: int = 0,
) -> pd.DataFrame:
    """Train the rating model for one league and export its feature tables.

    Steps, in order:
      1. Load games, Elo scores and team names via ``pmt.read_in_data``.
      2. Fit ratings with ``pmt.train_model_all_years`` using the league's
         ``PRE_SCALER`` / ``PRE_BASE`` constants.
      3. Join ratings with Elo and team names (``pmt.join_datasets``) and
         write both results as interim CSVs under ``data/``.
      4. Build the full feature table with ``pmt.get_full_features``, keep
         the exported columns, write it under ``output/`` and log that file
         as an MLflow artifact.

    Args:
        prefix: League prefix used in file names and to pick constants.
            ``"M"`` selects the ``pmt.M_*`` constants; any other value
            selects the ``pmt.W_*`` constants.
        seasons: Forwarded to ``pmt.read_in_data`` and
            ``pmt.train_model_all_years``; ``None`` is passed through as-is.
        starting_daynum: Forwarded to ``pmt.read_in_data``.

    Returns:
        The feature table restricted to the exported columns, i.e. the same
        data that is written to ``output/{prefix}_data_complete.csv``.
    """
    # ``DataFrame.to_csv`` does not create missing directories. Make sure the
    # targets exist before the (expensive) training starts, so a missing
    # folder cannot discard a finished run.
    for directory in ("data", "output"):
        os.makedirs(directory, exist_ok=True)

    games_df, elo_df, teamnames = pmt.read_in_data(
        prefix, seasons=seasons, starting_daynum=starting_daynum
    )
    # Report the last day present in the most recent season as a quick check
    # on how current the input data is.
    latest_season_games = games_df[games_df.Season == games_df.Season.max()]
    print(f"Max DayNum {latest_season_games.DayNum.max()}")

    if prefix == "M":
        pre_scaler, pre_base = pmt.M_PRE_SCALER, pmt.M_PRE_BASE
    else:
        pre_scaler, pre_base = pmt.W_PRE_SCALER, pmt.W_PRE_BASE

    ratings_df = pmt.train_model_all_years(
        games_df,
        pre_scaler=pre_scaler,
        pre_base=pre_base,
        seasons=seasons,
    )
    output, joined = pmt.join_datasets(ratings_df, elo_df, teamnames)

    output.to_csv(f"data/{prefix}_data_interim.csv", index=False)
    joined.to_csv(f"data/{prefix}_features_interim.csv", index=False)

    df_for_eff = pmt.get_df_for_eff(prefix)
    new_features = pmt.get_full_features(
        df_for_eff, output.set_index(["Season", "TeamID"]), prefix
    )

    # Columns (and their order) of the exported feature file.
    export_columns = [
        "Season",
        "TeamName",
        "TeamID",
        "WP16",
        "CombinedRating",
        "OffensiveRating",
        "DefensiveRating",
        "EloWithScore",
        "EloWinLoss",
        "EloDelta21Days",
        "PossessionEfficiencyFactor",
        "TempoEstimate",
        "ScoreVariance",
        "EloDay30WithScore",
        "EloDay30WinLoss",
    ]
    new_features = new_features[export_columns]

    complete_path = f"output/{prefix}_data_complete.csv"
    new_features.to_csv(complete_path, index=False)
    mlflow.log_artifact(complete_path)
    return new_features

In [None]:
# Build the feature tables for the men's ("M") and women's ("W") data using
# the same season selection.
# NOTE(review): the name `m` is rebound by `with pm.Model() as m:` in a later
# cell, so cells that use the men's frame must run before that one.
m = generate_all_season_features("M", seasons=seasons)
w = generate_all_season_features("W", seasons=seasons)

In [None]:
# Top 20 men's rows by WP16; no season filter is applied, so rows from all
# seasons compete in the same ranking.
m.sort_values("WP16", ascending=False).head(20)

In [None]:
# Top 20 women's rows by WP16; no season filter is applied, so rows from all
# seasons compete in the same ranking.
w.sort_values("WP16", ascending=False).head(20)

In [None]:
# Record build metadata next to the generated feature files: today's date
# plus a label describing how current the input data is.
# NOTE(review): "data_date" is a hard-coded label; it has to be edited by hand
# whenever the input data changes (compare with the "Max DayNum" line printed
# during feature generation).
with open("output/build_data.json", 'w') as f:
    json.dump({"build_date": datetime.date.today().strftime('%Y-%m-%d'), "data_date":" - 2023 Season - Day 132"}, f)

In [None]:
# How strongly are the men's offensive and defensive ratings related?
# Prints, in order: Kendall's tau, Spearman's rank correlation, and the R^2
# (squared r-value) of a simple linear regression between the two columns.
print(kendalltau(m.OffensiveRating, m.DefensiveRating))
print(spearmanr(m.OffensiveRating, m.DefensiveRating))
print(linregress(m.OffensiveRating, m.DefensiveRating).rvalue ** 2)

In [None]:
# List the files currently present in the output directory.
# (`os` is already imported in the top import cell; re-importing is harmless.)
import os

os.listdir("output")

In [None]:
import jupyter_black

jupyter_black.load(lab=False)

# Attach TeamID from the teams file to the women's complete feature file.
pr = "W"
orig_df = pd.read_csv(f"output/{pr}_data_complete.csv")
team_df = pd.read_csv(f"data/{pr}Teams.csv", usecols=["TeamName", "TeamID"])
# The feature file written by generate_all_season_features already carries a
# TeamID column. Joining on TeamName alone would then produce TeamID_x /
# TeamID_y and the original TeamID column name would be lost, so join on both
# keys whenever the feature file has them; fall back to TeamName only for
# files that lack TeamID.
merge_keys = ["TeamName", "TeamID"] if "TeamID" in orig_df.columns else ["TeamName"]
joined = pd.merge(orig_df, team_df, how="inner", on=merge_keys)
# The two shapes should have the same row count; a smaller `joined` means some
# teams did not match the teams file.
print(joined.shape, orig_df.shape)

In [None]:
# Overwrite the complete feature file for the league in `pr` with the merged
# frame. NOTE(review): this replaces the file in place — check the shapes
# printed by the merge cell before running it.
joined.to_csv(f"output/{pr}_data_complete.csv", index=False)

In [None]:
import jupyter_black

jupyter_black.load(lab=False)

# Switch to the men's data: reload the complete feature file from disk.
# This rebinds both `pr` and `orig_df`, which earlier cells set for "W".
pr = "M"
orig_df = pd.read_csv(f"output/{pr}_data_complete.csv")

In [None]:
# These imports are only referenced by the commented-out
# `probabilistic_estimate_df(...)` call further down; no visible live cell
# uses them.
from power_ratings.tournament_dataset import probabilistic_estimate_df
from power_ratings.pm_training import W_PRE_BASE, W_PRE_SCALER, M_PRE_BASE, M_PRE_SCALER

# Display the feature frame most recently loaded into `orig_df`.
orig_df

In [None]:
# group by year and get the nth best statline


In [None]:
# probabilistic_estimate_df(
#     df_for_combined_rating, base=W_PRE_BASE, scaler=W_PRE_SCALER
# )

In [None]:
from scipy.stats import spearmanr, pearsonr

In [None]:
# Rank teams within each season by CombinedRating (rank 1 = highest rating)
# and store the rank in a new "crank" column. This mutates the frame in place.
# NOTE(review): `df_for_combined_rating` is not assigned in any visible cell,
# so this fails on a fresh kernel — restore the cell that builds it.
df_for_combined_rating["crank"] = df_for_combined_rating.groupby("Season")[
    "CombinedRating"
].rank(ascending=False)

In [None]:
# Keep the best-ranked teams of every season.
# NOTE(review): `crank < 64` keeps ranks below 64, i.e. at most 63 teams per
# season when there are no ties — confirm whether `<= 64` was intended.
top_teams = df_for_combined_rating[df_for_combined_rating.crank < 64]

In [None]:
# Pearson correlation between CombinedRating and WinProbAgainstGoodTeam over
# the rows of `top_teams`.
pearsonr(
    top_teams.CombinedRating.values,
    top_teams.WinProbAgainstGoodTeam.values,
)

In [None]:
# 20 rows of `top_teams` with the highest WinProbAgainstGoodTeam, across all
# seasons in the frame.
top_teams.sort_values("WinProbAgainstGoodTeam", ascending=False).head(20)

In [None]:
# 2023 only: first 49 rows by WinProbAgainstGoodTeam (descending), with every
# column whose name contains "T2" dropped. `reset_index()` is called without
# `drop=True`, so the previous index is kept as a column.
top_teams[top_teams.Season == 2023].sort_values(
    "WinProbAgainstGoodTeam", ascending=False
).head(49).drop(columns=[i for i in top_teams.columns if "T2" in i]).reset_index()

In [None]:
# Per-season standard deviation of WinProbAgainstGoodTeam, smallest first.
# Select the column *before* aggregating: calling `.std()` on the whole
# grouped frame also aggregates every other column, which raises a TypeError
# on non-numeric columns in pandas >= 2.0 and wastes work otherwise.
top_teams.groupby("Season")["WinProbAgainstGoodTeam"].std().sort_values()

In [None]:
import plotly.express as px

# Interactive scatter of CombinedRating against WinProbAgainstGoodTeam, one
# point per row, coloured by T1OffensiveRating; every column is shown on hover.
px.scatter(df_for_combined_rating, x="WinProbAgainstGoodTeam", y="CombinedRating",
           color="T1OffensiveRating", hover_data=df_for_combined_rating.columns)

In [None]:
! python -m pip install pyreadr

In [None]:
import pyreadr

# Load the 2023 team box-score file (R .rds format) through pyreadr.
result = pyreadr.read_r("./data/team_box_2023.rds")

In [None]:
# The loaded object is looked up under the key `None` in the mapping returned
# by `read_r`; show which columns it has.
result[None].columns

In [None]:
with pm.Model() as m:
    u = pm.Uniform('hi', lower=0, upper=3)