# Data Quality Checks

In [None]:
from airflow.models import Variable
import pandas as pd
import sqlalchemy
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment.
# The trailing semicolon keeps the notebook from echoing the return value.
load_dotenv();

In [None]:
# Connection settings and API key are read from Airflow Variables.
POSTGRES_USER = Variable.get("POSTGRES_USER")
POSTGRES_PW = Variable.get("POSTGRES_PW")
POSTGRES_HOST = Variable.get("POSTGRES_HOST")
MSF_API_KEY = Variable.get("MSF_API_KEY")

# SQLAlchemy's dialect name is "postgresql". The legacy "postgres" alias was
# removed in SQLAlchemy 1.4, where it makes create_engine() raise
# NoSuchModuleError; "postgresql" is accepted by older releases as well.
# NOTE(review): the credentials are interpolated unescaped, so a user name or
# password containing characters such as "@" or "/" must be URL-encoded first.
postgres_connection_str = f'postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PW}@{POSTGRES_HOST}/nba'

# Engine for the "nba" database, shared by the query cells in this notebook.
engine = sqlalchemy.create_engine(postgres_connection_str)

## Games

### Check that there are the correct number of games

In [None]:
# Summarise the games table: one row per (season, schedulestatus,
# playedstatus) with the number of games and the latest/earliest starttime.
sqlq = """
        select
            season
            , schedulestatus
            , playedstatus
            , count(*)
            , max(starttime)
            , min(starttime)
        from games as gm
        group by 1, 2, 3
        order by 1, 2, 3
    """

# The connection is only needed while the query runs.
with engine.connect() as conn:
    season_games_count = pd.read_sql(sql=sqlq, con=conn)

In [None]:
# Display the per-season game counts, broken down by schedulestatus and
# playedstatus, for a manual check against the expected schedule size.
season_games_count

## Game logs

### Check for any missing games amongst game logs

In [None]:
# Count, per season, the COMPLETED games that have no row at all in
# player_gamelogs. Seasons without such games do not appear in the result.
sqlq = """
        select
            season, count(*)
        from games as gm
        where
            not exists (
                select * from player_gamelogs as gl
                where gl.game_id = gm.id
            )
            and gm.playedStatus = 'COMPLETED'
        group by 1
        order by 1
    """

with engine.connect() as conn:
    missing_games_count = pd.read_sql(sql=sqlq, con=conn)

In [None]:
# An empty frame means every COMPLETED game has at least one player game log.
missing_games_count

In [None]:
# Fetch the full games rows for COMPLETED games that have no matching
# player_gamelogs row, so the individual offenders can be inspected.
sqlq = """
        select
            *
        from games as gm
        where
            not exists (
                select * from player_gamelogs as gl
                where gl.game_id = gm.id
            )
            and gm.playedStatus = 'COMPLETED'
    """

with engine.connect() as conn:
    missing_games = pd.read_sql(sql=sqlq, con=conn)

In [None]:
# Each row is a COMPLETED game with no player game logs loaded.
missing_games

### Check for mismatches between player and boxscore stats

In [None]:
# For every COMPLETED game, put the boxscore totals from games next to the
# points summed from player_gamelogs for the home and away team. The joins
# are left joins, so a team without game-log rows yields a null pts value.
sqlq = """
        with gamelog_totals as (
            select
                game_id
                , team_id
                , sum(stats_offense_pts) as pts
            from player_gamelogs
            group by 1, 2
        )
        select
              gm.id
            , gm.awayteam_id
            , gm.hometeam_id
            , gm.awayscoretotal
            , gm.homescoretotal
            , glh.pts as home_pts
            , gla.pts as away_pts
        from games as gm
        left join gamelog_totals as glh
            on glh.game_id = gm.id
            and glh.team_id = gm.hometeam_id
        left join gamelog_totals as gla
            on gla.game_id = gm.id
            and gla.team_id = gm.awayteam_id
        where gm.playedStatus = 'COMPLETED'
    """

with engine.connect() as conn:
    pts_totals = pd.read_sql(sql=sqlq, con=conn)

In [None]:
# Show the games where the summed player points differ from the boxscore
# total on either side. Rows with missing pts compare as unequal as well,
# so games lacking game logs are presumably listed here too.
home_mismatch = pts_totals["home_pts"] != pts_totals["homescoretotal"]
away_mismatch = pts_totals["away_pts"] != pts_totals["awayscoretotal"]
pts_totals.loc[home_mismatch | away_mismatch]

### Check for correct number of starter

In [None]:
# TODO

## Lineups

### Check for missing game lineups

#### Actual

In [None]:
# Count, per season, the COMPLETED games that have no lineups row of type
# 'actual'. Seasons without such games do not appear in the result.
sqlq = """
        select
            season, count(*)
        from games as gm
        where
            not exists (
                select * from lineups as l
                where
                    l.game_id = gm.id
                    and l.type = 'actual'
            )
            and gm.playedStatus = 'COMPLETED'
        group by 1 
        order by 1
    """

with engine.connect() as conn:
    missing_lineups_count = pd.read_sql(sql=sqlq, con=conn)

In [None]:
# An empty frame means every COMPLETED game has an 'actual' lineup.
missing_lineups_count

In [None]:
# Fetch the full games rows for COMPLETED games that have no lineups row of
# type 'actual', so the individual games can be inspected.
sqlq = """
        select
            *
        from games as gm
        where
            not exists (
                select * from lineups as l
                where
                    l.game_id = gm.id
                    and l.type = 'actual'
            )
            and gm.playedStatus = 'COMPLETED'
    """

with engine.connect() as conn:
    missing_lineups = pd.read_sql(sql=sqlq, con=conn)

In [None]:
# Each row is a COMPLETED game with no 'actual' lineup loaded.
missing_lineups

#### Expected

In [None]:
# Count, per season, the COMPLETED games that have no lineups row of type
# 'expected'. Seasons without such games do not appear in the result.
sqlq = """
        select
            season, count(*)
        from games as gm
        where
            not exists (
                select * from lineups as l
                where
                    l.game_id = gm.id
                    and l.type = 'expected'
            )
            and gm.playedStatus = 'COMPLETED'
        group by 1
        order by 1
    """

with engine.connect() as conn:
    missing_expected_lineups_count = pd.read_sql(sql=sqlq, con=conn)

In [None]:
# An empty frame means every COMPLETED game has an 'expected' lineup.
missing_expected_lineups_count

In [None]:
# NOTE(review): this cell repeats the display in the cell directly above;
# it looks redundant and can probably be removed.
missing_expected_lineups_count

In [None]:
# Fetch the full games rows for COMPLETED games that have no lineups row of
# type 'expected', so the individual games can be inspected.
sqlq = """
        select
            *
        from games as gm
        where
            not exists (
                select * from lineups as l
                where
                    l.game_id = gm.id
                    and l.type = 'expected'
            )
            and gm.playedStatus = 'COMPLETED'
    """

with engine.connect() as conn:
    missing_expected_lineups = pd.read_sql(sql=sqlq, con=conn)

In [None]:
# Each row is a COMPLETED game with no 'expected' lineup loaded.
missing_expected_lineups

### Check for correct number of starters

In [None]:
# TODO

## DFS