In [1]:
import datetime
import functools
import itertools
import os
from collections.abc import Callable

import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import pandas as pd
import seaborn as sns
import unidecode


%matplotlib inline

In [2]:
# Use the 'fivethirtyeight' matplotlib style for every figure drawn below
plt.style.use('fivethirtyeight')
# Let displayed DataFrames show up to 150 rows before pandas truncates them
pd.options.display.max_rows = 150

In [3]:
from filing import Filing
from scraper._dates import PLAYOFF_DATES

In [14]:
from collections.abc import Sequence
from typing import Any

def clean_name(name: str) -> str:
    """
    Standardizes name across Basketball-Reference, FD, DK

    Keeps only the first two whitespace-separated tokens (so suffixes such as
    'Jr.' or 'III' are dropped) and removes every period.
    Example: clean_name('Jaren Jackson Jr.') --> 'Jaren Jackson'
             clean_name('P.J. Tucker')       --> 'PJ Tucker'
    """
    # split() with no argument ignores leading / trailing / repeated whitespace;
    # split(' ') would produce empty tokens there and truncate the name
    first_two_tokens = name.split()[:2]
    return ' '.join(first_two_tokens).replace('.', '').strip()


def flatten(seq: Sequence[Sequence[Any]]) -> list[Any]:
    """
    Flattens one level of nesting, preserving order
    Example: flatten([[1, 2], [3], []]) --> [1, 2, 3]
    """
    return [element for inner_seq in seq for element in inner_seq]

def percentile(n: float) -> Callable[[Any], float]:
    """
    Builds a function calculating the n% outcome for players, designed for use in .agg
    Example: df.groupby('name')['fpts'].agg([percentile(0), percentile(50), percentile(100)])
         --> Returns 3 columns, indexed by name corresponding to following outcomes: 0% (minimum), 50% (median), 100% (maximum)
         --> Common usage will be 25% which roughly corresponds to floor and 75% which roughly corresponds to ceiling

    The returned function's __name__ is what .agg uses as the column label:
    'floor' for 25, 'median' for 50, 'ceiling' for 75, otherwise '<n>%'.
    """
    def percentile_(arr):
        return np.percentile(arr, n)

    percentile_.__name__ = {25: 'floor', 50: 'median', 75: 'ceiling'}.get(n, f'{n}%')
    return percentile_

def valuerange(arr):
    """
    Returns the range of values, max-min, to demonstrate spread of possible outcomes
    For small samples, more indicative of spread than std which can often be confusing
    """
    return np.max(arr) - np.min(arr)


# Label used by .agg for the resulting column. Set once at definition time:
# assigning it inside the function body only took effect after the first call,
# so the first aggregation was labelled 'valuerange' instead of 'range'.
valuerange.__name__ = 'range'



def product(a, b):
    """
    Multiplies two operands and hands back the result.

    Works for anything supporting `*`: plain numbers as well as element-wise
    containers such as pandas Series or numpy arrays.
    Intended as the folding step for functools.reduce(), which lets an
    arbitrary number of values be multiplied together:
        - functools.reduce(product, [1,2,3,4]) = 24
        - functools.reduce(product, [df[col] for col in [c1,c2,c3]]) = df[c1]*df[c2]*df[c3]
    """
    result = a * b
    return result

In [26]:
class SeasonData:
    """
    Box scores for one season plus per-team / per-player fantasy summaries.

    Attributes set in __init__:
        year, season, site  -- season start year, 'YYYY-YYYY+1' label, DFS site name
        filing              -- Filing helper used to locate / load the season's files
        CURRENT_TEAMS       -- teams on today's slate (empty unless year == CURRENT_YEAR)
        SALARIES            -- {player name: salary} (empty unless year == CURRENT_YEAR)
        current_df          -- today's contest file (only set when year == CURRENT_YEAR)
    """

    # Season for which today's contest file (salaries, positions, slate teams) is read
    CURRENT_YEAR = 2023

    # Contest-file team abbreviations replaced when building CURRENT_TEAMS
    # NOTE(review): presumably these match the abbreviations used in the box scores -- confirm
    TEAM_ALIASES = {
        'GSW': 'GS',
        'PHX': 'PHO',
        'NYK': 'NY',
        'NOP': 'NO',
        'SAS': 'SA'
    }

    # Position assigned to players missing from the contest files
    DEFAULT_POSITION = 'SG/SF'

    # A player logging fewer minutes than this in a game is treated as having missed it
    MIN_MINUTES = 6.0

    # From this local hour on, the '-late' contest file is read instead of the main one
    LATE_SLATE_HOUR = 20

    # Columns kept by load(); ** marks columns added by load() and not in the original data
    BASE_STATS = (
        'date',
        'name',
        'starter',
        'team',
        'opp',

        'mp',
        'fpts',  # ** copied from dk_fpts or fd_fpts depending on self.site
        'fppm',  # ** fpts / mp

        'pts',
        'ast',
        'trb',
        'stl',
        'blk',
        'tov',

        'usg',
        'ts',
        'ast_pct',
        'trb_pct',
        'pace',

        'spread',
        'total'
    )

    # Columns cast to int by load() when they are part of the requested stats
    INT_COLS = ('pts', 'ast', 'trb', 'stl', 'blk', 'tov', 'fga', 'fta', 'orb', 'plus_minus', 'pf')

    def __init__(self, year=2023, **kwargs):
        """
        Parameters:
            year   -- first calendar year of the season (2023 --> '2023-2024')
            site   -- keyword, 'draftkings' (default) or 'fanduel'
            late   -- keyword, force reading today's '-late' contest file

        Only 'site' and 'late' are read from kwargs; any other keyword
        (for example stats=[...]) is accepted but has no effect here.

        For the current season this reads today's contest csv from disk, so it
        raises whatever pd.read_csv raises if that file does not exist.
        """
        self.year = year
        self.season = f'{year}-{year+1}'

        self.site = kwargs.get('site', 'draftkings')

        self.filing = Filing(self.season, site=self.site)

        # Defined for every season so the summary methods can always look up
        # salaries (falling back to the site default) instead of raising AttributeError
        self.CURRENT_TEAMS = ()
        self.SALARIES = {}

        if year != self.CURRENT_YEAR:
            return

        is_fanduel = self.site == 'fanduel'
        team_col = 'Team' if is_fanduel else 'TeamAbbrev'
        name_col = 'Nickname' if is_fanduel else 'Name'

        use_late_file = datetime.datetime.now().hour >= self.LATE_SLATE_HOUR or kwargs.get('late', False)
        suffix = '-late' if use_late_file else ''
        file = f'{datetime.date.today().isoformat()}{suffix}.csv'

        self.current_df = pd.read_csv(os.path.join(self.filing.season_dir, 'contest-files', self.site, 'main-slate', file))
        self.current_df[name_col] = self.current_df[name_col].map(clean_name)

        self.CURRENT_TEAMS = tuple(
            self.TEAM_ALIASES.get(team_, team_)
            for team_ in self.current_df[team_col].drop_duplicates()
        )

        # One salary per player: the first row wins if a name is listed more than once
        slate = self.current_df.drop_duplicates(subset=name_col)
        self.SALARIES = {
            name_: int(salary_)
            for name_, salary_ in zip(slate[name_col], slate['Salary'])
        }

    # ------------------------------------------------------------------ #
    # Private helpers
    # ------------------------------------------------------------------ #

    @staticmethod
    def _stat_columns(base, extra) -> list:
        """Returns base followed by the entries of extra not already present (order kept, no duplicates)."""
        columns = list(base)
        for stat_ in extra:
            if stat_ not in columns:
                columns.append(stat_)
        return columns

    def _team_games(self, team: str, **kwargs) -> pd.DataFrame:
        """Returns the rows of the cleaned box scores belonging to team."""
        clean = self.load(**kwargs)
        return clean.loc[clean['team'] == team]

    @staticmethod
    def _shared_dates(tdf: pd.DataFrame, names, starting: bool = False) -> set:
        """
        Returns the dates on which every player in names appears in tdf
        (restricted to rows with starter == 1 when starting is True).
        With no names this is every date in tdf.
        """
        rows = tdf.loc[tdf['starter'] == 1] if starting else tdf
        shared = set(tdf['date'])
        for name in names:
            shared &= set(rows.loc[rows['name'] == name, 'date'])
        return shared

    def _summarize(self, games: pd.DataFrame, stats: list) -> pd.DataFrame:
        """
        Per-player count / min / floor / median / ceiling / max of stats over games,
        sorted by median fpts, with 'salary' and 'fpts_1k' (median fpts per $1,000
        of salary) inserted as the first two columns.
        """
        summary = (games
                   .groupby('name')
                   [stats]
                   .agg(['count', 'min', percentile(25), 'median', percentile(75), 'max'])
                   .sort_values(('fpts', 'median'), ascending=False)
                   # The game count is the same for every stat, keep it for the first only
                   .drop([(stat_, 'count') for stat_ in stats[1:]], axis=1)
                   .fillna(0.0)
                   .round(2)
                  )

        # Players missing from today's contest file get the site's minimum-style default salary
        defaultsal = 3_500 if self.site == 'fanduel' else 3_000
        salary = pd.Series(
            summary.index.map(lambda name_: self.SALARIES.get(name_, defaultsal)),
            index=summary.index
        )
        fpts_1k = 1_000 * summary[('fpts', 'median')] / salary

        summary.insert(0, 'salary', salary)
        summary.insert(1, 'fpts_1k', fpts_1k)

        return summary.round(2)

    def _report(self, tdf: pd.DataFrame, dates, options: dict) -> pd.DataFrame:
        """
        Restricts tdf to dates, then returns either the raw rows (options['raw'])
        or the per-player summary of fpts, mp, fppm and options['stats'].
        """
        games = tdf.loc[tdf['date'].isin(list(dates))]

        if options.get('raw', False):
            return games.sort_values(['date', 'fpts', 'mp'], ascending=False)

        stats = self._stat_columns(('fpts', 'mp', 'fppm'), options.get('stats', ()))
        return self._summarize(games, stats)

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def load(self, **kwargs) -> pd.DataFrame:
        """
        Returns the cleaned box scores, one row per player-game with mp > 0.

        Keyword arguments:
            stats -- extra box-score columns to keep on top of BASE_STATS

        The result is cached on self.clean. The cache is reused as long as it
        already holds every requested column; otherwise the frame is rebuilt
        with the previously loaded columns plus the newly requested ones.
        """
        requested = list(kwargs.get('stats', []))
        cached = getattr(self, 'clean', None)

        if cached is not None and all(stat_ in cached.columns for stat_ in requested):
            return cached

        is_current = self.year == self.CURRENT_YEAR

        stats = (['pos'] if is_current else []) + list(self.BASE_STATS)
        # 'playoffs' is re-appended below so it always stays the last column
        previously_loaded = [] if cached is None else [col_ for col_ in cached.columns if col_ != 'playoffs']
        stats = self._stat_columns(stats, previously_loaded + requested)

        raw = self.filing.load_boxscores()

        if not is_current:
            playoffs_start, playoffs_end = PLAYOFF_DATES[self.season]
            playoffs_dates = pd.date_range(playoffs_start, playoffs_end).strftime('%Y-%m-%d')

            # 1 when the game date falls inside the season's playoff window, else 0
            raw['playoffs'] = raw['date'].isin(playoffs_dates).astype('uint8')

            stats.append('playoffs')

        # Clean up name
        raw['name'] = raw['name'].map(unidecode.unidecode)

        if is_current:
            name_col = 'Name' if self.site == 'draftkings' else 'Nickname'
            pos_lookup = (self.filing.load_contests()
                          [[name_col, 'Position']]
                          .drop_duplicates(subset=name_col)
                          .set_index(name_col)
                          ['Position']
                          .to_dict()
                         )
            raw['pos'] = raw['name'].map(lambda name_: pos_lookup.get(name_, self.DEFAULT_POSITION))

        # Only cast the integer columns that are actually going to be returned
        int_casts = {
            col_: (lambda df_, col_=col_: df_[col_].astype('int'))
            for col_ in self.INT_COLS if col_ in stats
        }
        site_fpts_col = 'dk_fpts' if self.site == 'draftkings' else 'fd_fpts'

        self.clean = (raw
                      .assign(
                          ts=lambda df_: df_['ts'] * 100,
                          fpts=lambda df_: df_[site_fpts_col],
                          fppm=lambda df_: df_['fpts'] / df_['mp'],
                          **int_casts
                      )
                      .pipe(lambda df_: df_.loc[df_['mp'] > 0.0])
                      .sort_values(['date', 'team', 'fpts'], ascending=[True, True, False])
                      .round(3)
                      [stats]
                     )

        return self.clean

    def dates_without(self, without: str, team: str) -> tuple[str, ...]:
        """
        Returns sorted tuple of date strings for all dates player missed for team in season

        A date counts as missed when the player has no row for it, or played
        fewer than MIN_MINUTES minutes.
        """
        tdf = self._team_games(team)
        player_rows = tdf.loc[tdf['name'] == without]

        team_dates = set(tdf['date'])
        played_dates = set(player_rows['date'])
        barely_played_dates = set(player_rows.loc[player_rows['mp'] < self.MIN_MINUTES, 'date'])

        return tuple(sorted((team_dates - played_dates) | barely_played_dates))

    def performance_with(self, *names, team='placeholder', **kwargs) -> pd.DataFrame:
        """
        Returns dataframe of team performance when players are in (usually will be used with stars)

        Keyword arguments:
            team    -- required team abbreviation; an empty frame is returned if omitted
            without -- additionally restrict to games this player missed (see dates_without)
            stats   -- extra stats to summarize besides fpts, mp, fppm
            raw     -- return the matching box-score rows instead of the summary
        """
        if team == 'placeholder':
            print('Need to add team\n')
            return pd.DataFrame()

        tdf = self._team_games(team, **kwargs)

        dates_names_all_played = self._shared_dates(tdf, names)

        if 'without' in kwargs:
            # Computed once, not once per candidate date
            dates_names_all_played &= set(self.dates_without(kwargs['without'], team))

        print(f'Sample size with {", ".join(names)}: {len(dates_names_all_played)} games.')

        return self._report(tdf, dates_names_all_played, kwargs)

    def performance_with_starting(self, *names, team='placeholder', **kwargs) -> pd.DataFrame:
        """
        Similar to SeasonData.performance_with(), however instead of returning games where all players provided play, this returns
        games where all players provided started in an attempt to better resemble how rotations / minutes / fpts may be divided.

        If more than 5 players listed, cuts off so as to only do first 5
        """
        if team == 'placeholder':
            print('Need to add team\n')
            return pd.DataFrame()

        if len(names) > 5:
            print(f'More than 5 players input, only using {", ".join(names[:5])}\n')
            names = names[:5]

        tdf = self._team_games(team, **kwargs)

        dates_names_all_started = self._shared_dates(tdf, names, starting=True)

        if 'without' in kwargs:
            dates_names_all_started &= set(self.dates_without(kwargs['without'], team))

        print(f'Sample size with {", ".join(names)} all starting: {len(dates_names_all_started)} games.')

        return self._report(tdf, dates_names_all_started, kwargs)

    def performance_without_2(self, *names, team='placeholder', **kwargs) -> pd.DataFrame:
        """
        Returns dataframe of team performance in games that every player in names missed
        (multi-player version of SeasonData.performance_without())
        """
        if team == 'placeholder':
            print('Need to add team\n')
            return pd.DataFrame()

        tdf = self._team_games(team, **kwargs)

        dates_names_missed = set(tdf['date'])
        for name in names:
            dates_names_missed &= set(self.dates_without(name, team))

        print(f'Sample size with {", ".join(names)} missing: {len(dates_names_missed)} games.')

        return self._report(tdf, dates_names_missed, kwargs)

    def performance_without(self, without: str, team: str, *args, **kwargs) -> pd.DataFrame:
        """
        Returns dataframe of team performance without player

        Extra positional arguments are accepted and ignored.
        """
        tdf = self._team_games(team, **kwargs)

        # Dates without player
        wout_dates = self.dates_without(without, team)

        print(f'Sample size without {without}: {len(wout_dates)} games.')

        return self._report(tdf, wout_dates, kwargs)

    def performances_without_compared(self, without: str, team: str, *args, **kwargs) -> pd.DataFrame:
        """
        Similar to SeasonData.performance_without(without, team, *args, **kwargs)
            - Main difference is this compares samples to one another instead of deep analysis of just games without
            - Example: SeasonData.performances_without_compared('Donovan Mitchell', 'CLE')
                -> Change (without minus with) in mean outcomes for stats for Darius Garland, Evan Mobley, etc.
                -> Compares the two samples: Games players played with Mitchell and without

        Games in which the player logged fewer than MIN_MINUTES minutes count as
        "without" only, so the two samples never overlap.
        """
        tdf = self._team_games(team, **kwargs)

        # Dates without player
        wout_dates = self.dates_without(without, team)

        # Dates player really played: appeared in the box score and was not counted as missing
        played_dates = set(tdf.loc[tdf['name'] == without, 'date'])
        with_dates = played_dates.difference(wout_dates)

        stats = self._stat_columns(('fpts', 'mp', 'fppm', 'usg'), kwargs.get('stats', ()))

        with_games = tdf.loc[tdf['date'].isin(list(with_dates))]
        wout_games = tdf.loc[tdf['date'].isin(list(wout_dates))]

        with_df = with_games.groupby('name')[stats].mean()
        wout_df = wout_games.groupby('name')[stats].mean()

        # Games each teammate played in the "without" sample
        counts = wout_games.groupby('name').size()

        print(
            f'Sample size with {without}: {len(with_dates)}',
            f'Sample size without {without}: {len(wout_dates)}',
            sep='\n'
        )

        # Rows for teammates present in only one of the samples come out as NaN and are dropped
        ret_df = ((wout_df - with_df)
                  .dropna()
                  .round(2)
                 )

        ret_df.insert(0, 'n-games', ret_df.index.map(counts))

        return ret_df.sort_values('fpts', ascending=False)

    def performance_against(self, opp: str, **kwargs) -> pd.DataFrame:
        """
        Returns outcomes of all players against certain team, best fpts first

        Keyword arguments are forwarded to load() (e.g. stats=[...]).
        """
        clean = self.load(**kwargs)

        return (clean
                .loc[clean['opp'] == opp]
                .sort_values(['fpts', 'mp', 'fppm'], ascending=False)
               )

    def create_heatmap(self, **kwargs):
        """
        Draws a lower-triangle correlation heatmap of player-game stats and returns the Axes

        Keyword arguments:
            stats   -- extra stats to correlate besides fpts, mp, usg, ts, ast_pct, pts, pace
            starter -- truthy: starters only, falsy: non-starters only, omitted: everyone
            product -- columns whose pairwise products are added as extra columns
            combos  -- with product, also add products of 3..combos columns
            figsize, cmap, vmin, vmax -- plot options (defaults (15,10), 'jet_r', 0.5, 1.0)
        """
        stats = self._stat_columns(
            ('fpts', 'mp', 'usg', 'ts', 'ast_pct', 'pts', 'pace'),
            kwargs.get('stats', ())
        )

        output = [
            f'Stats included: {", ".join(stats[:-1])} and {stats[-1]}',
        ]

        startervals = (0, 1)

        if 'starter' in kwargs:
            startervals = (int(kwargs['starter']), )
            output.append(
                'Considering stats for starters only.' if startervals[0]
                else 'Considering stats for non-starters only.'
            )

        # Passing the stats on makes load() add any requested column the cache lacks
        clean = self.load(stats=stats)

        # Copy so that the product columns added below never touch the cached frame
        df = clean.loc[clean['starter'].isin(startervals), stats].copy()

        output = [f'Sample size: {df.shape[0]:,}'] + output

        # Include product of stats
        if 'product' in kwargs:
            product_str = 'Product columns:'
            # Pairs are always included; larger combinations only up to 'combos'
            largest_combo = max(2, kwargs.get('combos', 2))
            for n in range(2, largest_combo + 1):
                for cols in itertools.combinations(kwargs['product'], n):
                    df['*'.join(cols)] = functools.reduce(product, [df[col_] for col_ in cols])
                    product_str += f' {cols},'
            output.append(product_str)

        # Correlations dataframe
        corr = df.corr()

        # Better size
        fig, ax = plt.subplots(figsize=kwargs.get('figsize', (15, 10)))

        # Hide the upper triangle (and diagonal): it mirrors the lower one
        mask = np.triu(np.ones_like(corr, dtype=bool))

        print(*output, sep='\n')

        return sns.heatmap(
            corr,
            mask=mask,
            ax=ax,
            cmap=kwargs.get('cmap', 'jet_r'),
            vmin=kwargs.get('vmin', 0.5),
            vmax=kwargs.get('vmax', 1.0)
        )

In [27]:
# Daily-fantasy site passed to every SeasonData instance created below
SITE = 'draftkings'

In [28]:
# One SeasonData per season. Only the 2023 instance reads today's contest file
# (slate teams and salaries) when it is constructed.
# NOTE(review): SeasonData.__init__ only reads the 'site' and 'late' keywords, so
# stats=['plus_minus'] below has no effect -- pass stats to load() / the
# performance_* methods instead if that column is wanted.
szn2021 = SeasonData(year=2021, site=SITE)
szn2022 = SeasonData(year=2022, site=SITE)
szn2023 = SeasonData(year=2023, site=SITE, stats=['plus_minus'])

In [29]:
# MEM: per-player summary (plus usg) over games in which all five listed players started
szn2023.performance_with_starting('Luke Kennard', 'Ziaire Williams',  'Vince Williams', 'Jaren Jackson', 'Xavier Tillman', team='MEM', stats=['usg'])

Sample size with Luke Kennard, Ziaire Williams, Vince Williams, Jaren Jackson, Xavier Tillman all starting: 1 games.


Unnamed: 0_level_0,salary,fpts_1k,fpts,fpts,fpts,fpts,fpts,fpts,mp,mp,mp,mp,mp,fppm,fppm,fppm,fppm,fppm,usg,usg,usg,usg,usg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Vince Williams,6100,7.34,1,44.75,44.75,44.75,44.75,44.75,37.87,37.87,37.87,37.87,37.87,1.18,1.18,1.18,1.18,1.18,20.4,20.4,20.4,20.4,20.4
Jaren Jackson,7600,5.62,1,42.75,42.75,42.75,42.75,42.75,33.3,33.3,33.3,33.3,33.3,1.28,1.28,1.28,1.28,1.28,30.8,30.8,30.8,30.8,30.8
Gregory Jackson,4000,10.38,1,41.5,41.5,41.5,41.5,41.5,29.17,29.17,29.17,29.17,29.17,1.42,1.42,1.42,1.42,1.42,20.9,20.9,20.9,20.9,20.9
Santi Aldama,4500,5.78,1,26.0,26.0,26.0,26.0,26.0,17.1,17.1,17.1,17.1,17.1,1.52,1.52,1.52,1.52,1.52,22.5,22.5,22.5,22.5,22.5
Jacob Gilyard,4200,6.07,1,25.5,25.5,25.5,25.5,25.5,19.12,19.12,19.12,19.12,19.12,1.33,1.33,1.33,1.33,1.33,15.3,15.3,15.3,15.3,15.3
David Roddy,4600,5.16,1,23.75,23.75,23.75,23.75,23.75,21.4,21.4,21.4,21.4,21.4,1.11,1.11,1.11,1.11,1.11,27.4,27.4,27.4,27.4,27.4
Luke Kennard,5300,4.43,1,23.5,23.5,23.5,23.5,23.5,31.93,31.93,31.93,31.93,31.93,0.74,0.74,0.74,0.74,0.74,16.0,16.0,16.0,16.0,16.0
Xavier Tillman,5500,3.18,1,17.5,17.5,17.5,17.5,17.5,30.28,30.28,30.28,30.28,30.28,0.58,0.58,0.58,0.58,0.58,8.3,8.3,8.3,8.3,8.3
Ziaire Williams,4400,2.22,1,9.75,9.75,9.75,9.75,9.75,19.8,19.8,19.8,19.8,19.8,0.49,0.49,0.49,0.49,0.49,18.5,18.5,18.5,18.5,18.5


In [30]:
# MIN: per-player summary (plus usg) over games in which all five listed players started
szn2023.performance_with_starting('Mike Conley', 'Karl-Anthony Towns', 'Jaden McDaniels', 'Anthony Edwards', 'Rudy Gobert', team='MIN', stats=['usg'])

Sample size with Mike Conley, Karl-Anthony Towns, Jaden McDaniels, Anthony Edwards, Rudy Gobert all starting: 27 games.


Unnamed: 0_level_0,salary,fpts_1k,fpts,fpts,fpts,fpts,fpts,fpts,mp,mp,mp,mp,mp,fppm,fppm,fppm,fppm,fppm,usg,usg,usg,usg,usg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Anthony Edwards,8400,5.6,27,13.0,36.88,47.0,52.12,61.75,25.62,33.28,36.3,37.83,39.98,0.51,1.14,1.3,1.44,1.67,16.5,31.05,33.1,36.6,42.2
Karl-Anthony Towns,8000,5.0,27,21.0,32.5,40.0,43.38,70.5,27.17,30.03,31.87,35.35,41.68,0.74,1.01,1.22,1.37,2.02,18.2,23.0,26.8,30.4,40.1
Rudy Gobert,7400,4.49,27,12.75,28.12,33.25,40.62,48.75,23.9,28.69,33.57,36.1,40.75,0.53,0.84,1.11,1.15,1.71,8.8,13.6,15.6,17.65,24.8
Mike Conley,5900,3.98,27,14.0,20.62,23.5,29.12,36.75,20.4,25.89,28.83,30.9,36.23,0.5,0.69,0.85,1.04,1.5,6.1,11.85,14.1,15.85,18.9
Naz Reid,5000,4.3,27,8.0,15.25,21.5,25.62,43.5,14.38,18.62,21.03,24.28,31.4,0.42,0.81,1.0,1.17,1.38,14.7,19.05,22.9,25.05,33.7
Jaden McDaniels,5200,3.85,27,0.0,14.38,20.0,23.12,29.75,1.72,23.82,29.78,34.47,45.47,0.0,0.53,0.65,0.79,1.06,9.2,12.55,16.0,19.6,29.9
Kyle Anderson,4400,4.15,27,2.5,15.62,18.25,23.12,30.75,12.67,18.92,23.05,25.29,27.38,0.2,0.68,0.78,0.96,1.32,6.0,11.8,13.9,16.15,24.0
Nickeil Alexander-Walker,3900,3.46,27,5.0,9.5,13.5,18.0,28.25,12.83,17.36,18.88,21.42,33.08,0.26,0.47,0.74,0.9,1.61,5.5,10.75,12.1,16.3,23.2
Leonard Miller,3000,3.12,2,8.75,9.06,9.38,9.69,10.0,4.53,5.36,6.19,7.02,7.85,1.27,1.44,1.6,1.77,1.93,29.0,29.1,29.2,29.3,29.4
Shake Milton,3000,2.38,20,-0.5,1.25,7.12,13.06,20.0,1.12,3.41,11.54,15.47,22.73,-0.14,0.26,0.78,0.99,1.7,0.0,12.85,17.9,24.62,36.8


In [31]:
# IND: per-player summary (plus usg) over games in which all five listed players started
szn2023.performance_with_starting('Andrew Nembhard', 'Buddy Hield', 'Bennedict Mathurin', 'Jalen Smith', 'Myles Turner', team='IND', stats=['usg'])

Sample size with Andrew Nembhard, Buddy Hield, Bennedict Mathurin, Jalen Smith, Myles Turner all starting: 1 games.


Unnamed: 0_level_0,salary,fpts_1k,fpts,fpts,fpts,fpts,fpts,fpts,mp,mp,mp,mp,mp,fppm,fppm,fppm,fppm,fppm,usg,usg,usg,usg,usg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Jarace Walker,3600,8.75,1,31.5,31.5,31.5,31.5,31.5,26.18,26.18,26.18,26.18,26.18,1.2,1.2,1.2,1.2,1.2,18.5,18.5,18.5,18.5,18.5
Jalen Smith,5400,4.81,1,26.0,26.0,26.0,26.0,26.0,19.4,19.4,19.4,19.4,19.4,1.34,1.34,1.34,1.34,1.34,25.0,25.0,25.0,25.0,25.0
TJ McConnell,5900,3.77,1,22.25,22.25,22.25,22.25,22.25,18.65,18.65,18.65,18.65,18.65,1.19,1.19,1.19,1.19,1.19,21.4,21.4,21.4,21.4,21.4
Andrew Nembhard,5800,3.71,1,21.5,21.5,21.5,21.5,21.5,23.08,23.08,23.08,23.08,23.08,0.93,0.93,0.93,0.93,0.93,23.3,23.3,23.3,23.3,23.3
Buddy Hield,6100,3.11,1,19.0,19.0,19.0,19.0,19.0,22.72,22.72,22.72,22.72,22.72,0.84,0.84,0.84,0.84,0.84,16.8,16.8,16.8,16.8,16.8
Myles Turner,6900,2.39,1,16.5,16.5,16.5,16.5,16.5,17.15,17.15,17.15,17.15,17.15,0.96,0.96,0.96,0.96,0.96,27.7,27.7,27.7,27.7,27.7
Oscar Tshiebwe,3000,5.17,1,15.5,15.5,15.5,15.5,15.5,12.0,12.0,12.0,12.0,12.0,1.29,1.29,1.29,1.29,1.29,19.8,19.8,19.8,19.8,19.8
Bennedict Mathurin,6300,2.42,1,15.25,15.25,15.25,15.25,15.25,25.45,25.45,25.45,25.45,25.45,0.6,0.6,0.6,0.6,0.6,16.5,16.5,16.5,16.5,16.5
Jordan Nwora,3500,3.79,1,13.25,13.25,13.25,13.25,13.25,16.72,16.72,16.72,16.72,16.72,0.79,0.79,0.79,0.79,0.79,28.7,28.7,28.7,28.7,28.7
Kendall Brown,3000,4.0,1,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,1.0,1.0,1.0,1.0,1.0,25.7,25.7,25.7,25.7,25.7


In [39]:
# IND: per-player summary over games that both listed players missed
# (no box-score row, or fewer than 6 minutes played -- see SeasonData.dates_without)
szn2023.performance_without_2('Tyrese Haliburton', 'Andrew Nembhard', team='IND')

Sample size with Tyrese Haliburton, Andrew Nembhard missing: 1 games.


Unnamed: 0_level_0,salary,fpts_1k,fpts,fpts,fpts,fpts,fpts,fpts,mp,mp,mp,mp,mp,fppm,fppm,fppm,fppm,fppm
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Isaiah Jackson,4200,7.44,1,31.25,31.25,31.25,31.25,31.25,19.57,19.57,19.57,19.57,19.57,1.6,1.6,1.6,1.6,1.6
TJ McConnell,5900,5.0,1,29.5,29.5,29.5,29.5,29.5,25.67,25.67,25.67,25.67,25.67,1.15,1.15,1.15,1.15,1.15
Bruce Brown,5400,5.05,1,27.25,27.25,27.25,27.25,27.25,27.05,27.05,27.05,27.05,27.05,1.01,1.01,1.01,1.01,1.01
Aaron Nesmith,4900,5.36,1,26.25,26.25,26.25,26.25,26.25,26.23,26.23,26.23,26.23,26.23,1.0,1.0,1.0,1.0,1.0
Myles Turner,6900,2.93,1,20.25,20.25,20.25,20.25,20.25,21.1,21.1,21.1,21.1,21.1,0.96,0.96,0.96,0.96,0.96
Jarace Walker,3600,4.44,1,16.0,16.0,16.0,16.0,16.0,24.2,24.2,24.2,24.2,24.2,0.66,0.66,0.66,0.66,0.66
Ben Sheppard,3300,4.77,1,15.75,15.75,15.75,15.75,15.75,16.97,16.97,16.97,16.97,16.97,0.93,0.93,0.93,0.93,0.93
Oscar Tshiebwe,3000,4.83,1,14.5,14.5,14.5,14.5,14.5,7.33,7.33,7.33,7.33,7.33,1.98,1.98,1.98,1.98,1.98
Obi Toppin,4700,2.87,1,13.5,13.5,13.5,13.5,13.5,16.82,16.82,16.82,16.82,16.82,0.8,0.8,0.8,0.8,0.8
Bennedict Mathurin,6300,2.1,1,13.25,13.25,13.25,13.25,13.25,23.38,23.38,23.38,23.38,23.38,0.57,0.57,0.57,0.57,0.57


In [None]:
# Brooklyn: per-player summary (plus usg) over games in which all five listed players started.
# The team was 'SAC' (copied from the Sacramento cell), which filters to the wrong team's games.
# NOTE(review): confirm the abbreviation the box scores use for Brooklyn ('BKN' vs 'BRK').
szn2023.performance_with_starting('Spencer Dinwiddie', 'Mikal Bridges', 'Cameron Johnson', 'Dorian Finney-Smith', 'Nic Claxton', team='BKN', stats=['usg'])

In [38]:
# SAC: per-player summary (plus usg) over games in which all five listed players started
szn2023.performance_with_starting('Domantas Sabonis', "De'Aaron Fox", 'Keegan Murray', 'Kevin Huerter', 'Harrison Barnes', team='SAC', stats=['usg'])

Sample size with Domantas Sabonis, De'Aaron Fox, Keegan Murray, Kevin Huerter, Harrison Barnes all starting: 23 games.


Unnamed: 0_level_0,salary,fpts_1k,fpts,fpts,fpts,fpts,fpts,fpts,mp,mp,mp,mp,mp,fppm,fppm,fppm,fppm,fppm,usg,usg,usg,usg,usg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max,min,floor,median,ceiling,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Domantas Sabonis,10000,5.1,23,32.0,46.25,51.0,57.88,73.0,25.73,33.07,35.42,38.17,44.93,1.04,1.34,1.48,1.64,1.98,13.3,20.2,24.5,26.15,30.0
De'Aaron Fox,9100,5.47,23,18.5,41.88,49.75,56.75,69.5,25.03,32.83,34.87,38.72,42.88,0.64,1.13,1.4,1.65,1.73,25.0,30.15,32.9,36.05,40.3
Keegan Murray,6000,5.21,23,7.25,19.5,31.25,36.12,49.25,16.37,30.26,32.22,35.41,39.67,0.37,0.68,0.81,1.08,1.42,12.9,16.15,20.0,20.95,25.3
Malik Monk,6400,4.61,22,14.5,20.12,29.5,36.69,48.75,17.95,22.01,25.58,28.84,36.3,0.66,0.92,1.1,1.24,1.63,16.7,21.48,23.45,25.38,36.5
Trey Lyles,4200,4.14,16,2.5,12.5,17.38,22.5,30.5,11.43,15.9,19.53,25.43,33.1,0.14,0.66,0.84,0.95,1.31,0.0,8.5,11.1,15.55,25.7
Harrison Barnes,4300,4.01,23,2.0,13.5,17.25,21.25,45.0,11.83,23.98,30.73,33.85,38.75,0.17,0.47,0.56,0.67,1.35,5.6,10.35,12.8,15.15,23.2
Kevin Huerter,5200,3.12,23,0.0,11.25,16.25,24.62,50.5,0.68,16.57,23.18,28.95,38.72,0.0,0.58,0.74,0.93,1.34,0.0,14.95,16.1,17.75,27.5
Sasha Vezenkov,3500,2.79,19,2.5,6.25,9.75,17.62,27.25,6.22,9.7,12.38,17.08,20.18,0.29,0.59,0.85,1.0,1.93,6.0,14.8,16.0,20.2,27.0
Alex Len,3000,2.96,8,-1.0,0.0,8.88,12.06,25.5,1.93,4.82,8.97,11.74,15.52,-0.13,0.0,1.02,1.13,1.64,0.0,9.9,15.9,18.58,67.7
JaVale McGee,3000,2.67,18,1.25,5.88,8.0,15.44,22.75,3.83,6.22,8.23,11.89,15.48,0.19,0.82,1.03,1.55,3.49,0.0,13.5,19.45,30.38,86.1


In [None]:
# Teams found in today's contest file (tuple built in SeasonData.__init__); displayed as the cell output
TEAMS = szn2023.CURRENT_TEAMS
TEAMS

In [None]:
def _predict_outcomes(outcome: str, default_percentile: float, options: dict) -> pd.DataFrame:
    """
    Shared implementation behind predict_ceilings() and predict_floors().

    For every player with enough games, a personal threshold is computed as the requested
    percentile of `stat`. Each game is then flagged 1/0 depending on whether it reached the
    threshold (>= for 'ceiling', <= for 'floor'), and the flags are summarised per opponent.

    Parameters
    ----------
    outcome : 'ceiling' or 'floor'; also the kwarg name holding the percentile and the flag column name
    default_percentile : percentile used when `options` does not carry an `outcome` key
    options : the keyword arguments received by the public entry point:
        stat      - column to evaluate (default 'fpts')
        pos       - a position string or list of them; rows are kept when any one is `in` the row's 'pos'
        starter   - True/False to keep only games with that 'starter' value; None/omitted keeps 0 and 1
        ceiling / floor - percentile (0-100) defining the outcome
        teams     - opponents to keep in the result (default: every value of the 'team' column)
        min_games - minimum games a player needs to be included (default 3)
        data      - optional pre-loaded frame to use instead of loading the season
                    (needs the columns read here: name, pos, starter, team, opp and `stat`)

    Returns
    -------
    pd.DataFrame indexed by 'opp' with columns 'count', 'sum' and '<outcome>_percent',
    sorted by the percentage (descending) and rounded to 2 decimals.
    """
    stat = options.get('stat', 'fpts')
    starter = options.get('starter')
    startervals = (0, 1) if starter is None else (int(starter),)
    cutoff_pct = options.get(outcome, default_percentile)
    min_games = options.get('min_games', 3)

    df = options.get('data')
    if df is None:
        df = SeasonData(year=2023, site='draftkings').load(stats=[stat])

    if 'pos' in options:
        positions = options['pos']
        if isinstance(positions, str):
            positions = [positions]

        # One boolean mask instead of a helper column per position + concat + drop_duplicates
        eligible = df['pos'].map(lambda pos_: any(pos in pos_ for pos in positions))
        df = df.loc[eligible.astype(bool)]

    # Filter the games themselves, so the thresholds and the per-opponent counts are built from
    # the same rows (bench games must not be measured against a threshold derived from starts)
    df = df.loc[df['starter'].isin(startervals)]

    def threshold_(values):
        # Same calculation as the notebook's percentile(n) aggregator
        return np.percentile(values, cutoff_pct)

    # Series: player name -> threshold, restricted to players with a large enough sample
    thresholds = (df
                  .groupby('name')
                  [stat]
                  .agg(['count', threshold_])
                  .set_axis(['count', outcome], axis=1)
                  .pipe(lambda df_: df_.loc[df_['count'] >= min_games, outcome])
                 )

    df = df.loc[df['name'].isin(thresholds.index)]

    # Vectorised flagging: look up each row's player threshold, then compare whole columns
    player_threshold = df['name'].map(thresholds)
    if outcome == 'ceiling':
        hit = df[stat] >= player_threshold
    else:
        hit = df[stat] <= player_threshold

    # assign() returns a new frame, so no column is ever written onto a .loc slice
    df = df.assign(**{outcome: hit.astype(int)})

    output = [
        f'Percentages for players to have {outcome} outcomes:',
        f'    - Stat: {stat}',
        f'    - {outcome.capitalize()} value: {cutoff_pct}%',
    ]

    if starter is None:
        output.append(f'    - Players: All')
    else:
        output.append(f'    - Players: {"starters" if starter else "bench"}')

    print(*output, sep='\n')

    teams = options.get('teams')
    if teams is None:
        teams = tuple(df['team'].drop_duplicates())

    percent_col = f'{outcome}_percent'

    # Group by opponent to see which matchups produce the outcome most often
    return (df
            .groupby('opp')
            [outcome]
            .agg(['count', 'sum'])
            .pipe(lambda df_: df_.loc[df_.index.isin(teams)])
            .assign(**{percent_col: lambda df_: 100 * df_['sum'] / df_['count']})
            .sort_values(percent_col, ascending=False)
            .round(2)
           )


def predict_ceilings(**kwargs):
    """
    Goal of this function is to predict which matchups are most likely to produce ceiling outcomes for types of players
    A ceiling game is one at or above the player's own percentile threshold (default: 75th percentile)
    Players need a sample of at least 3 games (override with min_games)

    Keyword arguments: stat, pos, starter, ceiling, teams, min_games, data (see _predict_outcomes)
    Prints a short header describing the query
    Returns a DataFrame indexed by opponent with columns count, sum, ceiling_percent
    """
    return _predict_outcomes('ceiling', 75, kwargs)


def predict_floors(**kwargs):
    """
    Goal of this function is to predict which matchups are most likely to produce floor outcomes for types of players
    A floor game is one at or below the player's own percentile threshold (default: 25th percentile)
    Players need a sample of at least 3 games (override with min_games)

    Keyword arguments: stat, pos, starter, floor, teams, min_games, data (see _predict_outcomes)
    Prints a short header describing the query
    Returns a DataFrame indexed by opponent with columns count, sum, floor_percent
    """
    return _predict_outcomes('floor', 25, kwargs)

In [None]:
# good matchups = 
predict_ceilings(starter=True, teams=TEAMS, ceiling=70, stat='fpts') #, pos=['C'])

In [None]:
predict_ceilings(starter=True, teams=TEAMS, ceiling=65, pos=['C'])

In [None]:
predict_floors(starter=True, teams=TEAMS)

In [None]:
# def load_team_analysis(teams: list[str,...], **kwargs) -> pd.DataFrame:
#     """
#     Returns dataframes of just team performances: scores, ratings, etc
#     """
#     site = kwargs.get('site', 'fanduel')
    
#     stats = [
#         'date',
#         'team',
#         'opp',
#         'home',
#         'score',
#         'opp_score',
#         'winner',
#         'spread',
#         'total',
#         'pace',
#         'team-efg_pct',
#         'team-tov_pct',
#         'team-orb_pct',
#         'team-ft_rate',
#         'team-off_rtg',
#         'dk_fpts',
#         'fd_fpts'
#     ]

#     szn = (SeasonData(year=2023, stats=stats)
#            .load(stats=stats)
#            .pipe(lambda df_: df_.loc[df_['team'].isin(TEAMS)])
#           )

#     return szn
    

In [None]:
# Per-opponent summary for the rows loaded from szn2023 whose 'opp' is in TEAMS:
# count / floor / median / ceiling / max / sum of fpts, fppm, pace and total,
# ordered by the summed 'pace' column (largest first)
(szn2023.load()
 # .pipe(lambda df_: df_.loc[df_['starter'] == 1])
 .pipe(lambda df_: df_.loc[
       (df_['opp'].isin(TEAMS))
       # & (df_['starter'] == 1)
       ])
 .groupby('opp')
 [['fpts', 'fppm', 'pace', 'total']]
 .agg(['count', percentile(25), 'median', percentile(75), 'max', 'sum'])
 .sort_values(('pace', 'sum'), ascending=False)
)

In [None]:
# Horizontal bar chart of the summed 'tov' column per team (every team in szn2023), sorted ascending by that sum
(szn2023.load()
 .groupby('team')
 ['tov']
 .agg(['sum'])
 .sort_values('sum')
 .plot
 .barh(figsize=(15,10))
);

In [None]:
class Opponent:
    """
    Holds every game played against one team.

    Attributes:
        data:     rows of the loaded season whose 'opp' column equals `team`
        starters: the subset of `data` where 'starter' == 1
    """

    def __init__(self, team: str):
        """
        Will contain data for all games played against a certain team
        """
        # NOTE(review): reads the module-level szn2022 object, while other cells use szn2023 — confirm the intended season
        season_games = szn2022.load()

        faced_team = season_games['opp'] == team
        self.data = season_games.loc[faced_team]

        started = self.data['starter'] == 1
        self.starters = self.data.loc[started]

class Team:
    """
    Holds every game played by one team.

    Attributes:
        data:     rows of the loaded season whose 'team' column equals `team`
        starters: the subset of `data` where 'starter' == 1
    """

    def __init__(self, team: str):
        """
        Will contain data for all games played by a certain team
        """
        # NOTE(review): reads the module-level szn2022 object, while other cells use szn2023 — confirm the intended season
        season_games = szn2022.load()

        played_for_team = season_games['team'] == team
        self.data = season_games.loc[played_for_team]

        started = self.data['starter'] == 1
        self.starters = self.data.loc[started]




In [None]:
# Lower the DataFrame display cap to 100 rows (the setup cell at the top of the notebook set it to 150)
pd.options.display.max_rows = 100

In [None]:
# One frame stacking the loads of szn2021, szn2022 and szn2023; correlations() reads this global.
# Swap in the commented line below to work from szn2023 only.
SEASONS = (pd.concat([szn.load() for szn in (szn2021, szn2022, szn2023)]))
# SEASONS = szn2023.load() #(pd.concat([szn.load() for szn in (szn2021, szn2022, szn2023)]))

In [None]:
def player_profile(name: str, **kwargs):
    """
    Summarise one player's production.

    Keyword arguments:
        starter - True/False keeps only rows with that 'starter' value; omitted/None keeps both
        current - True uses szn2023 only; otherwise szn2021, szn2022 and szn2023 are stacked
        raw     - True returns the filtered per-game rows instead of the summary

    Two derived columns are added before summarising:
        core  = fpts - 3 * (stl + blk)
        cfppm = core / mp

    Returns the per-game frame when raw=True; otherwise a frame grouped by name with
    floor (25%), median and ceiling (75%) for fpts, fppm, mp, core and cfppm,
    plus a game count that is kept under 'fpts' only.
    """
    starter_flag = kwargs.get('starter')
    if starter_flag is None:
        starter_ = (0, 1)
    else:
        starter_ = (int(starter_flag),)

    if kwargs.get('current', False):
        szn_data = szn2023.load()
    else:
        szn_data = pd.concat([szn.load() for szn in (szn2021, szn2022, szn2023)])

    is_player = szn_data['name'] == name
    in_role = szn_data['starter'].isin(starter_)

    player_df = szn_data.loc[is_player & in_role].assign(
        core=lambda df_: df_['fpts'] - 3 * (df_['stl'] + df_['blk']),
        cfppm=lambda df_: df_['core'] / df_['mp'],
    )

    if kwargs.get('raw', False):
        return player_df

    stats = ['fpts', 'fppm', 'mp', 'core', 'cfppm']

    summary = (player_df
               .groupby('name')
               [stats]
               .agg(['count', percentile(25), 'median', percentile(75)])
              )

    # The game count is the same for every stat, so only the one under 'fpts' is kept
    repeated_counts = [(stat_, 'count') for stat_ in stats if stat_ != 'fpts']
    return summary.drop(repeated_counts, axis=1)

def player_profiles(*names, **kwargs):
    """
    Build player_profile(name, **kwargs) for each given name and stack the results into one frame
    """
    profiles = []
    for name in names:
        profiles.append(player_profile(name, **kwargs))

    return pd.concat(profiles)

In [None]:
player_profiles('Jontay Porter', starter=False, current=True)

In [None]:
player_profiles('Chris Boucher', 'Thaddeus Young', starter=True, current=False)

In [None]:
player_profile('Chris Boucher', starter=True, current=True)

In [None]:
player_profile('Jordan Clarkson', starter=False, current=True)

In [None]:
import itertools

from collections.abc import Sequence

def correlations(names: Sequence[str], *args, **kwargs) -> pd.DataFrame:
    """
    Pairwise correlations of a stat between teammates.

    Takes a sequence of names as input
    Returns pd.DataFrame with one row per pair of names and two correlations:
        - 'corr': computed over the dates on which ALL players in `names` played
        - 'combo-corr': computed over the dates on which both players of the pair played

    Keyword arguments:
        stat       - column to correlate (default 'fpts')
        starter    - True/False keeps only rows with that 'starter' value; omitted/None keeps both
        team       - team to restrict to; when omitted, the team with the most matching rows is used
        wout_dates - optional collection of dates; when given, BOTH samples are restricted to these dates
        data       - optional frame to use instead of the global SEASONS
                     (needs the columns read here: name, starter, team, date and `stat`)

    Columns of the result: team, N-names (size of the all-players sample), N-combo (size of the
    pair sample), combo, corr, combo-corr; sorted by combo-corr, highest first.
    A correlation is NaN when its sample has fewer than two games.

    Raises:
        ValueError - no team was given and no rows match `names` / `starter`
    """
    stat = kwargs.get('stat', 'fpts')
    starter = kwargs.get('starter')
    startervals = (0, 1) if starter is None else (int(starter),)
    wout_dates = kwargs.get('wout_dates')

    source = kwargs.get('data')
    if source is None:
        source = SEASONS

    df = source.loc[source['name'].isin(names) & source['starter'].isin(startervals)]

    # Only infer the team when the caller did not supply one
    team = kwargs.get('team')
    if team is None:
        team_counts = df['team'].value_counts()
        if team_counts.empty:
            raise ValueError(f'No games found for any of: {", ".join(names)}')
        team = team_counts.index[0]

    df = df.loc[df['team'] == team]

    def dates_with_all(players) -> tuple:
        # Dates on which the number of rows for `players` equals the number of players,
        # restricted to wout_dates when that filter was supplied
        games_per_date = df.loc[df['name'].isin(players)].groupby('date').size()
        shared = games_per_date[games_per_date == len(players)].index
        if wout_dates is not None:
            shared = [date for date in shared if date in wout_dates]
        return tuple(shared)

    def stat_on(player, dates) -> pd.Series:
        # One player's stat on the given dates, in date order so two players' series line up
        return (df
                .loc[df['date'].isin(dates) & (df['name'] == player)]
                .sort_values('date')
                [stat]
               )

    def pair_corr(first, second) -> float:
        # np.corrcoef is undefined below two observations; return NaN without numpy warnings
        if len(first) == len(second) and len(first) < 2:
            return np.nan
        return np.corrcoef(first, second)[0][1]

    dates_together = dates_with_all(names)
    total_sample_size = len(dates_together)

    # Dictionary indexed by name of all players in names and their stat series
    name_stat: dict[str, pd.Series] = {name: stat_on(name, dates_together) for name in names}

    columns = ['team', 'N-names', 'N-combo', 'combo', 'corr', 'combo-corr']
    records = []

    for combo in itertools.combinations(names, 2):
        p1, p2 = combo

        combo_dates_together = dates_with_all(combo)

        records.append({
            'team': team,
            'N-names': total_sample_size,
            'N-combo': len(combo_dates_together),
            'combo': combo,
            'corr': pair_corr(name_stat[p1], name_stat[p2]),
            'combo-corr': pair_corr(stat_on(p1, combo_dates_together), stat_on(p2, combo_dates_together)),
        })

    # Explicit columns keep the frame well-formed (and sortable) when there are no pairs
    return (pd
            .DataFrame(records, columns=columns)
            .sort_values('combo-corr', ascending=False)
           )
    

In [None]:
atl_names = ['Trae Young', 'Dejounte Murray', 'Clint Capela', "De'Andre Hunter", 'Bogdan Bogdanovic', 'Jalen Johnson', 'Saddiq Bey']
bos_names = ['Jayson Tatum', 'Jaylen Brown', 'Al Horford', 'Derrick White']
chi_names = ['Coby White', 'Nikola Vucevic', 'DeMar DeRozan', 'Zach LaVine']
cle_names = ['Donovan Mitchell', 'Darius Garland', 'Evan Mobley', 'Jarrett Allen', 'Caris LeVert']
mil_names = ['Giannis Antetokounmpo', 'Brook Lopez', 'Bobby Portis']
gs_names = ['Klay Thompson', 'Andrew Wiggins', 'Draymond Green']

ny_names = ['Jalen Brunson', 'Julius Randle', 'RJ Barrett', 'Josh Hart', 'Mitchell Robinson']
okc_names = ['Jalen Williams', 'Josh Giddey', 'Shai Gilgeous-Alexander']

tor_names = ['Scottie Barnes', 'Pascal Siakam', 'OG Anunoby']
was_names = ['Deni Avdija', 'Kyle Kuzma']
sac_names = ["De'Aaron Fox", 'Kevin Huerter', 'Malik Monk', 'Domantas Sabonis', 'Harrison Barnes', 'Keegan Murray']


In [None]:
# Flat tuple of the dates returned by dates_without("Kristaps Porzingis", 'BOS') for each of the three seasons
BOS_WOUT_DATES = tuple(itertools.chain.from_iterable(
    szn.dates_without("Kristaps Porzingis", 'BOS') for szn in (szn2021, szn2022, szn2023)
))

In [None]:
correlations(['Jayson Tatum', 'Jaylen Brown', 'Derrick White', 'Al Horford', 'Jrue Holiday', 'Sam Hauser', 'Payton Pritchard'], team='BOS', wout_dates=BOS_WOUT_DATES)

In [None]:
correlations(['Nikola Jokic', 'Michael Porter', 'Reggie Jackson'], starter=True)

In [None]:

# No date filter applied here; pass wout_dates=<collection of dates> to restrict the sample
correlations(['Miles Bridges', 'LaMelo Ball'])

In [None]:
correlations(['Austin Reaves', 'Anthony Davis'])

In [None]:
correlations(['Jarrett Allen', 'Darius Garland', 'Max Strus'])

In [None]:
# Flat tuple of the dates returned by dates_without("De'Aaron Fox", 'SAC') for each of the three seasons.
# Kept under its own name so it does not overwrite BOS_WOUT_DATES, which holds the Boston dates.
SAC_WOUT_DATES = sum([
    szn.dates_without("De'Aaron Fox", 'SAC') for szn in (szn2021, szn2022, szn2023)
], tuple())

In [None]:
# MIA_WOUT_DATES = tuple(sorted(set(flatten(sorted(sum([
#     tuple([szn.dates_without('Jimmy Butler', 'MIA') for szn in (szn2021, szn2022, szn2023)]),
#     tuple([szn.dates_without('Tyler Herro', 'MIA') for szn in (szn2021, szn2022, szn2023)])
# ], tuple()))))))

In [None]:
# Horizontal bar chart of the summed 'tov' column per team, limited to the teams in TEAMS, sorted ascending
(szn2023.load()
 .pipe(lambda df_: df_.loc[df_['team'].isin(TEAMS)])
 .groupby('team')
 ['tov']
 .sum()
 .sort_values()
 .plot
 .barh(figsize=(15,10))
);

In [None]:
# Stack the pairwise correlation tables of several name lists and re-sort the combined table by 'combo-corr'
# NOTE(review): ind_names and orl_names are not among the *_names lists defined in the name-list cell
# (atl / bos / chi / cle / mil / gs / ny / okc / tor / was / sac) — confirm they exist elsewhere, otherwise this raises NameError
(pd
 .concat([correlations(names_) for names_ in [atl_names, ind_names, mil_names, orl_names]])
 .sort_values('combo-corr', ascending=False)
)