In [1]:
import pandas as pd
import psycopg2
from datetime import date

# create postgresql connection
from urllib.parse import quote

from sqlalchemy import create_engine
from creds import postgresql_pw

# Percent-encode the password before placing it in the URL: characters such as
# '@', '/', ':' or '%' would otherwise be parsed as URL delimiters and the
# connection string would be split in the wrong place. Passwords made only of
# unreserved characters are left unchanged by quote().
encoded_pw = quote(postgresql_pw, safe='')
engine = create_engine(f'postgresql+psycopg2://postgres:{encoded_pw}@localhost:5432/api_sports')

In [2]:
# Pull every row of the v_all_game_venues view, ordered by game id.
# The query has no placeholders, so a plain string literal is sufficient.
query = '''
    SELECT *
    FROM v_all_game_venues
    ORDER BY game_id
    '''

games_df = pd.read_sql(sql=query, con=engine)
print(f'{games_df.shape[0]} games retrieved')
games_df.head()

6377 games retrieved


Unnamed: 0,league,game_id,date,time,home_id,home_name,away_id,away_name,venue_name,city,state,full_address,lat,lon
0,mlb,mlb_152753,2024-02-22,20:10,230,San Diego Padres,218,Los Angeles Dodgers,Petco Park,San Diego,California,"100 Park Boulevard, San Diego, CA 92101",32.707188,-117.156877
1,mlb,mlb_152754,2024-02-23,20:05,206,Chicago Cubs,207,Chicago White Sox,Wrigley Field,Chicago,Illinois,"1060 West Addison, Chicago, IL 60613-4397",41.947568,-87.656523
2,mlb,mlb_152755,2024-02-23,20:08,218,Los Angeles Dodgers,230,San Diego Padres,Dodger Stadium,Los Angeles,California,"1000 Vin Scully Avenue, Los Angeles, CA 90012",34.072355,-118.248102
3,mlb,mlb_152756,2024-02-23,20:05,235,Texas Rangers,216,Kansas City Royals,Globe Life Field,Arlington,Texas,"734 Stadium Drive, Arlington, TX 76011",32.745781,-97.0825
4,mlb,mlb_152757,2024-02-23,20:10,210,Colorado Rockies,202,Arizona Diamondbacks,Coors Field,Denver,Colorado,"2001 Blake Street, Denver, CO 80205-2000",39.75622,-104.99307


In [3]:
# Count the distinct home teams for each (city, date) pair.
# ! see note below, this is where I'd group by league as well to give each league a different decimal place
# ! I'd then need to add a column with the new numeric value, then run groupby again, this time only on city, date
city_date_df = (
    games_df
    .groupby(['city', 'date'])['home_name']
    .nunique()
    .reset_index(name='n_teams')
)
# Sort by city then date, and store dates as strings so they can later be
# merged against the string-typed skeleton frame.
city_date_game_count = (
    city_date_df
    .sort_values(['city', 'date'])
    .assign(date=lambda frame: frame['date'].astype(str))
)
city_date_game_count.head(5)

Unnamed: 0,city,date,n_teams
0,Anaheim,2024-09-27,1
1,Anaheim,2024-10-01,1
2,Anaheim,2024-10-03,1
3,Anaheim,2024-10-17,1
4,Anaheim,2024-10-21,1


In [4]:
# Earliest and latest game dates (dates are strings at this point, so min/max
# compare them as text).
date_strings = city_date_game_count['date']
min_date, max_date = date_strings.min(), date_strings.max()
print(f'min date: {min_date}, max date: {max_date}')

# Number of distinct cities that appear in the games data.
city_count = city_date_game_count['city'].nunique()
print(f'city count: {city_count}')

min date: 2024-02-22, max date: 2025-04-17
city count: 83


In [5]:
# Build the date axis of the city-date skeleton: every calendar day from
# min_date to max_date (inclusive), repeated once per city.
date_span = pd.date_range(start=min_date, end=max_date)
num_dates = len(date_span)
print(f'number of dates: {num_dates}')
# Append all remaining copies in a single call. Index.append accepts a list of
# indexes, which avoids rebuilding the date range and re-copying the growing
# index on every loop iteration, and avoids rebinding `_` at notebook top level.
dates_idx = date_span.append([date_span] * (city_count - 1))
print(f'length of date index: {len(dates_idx)}')

number of dates: 421
length of date index: 34943


In [6]:
# Distinct cities found in the games data.
cities = list(set(city_date_game_count['city']))
# Repeat each city num_dates times, keeping the copies of a city contiguous,
# so the list lines up with one full date range per city.
cities_mult = [name for name in cities for _ in range(num_dates)]
len(cities_mult)

34943

In [7]:
# Skeleton frame holding one row for every city on every date in the range.
# Dates are converted to strings to match the dtype used in the game counts.
cities_dates_mult = (
    pd.DataFrame({'city': cities_mult, 'date': dates_idx})
    .sort_values(by=['city', 'date'])
    .reset_index(drop=True)
    .assign(date=lambda frame: frame['date'].astype(str))
)
cities_dates_mult.dtypes

city    object
date    object
dtype: object

In [8]:
# Left-join the observed team counts onto the full city-date skeleton.
# City-dates with no games have no match and get n_teams = 0.
city_date_game_count_df = pd.merge(
    cities_dates_mult,
    city_date_game_count,
    on=['city', 'date'],
    how='left',
).fillna({'n_teams': 0})
city_date_game_count_df

Unnamed: 0,city,date,n_teams
0,Anaheim,2024-02-22,0.0
1,Anaheim,2024-02-23,0.0
2,Anaheim,2024-02-24,0.0
3,Anaheim,2024-02-25,0.0
4,Anaheim,2024-02-26,0.0
...,...,...,...
34938,Winnipeg,2025-04-13,1.0
34939,Winnipeg,2025-04-14,0.0
34940,Winnipeg,2025-04-15,0.0
34941,Winnipeg,2025-04-16,1.0


In [9]:
# Trailing five-row sum of n_teams within each city (the current row plus up
# to four preceding rows; min_periods=1 lets the first rows of a city use a
# shorter window). Grouping by city keeps a window from spanning two cities.
per_city_teams = city_date_game_count_df.groupby(['city'])['n_teams']
rolling_totals = per_city_teams.rolling(window=5, min_periods=1).sum()
# The rolling result is indexed by (city, original row label); drop the city
# level so the values align with the frame's own index on assignment.
city_date_game_count_df['five_day_total'] = rolling_totals.droplevel(0)
city_date_game_count_df

Unnamed: 0,city,date,n_teams,five_day_total
0,Anaheim,2024-02-22,0.0,0.0
1,Anaheim,2024-02-23,0.0,0.0
2,Anaheim,2024-02-24,0.0,0.0
3,Anaheim,2024-02-25,0.0,0.0
4,Anaheim,2024-02-26,0.0,0.0
...,...,...,...,...
34938,Winnipeg,2025-04-13,1.0,1.0
34939,Winnipeg,2025-04-14,0.0,1.0
34940,Winnipeg,2025-04-15,0.0,1.0
34941,Winnipeg,2025-04-16,1.0,2.0


## the Unique Team Problem
Need to figure out how to count unique teams across the 5-day window. Can I leave this until later?

Without additional handling, an MLB team that plays at home on 5 consecutive days is counted 5 times in the 5-day total, but it should really only be counted once.

Could I use a different decimal place for each league/team, so that the rolling count would show the result in the decimals?
There's a place where I could make that change, noted in the comments of the grouping cell above (In [3]).

How do I then make that actionable, though?

also less of a pressing issue considering MLB season is almost over (currently 9/24), but NHL and NBA can still impact.

In [10]:
# Rank every city-date row by its five-day total, highest first.
city_date_game_count_df.sort_values(by='five_day_total', ascending=False)

Unnamed: 0,city,date,n_teams,five_day_total
5919,Chicago,2024-03-18,1.0,8.0
5918,Chicago,2024-03-17,1.0,8.0
6114,Chicago,2024-09-29,3.0,8.0
16772,Los Angeles,2025-02-09,2.0,8.0
16773,Los Angeles,2025-02-10,0.0,7.0
...,...,...,...,...
17842,Miami Gardens,2024-07-31,0.0,0.0
17841,Miami Gardens,2024-07-30,0.0,0.0
17840,Miami Gardens,2024-07-29,0.0,0.0
17839,Miami Gardens,2024-07-28,0.0,0.0


In [11]:
# Keep rows dated after 2024-09-24. Dates are ISO-formatted strings, so a
# plain string comparison orders them chronologically.
after_cutoff = city_date_game_count_df['date'] > '2024-09-24'
test_df = city_date_game_count_df[after_cutoff]
# Leave out Los Angeles and New York, then show the 20 highest five-day totals.
outside_la_ny = ~test_df['city'].isin(['Los Angeles', 'New York'])
test_df[outside_la_ny].sort_values('five_day_total', ascending=False).head(20)

Unnamed: 0,city,date,n_teams,five_day_total
6114,Chicago,2024-09-29,3.0,8.0
9481,Detroit,2024-09-28,2.0,7.0
9482,Detroit,2024-09-29,1.0,7.0
34320,Washington,2024-09-28,2.0,7.0
34321,Washington,2024-09-29,1.0,7.0
6115,Chicago,2024-09-30,0.0,7.0
9484,Detroit,2024-10-01,1.0,6.0
9242,Denver,2025-03-29,2.0,6.0
9159,Denver,2025-01-05,2.0,6.0
6113,Chicago,2024-09-28,1.0,6.0


## Next Steps
How should I present/represent this?

What format makes this usable and useful? Can Tableau handle this in some form? Need to be able to quickly see more details about a given timeframe.

I think this would involve giving Tableau the raw games-venues data as 1 data source and the window counts as a 2nd data source, then using the target dates from the window counts as parameters to filter the raw data. That seems doable.