In [1]:
import pandas as pd
import psycopg2
from datetime import date

# create postgresql connection
from sqlalchemy import create_engine
from creds import postgresql_pw

engine = create_engine(f'postgresql+psycopg2://postgres:{postgresql_pw}@localhost:5432/api_sports')

In [2]:
# retrieve games-venues view in api-sports db
query = f'''
    SELECT *
    FROM v_all_game_venues
    ORDER BY game_id
    '''

games_df = pd.read_sql(query, engine)
print(f'{len(games_df)} games retrieved')
games_df.head()

6377 games retrieved


Unnamed: 0,league,game_id,date,time,home_id,home_name,away_id,away_name,venue_name,city,state,full_address,lat,lon,metro_area
0,mlb,mlb_152753,2024-02-22,20:10,230,San Diego Padres,218,Los Angeles Dodgers,Petco Park,San Diego,California,"100 Park Boulevard, San Diego, CA 92101",32.707188,-117.156877,"San Diego, California"
1,mlb,mlb_152754,2024-02-23,20:05,206,Chicago Cubs,207,Chicago White Sox,Wrigley Field,Chicago,Illinois,"1060 West Addison, Chicago, IL 60613-4397",41.947568,-87.656523,"Chicago, Illinois"
2,mlb,mlb_152755,2024-02-23,20:08,218,Los Angeles Dodgers,230,San Diego Padres,Dodger Stadium,Los Angeles,California,"1000 Vin Scully Avenue, Los Angeles, CA 90012",34.072355,-118.248102,"Los Angeles, California"
3,mlb,mlb_152756,2024-02-23,20:05,235,Texas Rangers,216,Kansas City Royals,Globe Life Field,Arlington,Texas,"734 Stadium Drive, Arlington, TX 76011",32.745781,-97.0825,"Dallas, Texas"
4,mlb,mlb_152757,2024-02-23,20:10,210,Colorado Rockies,202,Arizona Diamondbacks,Coors Field,Denver,Colorado,"2001 Blake Street, Denver, CO 80205-2000",39.75622,-104.99307,"Denver, Colorado"


In [3]:
# grouping by city and date
# ! see note below, this is where I'd group by league as well to give each league a different decimal place
# ! I'd then need to add a column with the new numeric value, then run groupby again, this time only on metro_area, date
city_date_df = games_df.groupby(by=['metro_area', 'date'])['home_name'].nunique().reset_index()
city_date_df = city_date_df.rename(columns={'home_name': 'n_teams'})
city_date_game_count = city_date_df.sort_values(by=['metro_area', 'date'])
city_date_game_count['date'] = city_date_game_count['date'].astype(str)
city_date_game_count.head(5)

Unnamed: 0,metro_area,date,n_teams
0,"Atlanta, Georgia",2024-02-25,1
1,"Atlanta, Georgia",2024-02-26,1
2,"Atlanta, Georgia",2024-02-29,1
3,"Atlanta, Georgia",2024-03-01,1
4,"Atlanta, Georgia",2024-03-03,1


In [4]:
# min and max dates in games data
min_date = city_date_game_count['date'].min()
max_date = city_date_game_count['date'].max()
print(f'min date: {min_date}, max date: {max_date}')

# city count
city_count = city_date_game_count['metro_area'].nunique()
print(f'city count: {city_count}')

min date: 2024-02-22, max date: 2025-04-17
city count: 50


In [5]:
# create list of dates that has each possible date in range
# extend that list however many times that there are cities
dates_idx = pd.date_range(start=min_date, end=max_date)
num_dates = len(dates_idx)
print(f'number of dates: {num_dates}')
for _ in range(city_count-1):
    dates_idx = dates_idx.append(pd.date_range(start=min_date, end=max_date))
print(f'length of date index: {len(dates_idx)}')

number of dates: 421
length of date index: 21050


In [6]:
# create list of cities in game data
# extend to have N of each city equal to number of dates in range
cities = list(set(city_date_game_count['metro_area']))
cities_mult = []
for city in cities:
    city_mult = [city] * num_dates
    cities_mult.extend(city_mult)
len(cities_mult)

21050

In [7]:
# create skeleton dataframe for all cities-dates
cities_dates_mult = pd.DataFrame({'metro_area':cities_mult, 'date':dates_idx}).sort_values(['metro_area', 'date']).reset_index(drop=True)
cities_dates_mult['date'] = cities_dates_mult['date'].astype(str)
cities_dates_mult.dtypes

metro_area    object
date          object
dtype: object

In [8]:
# merge dataframes to have number of teams active for every city-date
city_date_game_count_df = cities_dates_mult.merge(city_date_game_count, how='left', on=['metro_area','date'])
city_date_game_count_df['n_teams'] = city_date_game_count_df['n_teams'].fillna(0)
city_date_game_count_df

Unnamed: 0,metro_area,date,n_teams
0,"Atlanta, Georgia",2024-02-22,0.0
1,"Atlanta, Georgia",2024-02-23,0.0
2,"Atlanta, Georgia",2024-02-24,0.0
3,"Atlanta, Georgia",2024-02-25,1.0
4,"Atlanta, Georgia",2024-02-26,1.0
...,...,...,...
21045,"Winnipeg, Manitoba",2025-04-13,1.0
21046,"Winnipeg, Manitoba",2025-04-14,0.0
21047,"Winnipeg, Manitoba",2025-04-15,0.0
21048,"Winnipeg, Manitoba",2025-04-16,1.0


In [9]:
# create column to calculate total teams in previous 5 days
# figured out how to handle the transitions between cities
city_date_game_count_df['five_day_total'] = city_date_game_count_df.groupby(['metro_area'])['n_teams']\
                                            .rolling(window=5, min_periods=1).sum().reset_index(0, drop=True)
city_date_game_count_df

Unnamed: 0,metro_area,date,n_teams,five_day_total
0,"Atlanta, Georgia",2024-02-22,0.0,0.0
1,"Atlanta, Georgia",2024-02-23,0.0,0.0
2,"Atlanta, Georgia",2024-02-24,0.0,0.0
3,"Atlanta, Georgia",2024-02-25,1.0,1.0
4,"Atlanta, Georgia",2024-02-26,1.0,2.0
...,...,...,...,...
21045,"Winnipeg, Manitoba",2025-04-13,1.0,1.0
21046,"Winnipeg, Manitoba",2025-04-14,0.0,1.0
21047,"Winnipeg, Manitoba",2025-04-15,0.0,1.0
21048,"Winnipeg, Manitoba",2025-04-16,1.0,2.0


## the Unique Team Problem
ffs, need to figure out how to count unique teams...can I leave this until later?

without additional handling, an MLB team could count for 5 games in 5 days, but it should really only be 1

could I use different decimal places for each league/team, then the rolling count would show the result in the decimals?
there's a place I could make that change noted above

how do I then make that actionable though

also less of a pressing issue considering MLB season is almost over (currently 9/24), but NHL and NBA can still impact.

In [10]:
# create column for beginning of date range
city_date_game_count_df['date'] = pd.to_datetime(city_date_game_count_df['date']).dt.date
city_date_game_count_df['start_date'] = city_date_game_count_df['date'] - pd.Timedelta('5 days')
city_date_game_count_df.rename(columns={'date':'end_date', 'n_teams':'end_date_games'}, inplace=True)
city_date_game_count_df = city_date_game_count_df[['metro_area', 'start_date', 'end_date', 'end_date_games', 'five_day_total']]
city_date_game_count_df

Unnamed: 0,metro_area,start_date,end_date,end_date_games,five_day_total
0,"Atlanta, Georgia",2024-02-17,2024-02-22,0.0,0.0
1,"Atlanta, Georgia",2024-02-18,2024-02-23,0.0,0.0
2,"Atlanta, Georgia",2024-02-19,2024-02-24,0.0,0.0
3,"Atlanta, Georgia",2024-02-20,2024-02-25,1.0,1.0
4,"Atlanta, Georgia",2024-02-21,2024-02-26,1.0,2.0
...,...,...,...,...,...
21045,"Winnipeg, Manitoba",2025-04-08,2025-04-13,1.0,1.0
21046,"Winnipeg, Manitoba",2025-04-09,2025-04-14,0.0,1.0
21047,"Winnipeg, Manitoba",2025-04-10,2025-04-15,0.0,1.0
21048,"Winnipeg, Manitoba",2025-04-11,2025-04-16,1.0,2.0


In [11]:
# testing top date ranges
city_date_game_count_df.sort_values('five_day_total', ascending=False)

Unnamed: 0,metro_area,start_date,end_date,end_date_games,five_day_total
12500,"New York City, New York",2024-12-04,2024-12-09,1.0,11.0
12429,"New York City, New York",2024-09-24,2024-09-29,2.0,11.0
12428,"New York City, New York",2024-09-23,2024-09-28,2.0,11.0
12607,"New York City, New York",2025-03-21,2025-03-26,3.0,11.0
12502,"New York City, New York",2024-12-06,2024-12-11,2.0,11.0
...,...,...,...,...,...
9381,"Memphis, Tennessee",2024-06-15,2024-06-20,0.0,0.0
9382,"Memphis, Tennessee",2024-06-16,2024-06-21,0.0,0.0
9383,"Memphis, Tennessee",2024-06-17,2024-06-22,0.0,0.0
9384,"Memphis, Tennessee",2024-06-18,2024-06-23,0.0,0.0


## Integrating with original games data to make 1 table for export

In [12]:
games_df.head(1)

Unnamed: 0,league,game_id,date,time,home_id,home_name,away_id,away_name,venue_name,city,state,full_address,lat,lon,metro_area
0,mlb,mlb_152753,2024-02-22,20:10,230,San Diego Padres,218,Los Angeles Dodgers,Petco Park,San Diego,California,"100 Park Boulevard, San Diego, CA 92101",32.707188,-117.156877,"San Diego, California"


In [13]:
games_df['earliest_start_date'] = games_df['date'] - pd.Timedelta('4 days')
games_df['latest_end_date'] = games_df['date'] + pd.Timedelta('4 days')
games_df.head(1)

Unnamed: 0,league,game_id,date,time,home_id,home_name,away_id,away_name,venue_name,city,state,full_address,lat,lon,metro_area,earliest_start_date,latest_end_date
0,mlb,mlb_152753,2024-02-22,20:10,230,San Diego Padres,218,Los Angeles Dodgers,Petco Park,San Diego,California,"100 Park Boulevard, San Diego, CA 92101",32.707188,-117.156877,"San Diego, California",2024-02-18,2024-02-26


In [14]:
full_join_df = city_date_game_count_df.merge(games_df, on='metro_area')
print(len(full_join_df))
full_join_df.head(1)

2677981


Unnamed: 0,metro_area,start_date,end_date,end_date_games,five_day_total,league,game_id,date,time,home_id,...,away_id,away_name,venue_name,city,state,full_address,lat,lon,earliest_start_date,latest_end_date
0,"Atlanta, Georgia",2024-02-17,2024-02-22,0.0,0.0,mlb,mlb_152789,2024-02-25,18:05,203,...,205,Boston Red Sox,Truist Park,Cumberland,Georgia,"755 Battery Avenue, Atlanta, GA 30339",33.890709,-84.468534,2024-02-21,2024-02-29


In [15]:
# filters df to associate each game with all date ranges that the game falls in
games_counts_df = full_join_df[(full_join_df['start_date']>=full_join_df['earliest_start_date']) &
                               (full_join_df['end_date']<=full_join_df['latest_end_date'])]\
                               .sort_values(['metro_area', 'date'])
print(len(games_counts_df))
games_counts_df.head(5)

25360


Unnamed: 0,metro_area,start_date,end_date,end_date_games,five_day_total,league,game_id,date,time,home_id,...,away_id,away_name,venue_name,city,state,full_address,lat,lon,earliest_start_date,latest_end_date
656,"Atlanta, Georgia",2024-02-21,2024-02-26,1.0,2.0,mlb,mlb_152789,2024-02-25,18:05,203,...,205,Boston Red Sox,Truist Park,Cumberland,Georgia,"755 Battery Avenue, Atlanta, GA 30339",33.890709,-84.468534,2024-02-21,2024-02-29
820,"Atlanta, Georgia",2024-02-22,2024-02-27,0.0,2.0,mlb,mlb_152789,2024-02-25,18:05,203,...,205,Boston Red Sox,Truist Park,Cumberland,Georgia,"755 Battery Avenue, Atlanta, GA 30339",33.890709,-84.468534,2024-02-21,2024-02-29
984,"Atlanta, Georgia",2024-02-23,2024-02-28,0.0,2.0,mlb,mlb_152789,2024-02-25,18:05,203,...,205,Boston Red Sox,Truist Park,Cumberland,Georgia,"755 Battery Avenue, Atlanta, GA 30339",33.890709,-84.468534,2024-02-21,2024-02-29
1148,"Atlanta, Georgia",2024-02-24,2024-02-29,1.0,3.0,mlb,mlb_152789,2024-02-25,18:05,203,...,205,Boston Red Sox,Truist Park,Cumberland,Georgia,"755 Battery Avenue, Atlanta, GA 30339",33.890709,-84.468534,2024-02-21,2024-02-29
821,"Atlanta, Georgia",2024-02-22,2024-02-27,0.0,2.0,mlb,mlb_152799,2024-02-26,18:05,203,...,204,Baltimore Orioles,Truist Park,Cumberland,Georgia,"755 Battery Avenue, Atlanta, GA 30339",33.890709,-84.468534,2024-02-22,2024-03-01


## Next Steps
How should I present/represent this?

Edit 9/25/24: I've combined metro areas and added the earliest start date and latest end date for any date ranges each game belongs to. Combining the 2 dataframes into 1 table for export is more efficient for Tableau.

In [16]:
# exporting results to use in Tableau
games_counts_df.to_csv('tableau/games_counts_data.csv', index=False)

How do I integrate the completed teams/cities into this? Do I need to do it here, or in Tableau, or create different versions of each?

I think 2 different versions. 1 will be for all of the games, 1 will be specifically for my purposes.