In [1]:
import pandas as pd
import psycopg2
from datetime import date

# create postgresql connection
from sqlalchemy import create_engine
from creds import postgresql_pw

engine = create_engine(f'postgresql+psycopg2://postgres:{postgresql_pw}@localhost:5432/api_sports')

In [2]:
# retrieve games-venues view in api-sports db
query = f'''
    SELECT *
    FROM v_all_game_venues
    ORDER BY game_id
    '''

all_games_df = pd.read_sql(query, engine)
print(f'{len(all_games_df)} games retrieved')
print(f'Home teams in dataframe: {all_games_df['home_name'].nunique()}')
all_games_df.head()

6377 games retrieved
Home teams in dataframe: 153


Unnamed: 0,league,game_id,date,time,home_id,home_name,away_id,away_name,venue_name,city,state,full_address,lat,lon,metro_area,metro_team_num
0,mlb,mlb_152753,2024-02-22,14:10,230,San Diego Padres,218,Los Angeles Dodgers,Petco Park,San Diego,California,"100 Park Boulevard, San Diego, CA 92101",32.707188,-117.156877,"San Diego, California",mlb_1
1,mlb,mlb_152754,2024-02-23,14:05,206,Chicago Cubs,207,Chicago White Sox,Wrigley Field,Chicago,Illinois,"1060 West Addison, Chicago, IL 60613-4397",41.947568,-87.656523,"Chicago, Illinois",mlb_1
2,mlb,mlb_152755,2024-02-23,14:08,218,Los Angeles Dodgers,230,San Diego Padres,Dodger Stadium,Los Angeles,California,"1000 Vin Scully Avenue, Los Angeles, CA 90012",34.072355,-118.248102,"Los Angeles, California",mlb_2
3,mlb,mlb_152756,2024-02-23,14:05,235,Texas Rangers,216,Kansas City Royals,Globe Life Field,Arlington,Texas,"734 Stadium Drive, Arlington, TX 76011",32.745781,-97.0825,"Dallas, Texas",mlb_1
4,mlb,mlb_152757,2024-02-23,14:10,210,Colorado Rockies,202,Arizona Diamondbacks,Coors Field,Denver,Colorado,"2001 Blake Street, Denver, CO 80205-2000",39.75622,-104.99307,"Denver, Colorado",mlb_1


In [3]:
# read in completed teams xlsx to filter out irrelevant games before counting
completed_teams_df = pd.read_excel('team_completions.xlsx')
completed_teams_df.head()

Unnamed: 0,league,team,home_id,metro_area,completed_flag
0,mlb,Arizona Diamondbacks,202,"Phoenix, Arizona",0
1,mlb,Atlanta Braves,203,"Atlanta, Georgia",1
2,mlb,Baltimore Orioles,204,"Baltimore, Maryland",0
3,mlb,Boston Red Sox,205,"Boston, Massachusetts",0
4,mlb,Chicago Cubs,206,"Chicago, Illinois",0


In [4]:
# filter games_df on uncompleted teams
uncompleted_teams = completed_teams_df[completed_teams_df['completed_flag']==0]['home_id']
games_df = all_games_df[all_games_df['home_id'].isin(uncompleted_teams)]
print(f'Games remaining in dataframe: {len(games_df)}')
print(f'Home teams remaining in dataframe: {games_df['home_name'].nunique()}')

Games remaining in dataframe: 5370
Home teams remaining in dataframe: 132


In [5]:
# grouping by city and date, pivoting by metro team number to account for multiple games by same team
city_date_df = pd.pivot_table(games_df, values='home_name', index=['metro_area', 'date'],
                              columns='metro_team_num', aggfunc='nunique', fill_value=0).reset_index()

# remove extraneous "metro_team_num" column created by pivot_table
city_date_df.columns = [col for col in city_date_df.columns.values]

city_date_game_count = city_date_df.sort_values(by=['metro_area', 'date'])
city_date_game_count['date'] = city_date_game_count['date'].astype(str)
city_date_game_count.head()

Unnamed: 0,metro_area,date,mlb_1,mlb_2,mls_1,mls_2,nba_1,nba_2,nfl_1,nfl_2,nhl_1,nhl_2,nhl_3
0,"Atlanta, Georgia",2024-03-09,0,0,1,0,0,0,0,0,0,0,0
1,"Atlanta, Georgia",2024-03-17,0,0,1,0,0,0,0,0,0,0,0
2,"Atlanta, Georgia",2024-03-31,0,0,1,0,0,0,0,0,0,0,0
3,"Atlanta, Georgia",2024-04-14,0,0,1,0,0,0,0,0,0,0,0
4,"Atlanta, Georgia",2024-04-20,0,0,1,0,0,0,0,0,0,0,0


In [6]:
# min and max dates in games data
min_date = city_date_game_count['date'].min()
max_date = city_date_game_count['date'].max()
print(f'min date: {min_date}, max date: {max_date}')

# city count
city_count = city_date_game_count['metro_area'].nunique()
print(f'city count: {city_count}')

min date: 2024-02-21, max date: 2025-04-17
city count: 46


In [7]:
# create list of dates that has each possible date in range
# extend that list however many times that there are cities
dates_idx = pd.date_range(start=min_date, end=max_date)
num_dates = len(dates_idx)
print(f'number of dates: {num_dates}')
for _ in range(city_count-1):
    dates_idx = dates_idx.append(pd.date_range(start=min_date, end=max_date))
print(f'length of date index: {len(dates_idx)}')

number of dates: 422
length of date index: 19412


In [8]:
# create list of cities in game data
# extend to have N of each city equal to number of dates in range
cities = list(set(city_date_game_count['metro_area']))
cities_mult = []
for city in cities:
    city_mult = [city] * num_dates
    cities_mult.extend(city_mult)
len(cities_mult)

19412

In [9]:
# create skeleton dataframe for all cities-dates
cities_dates_mult = pd.DataFrame({'metro_area':cities_mult, 'date':dates_idx}).sort_values(['metro_area', 'date']).reset_index(drop=True)
cities_dates_mult['date'] = cities_dates_mult['date'].astype(str)
cities_dates_mult.dtypes

metro_area    object
date          object
dtype: object

In [10]:
# merge dataframes to have number of teams active for every city-date
city_date_game_count_df = cities_dates_mult.merge(city_date_game_count, how='left', on=['metro_area','date'])

# updating logic to make metro_team_num column names dynamic
# handles if all teams of a certain metro_team_num are completed
team_cols = list(city_date_game_count_df.iloc[:,2:].columns.values)
city_date_game_count_df[team_cols] = city_date_game_count_df[team_cols].fillna(0)
city_date_game_count_df

Unnamed: 0,metro_area,date,mlb_1,mlb_2,mls_1,mls_2,nba_1,nba_2,nfl_1,nfl_2,nhl_1,nhl_2,nhl_3
0,"Atlanta, Georgia",2024-02-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Atlanta, Georgia",2024-02-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Atlanta, Georgia",2024-02-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Atlanta, Georgia",2024-02-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Atlanta, Georgia",2024-02-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19407,"Winnipeg, Manitoba",2025-04-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19408,"Winnipeg, Manitoba",2025-04-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19409,"Winnipeg, Manitoba",2025-04-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19410,"Winnipeg, Manitoba",2025-04-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
# create column to calculate total teams in previous 5 days
# updated to include separate columns to fix UTP
city_date_game_count_df[team_cols] = city_date_game_count_df.groupby(['metro_area'])[team_cols]\
                                        .rolling(window=5, min_periods=1).sum().reset_index(0, drop=True)
city_date_game_count_df

Unnamed: 0,metro_area,date,mlb_1,mlb_2,mls_1,mls_2,nba_1,nba_2,nfl_1,nfl_2,nhl_1,nhl_2,nhl_3
0,"Atlanta, Georgia",2024-02-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Atlanta, Georgia",2024-02-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Atlanta, Georgia",2024-02-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Atlanta, Georgia",2024-02-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Atlanta, Georgia",2024-02-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19407,"Winnipeg, Manitoba",2025-04-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19408,"Winnipeg, Manitoba",2025-04-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19409,"Winnipeg, Manitoba",2025-04-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19410,"Winnipeg, Manitoba",2025-04-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


In [12]:
# create column for beginning of date range
city_date_game_count_df['date'] = pd.to_datetime(city_date_game_count_df['date']).dt.date
city_date_game_count_df['start_date'] = city_date_game_count_df['date'] - pd.Timedelta('4 days')
city_date_game_count_df = city_date_game_count_df.rename(columns={'date':'end_date'})
city_date_game_count_df = city_date_game_count_df[['metro_area', 'start_date', 'end_date']+team_cols]
city_date_game_count_df

Unnamed: 0,metro_area,start_date,end_date,mlb_1,mlb_2,mls_1,mls_2,nba_1,nba_2,nfl_1,nfl_2,nhl_1,nhl_2,nhl_3
0,"Atlanta, Georgia",2024-02-17,2024-02-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Atlanta, Georgia",2024-02-18,2024-02-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Atlanta, Georgia",2024-02-19,2024-02-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Atlanta, Georgia",2024-02-20,2024-02-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Atlanta, Georgia",2024-02-21,2024-02-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19407,"Winnipeg, Manitoba",2025-04-09,2025-04-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19408,"Winnipeg, Manitoba",2025-04-10,2025-04-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19409,"Winnipeg, Manitoba",2025-04-11,2025-04-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19410,"Winnipeg, Manitoba",2025-04-12,2025-04-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


In [13]:
# combine new columns into 1 new column for five day total
city_date_game_count_df['five_day_total'] = city_date_game_count_df[team_cols].clip(upper=1).sum(axis=1)
city_date_game_count_df

Unnamed: 0,metro_area,start_date,end_date,mlb_1,mlb_2,mls_1,mls_2,nba_1,nba_2,nfl_1,nfl_2,nhl_1,nhl_2,nhl_3,five_day_total
0,"Atlanta, Georgia",2024-02-17,2024-02-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Atlanta, Georgia",2024-02-18,2024-02-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Atlanta, Georgia",2024-02-19,2024-02-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Atlanta, Georgia",2024-02-20,2024-02-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Atlanta, Georgia",2024-02-21,2024-02-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19407,"Winnipeg, Manitoba",2025-04-09,2025-04-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
19408,"Winnipeg, Manitoba",2025-04-10,2025-04-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
19409,"Winnipeg, Manitoba",2025-04-11,2025-04-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
19410,"Winnipeg, Manitoba",2025-04-12,2025-04-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0


In [14]:
# testing top date ranges
city_date_game_count_df.sort_values('five_day_total', ascending=False)

Unnamed: 0,metro_area,start_date,end_date,mlb_1,mlb_2,mls_1,mls_2,nba_1,nba_2,nfl_1,nfl_2,nhl_1,nhl_2,nhl_3,five_day_total
11618,"New York City, New York",2024-09-28,2024-10-02,0.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,7.0
11616,"New York City, New York",2024-09-26,2024-09-30,0.0,4.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,7.0
11615,"New York City, New York",2024-09-25,2024-09-29,0.0,5.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,7.0
11610,"New York City, New York",2024-09-20,2024-09-24,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,6.0
8704,"Los Angeles, California",2024-11-07,2024-11-11,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,2.0,2.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8172,"Las Vegas, Nevada",2024-07-20,2024-07-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8173,"Las Vegas, Nevada",2024-07-21,2024-07-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8174,"Las Vegas, Nevada",2024-07-22,2024-07-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8175,"Las Vegas, Nevada",2024-07-23,2024-07-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Integrating with original games data to make 1 table for export

In [15]:
# filter down to just five day total column and then proceed as before
city_date_game_count_df = city_date_game_count_df.drop(columns=team_cols)
city_date_game_count_df.head()

Unnamed: 0,metro_area,start_date,end_date,five_day_total
0,"Atlanta, Georgia",2024-02-17,2024-02-21,0.0
1,"Atlanta, Georgia",2024-02-18,2024-02-22,0.0
2,"Atlanta, Georgia",2024-02-19,2024-02-23,0.0
3,"Atlanta, Georgia",2024-02-20,2024-02-24,0.0
4,"Atlanta, Georgia",2024-02-21,2024-02-25,0.0


In [16]:
games_df.head(1)

Unnamed: 0,league,game_id,date,time,home_id,home_name,away_id,away_name,venue_name,city,state,full_address,lat,lon,metro_area,metro_team_num
0,mlb,mlb_152753,2024-02-22,14:10,230,San Diego Padres,218,Los Angeles Dodgers,Petco Park,San Diego,California,"100 Park Boulevard, San Diego, CA 92101",32.707188,-117.156877,"San Diego, California",mlb_1


In [17]:
games_df['earliest_start_date'] = games_df['date'] - pd.Timedelta('4 days')
games_df['latest_end_date'] = games_df['date'] + pd.Timedelta('4 days')
games_df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df['earliest_start_date'] = games_df['date'] - pd.Timedelta('4 days')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df['latest_end_date'] = games_df['date'] + pd.Timedelta('4 days')


Unnamed: 0,league,game_id,date,time,home_id,home_name,away_id,away_name,venue_name,city,state,full_address,lat,lon,metro_area,metro_team_num,earliest_start_date,latest_end_date
0,mlb,mlb_152753,2024-02-22,14:10,230,San Diego Padres,218,Los Angeles Dodgers,Petco Park,San Diego,California,"100 Park Boulevard, San Diego, CA 92101",32.707188,-117.156877,"San Diego, California",mlb_1,2024-02-18,2024-02-26


In [18]:
full_join_df = city_date_game_count_df.merge(games_df, on='metro_area')
print(len(full_join_df))
full_join_df.head(1)

2263608


Unnamed: 0,metro_area,start_date,end_date,five_day_total,league,game_id,date,time,home_id,home_name,...,away_name,venue_name,city,state,full_address,lat,lon,metro_team_num,earliest_start_date,latest_end_date
0,"Atlanta, Georgia",2024-02-17,2024-02-21,0.0,mls,mls_1150785,2024-03-09,18:30:00,1608,Atlanta United FC,...,New England Revolution,Mercedes-Benz Stadium,Atlanta,Georgia,"35 Northside Dr NW, Atlanta, GA 30313",33.755447,-84.402094,mls_1,2024-03-05,2024-03-13


In [19]:
# filters df to associate each game with all date ranges that the game falls in
games_counts_df = full_join_df[(full_join_df['start_date']>=full_join_df['earliest_start_date']) &
                               (full_join_df['end_date']<=full_join_df['latest_end_date'])]\
                               .sort_values(['metro_area', 'date'])
print(len(games_counts_df))
games_counts_df.head(5)

26758


Unnamed: 0,metro_area,start_date,end_date,five_day_total,league,game_id,date,time,home_id,home_name,...,away_name,venue_name,city,state,full_address,lat,lon,metro_team_num,earliest_start_date,latest_end_date
969,"Atlanta, Georgia",2024-03-05,2024-03-09,1.0,mls,mls_1150785,2024-03-09,18:30:00,1608,Atlanta United FC,...,New England Revolution,Mercedes-Benz Stadium,Atlanta,Georgia,"35 Northside Dr NW, Atlanta, GA 30313",33.755447,-84.402094,mls_1,2024-03-05,2024-03-13
1026,"Atlanta, Georgia",2024-03-06,2024-03-10,1.0,mls,mls_1150785,2024-03-09,18:30:00,1608,Atlanta United FC,...,New England Revolution,Mercedes-Benz Stadium,Atlanta,Georgia,"35 Northside Dr NW, Atlanta, GA 30313",33.755447,-84.402094,mls_1,2024-03-05,2024-03-13
1083,"Atlanta, Georgia",2024-03-07,2024-03-11,1.0,mls,mls_1150785,2024-03-09,18:30:00,1608,Atlanta United FC,...,New England Revolution,Mercedes-Benz Stadium,Atlanta,Georgia,"35 Northside Dr NW, Atlanta, GA 30313",33.755447,-84.402094,mls_1,2024-03-05,2024-03-13
1140,"Atlanta, Georgia",2024-03-08,2024-03-12,1.0,mls,mls_1150785,2024-03-09,18:30:00,1608,Atlanta United FC,...,New England Revolution,Mercedes-Benz Stadium,Atlanta,Georgia,"35 Northside Dr NW, Atlanta, GA 30313",33.755447,-84.402094,mls_1,2024-03-05,2024-03-13
1197,"Atlanta, Georgia",2024-03-09,2024-03-13,1.0,mls,mls_1150785,2024-03-09,18:30:00,1608,Atlanta United FC,...,New England Revolution,Mercedes-Benz Stadium,Atlanta,Georgia,"35 Northside Dr NW, Atlanta, GA 30313",33.755447,-84.402094,mls_1,2024-03-05,2024-03-13


In [20]:
# exporting results to use in Tableau
games_counts_df.to_csv('tableau/games_counts_data.csv', index=False)

## Next Steps
1. UTP - fixed 9/29, but may want to move metro_team_num to database
2. Dates - fixed 9/29 in original API calls
3. Completed teams - fixed 9/30, will not involve the database side, only a separate xlsx file
4. New next step 9/30: dynamic N-days ranges?