In [1]:
import pandas as pd
import os

In [2]:
# Read the combined schedule/sales CSV (v2) into a DataFrame.
input_path = 'Data/data_v1/grizzlys_combined_schedule_sales_v2.csv'
combined_data = pd.read_csv(input_path)

In [3]:
# Preview the first five rows as a quick sanity check of the load.
preview_rows = combined_data.head(n=5)
print(preview_rows)

   spieltag        date  weekday   time           home_team  \
0         1  2022-09-16   Friday  19.30  Grizzlys Wolfsburg   
1         4  2022-09-25   Sunday  16.30  Grizzlys Wolfsburg   
2         5  2022-09-27  Tuesday  19.30  Grizzlys Wolfsburg   
3         9  2022-10-09   Sunday  14.00  Grizzlys Wolfsburg   
4        11  2022-10-16   Sunday  19.00  Grizzlys Wolfsburg   

                 away_team  distance season             datetime  \
0          Löwen Frankfurt     369.0  22-23  2022-09-16 19:30:00   
1      Nürnberg Ice Tigers     463.0  22-23  2022-09-25 16:30:00   
2       Augsburger Panther     588.0  22-23  2022-09-27 19:30:00   
3     EHC Red Bull München     600.0  22-23  2022-10-09 14:00:00   
4  Schwenninger Wild Wings     638.0  22-23  2022-10-16 19:00:00   

   ticket_count  gross_revenue  season_tickets_sold  season_tickets_revenue  \
0        1509.0        24121.0                  978                328504.0   
1        1473.0        15696.0                  978   

In [4]:
# List the dtype pandas inferred for every column.
column_types = combined_data.dtypes
print(column_types)

spieltag                    int64
date                       object
weekday                    object
time                       object
home_team                  object
away_team                  object
distance                  float64
season                     object
datetime                   object
ticket_count              float64
gross_revenue             float64
season_tickets_sold         int64
season_tickets_revenue    float64
total_season_tickets        int64
total_season_revenue      float64
dtype: object


In [5]:
# Count the missing entries in each column.
missing_per_column = combined_data.isna().sum()
print(missing_per_column)

spieltag                  0
date                      0
weekday                   0
time                      0
home_team                 0
away_team                 0
distance                  4
season                    0
datetime                  0
ticket_count              1
gross_revenue             1
season_tickets_sold       0
season_tickets_revenue    0
total_season_tickets      0
total_season_revenue      0
dtype: int64


In [6]:
# Display every row that has at least one missing value in any column.
row_has_missing = combined_data.isna().any(axis='columns')
print(combined_data[row_has_missing])

    spieltag        date   weekday   time           home_team  \
7         17  2022-10-30    Sunday  16.30  Grizzlys Wolfsburg   
11        26  2022-12-02    Friday  19.30  Grizzlys Wolfsburg   
36        19  2023-11-16  Thursday   18.0  Grizzlys Wolfsburg   
45        39  2024-01-19    Friday   19.3  Grizzlys Wolfsburg   
50        45  2024-02-20   Tuesday   19.3  Grizzlys Wolfsburg   

                 away_team  distance season             datetime  \
7   SC Bietigheim Steelers       NaN  22-23  2022-10-30 16:30:00   
11  SC Bietigheim Steelers       NaN  22-23  2022-12-02 19:30:00   
36            Schwenningen       NaN  23-24  2023-11-16 18:00:00   
45            Schwenningen       NaN  23-24  2024-01-19 19:03:00   
50         Düsseldorfer EG     367.0  23-24  2024-02-20 19:03:00   

    ticket_count  gross_revenue  season_tickets_sold  season_tickets_revenue  \
7         1707.0        22918.0                  978                328504.0   
11        1078.0        17960.0         

In [7]:
# List the distinct opponent names to spot inconsistent spellings.
away_team_names = combined_data['away_team'].unique()
print(away_team_names)

['Löwen Frankfurt' 'Nürnberg Ice Tigers' 'Augsburger Panther'
 'EHC Red Bull München' 'Schwenninger Wild Wings' 'Pinguins Bremerhaven'
 'SC Bietigheim Steelers' 'Eisbären Berlin' 'Iserlohn Roosters'
 'Adler Mannheim' 'Kölner Haie' 'ERC Ingolstadt' 'Straubing Tigers'
 'Düsseldorfer EG' 'Schwenningen']


In [10]:
# Replace the short name "Schwenningen" with "Schwenninger Wild Wings" and
# fill in the distance that is missing on those renamed rows.
team_name = 'Schwenninger Wild Wings'
combined_data.loc[combined_data['away_team'] == 'Schwenningen', 'away_team'] = team_name

# Get the distance from another row of the same team that actually has one.
# NaN entries are skipped on purpose: after the rename, the first matching row
# could be one of the rows being repaired, and using its NaN as the reference
# would leave the gaps unfilled (and previously overwrote the valid values).
is_team = combined_data['away_team'] == team_name
known_distances = combined_data.loc[is_team, 'distance'].dropna()
if known_distances.empty:
    raise ValueError(f"No known distance available for {team_name}")
distance_value = known_distances.iloc[0]
print(f"Distance value to be used: {distance_value}")

# Update only the missing distance values; existing ones stay untouched.
combined_data.loc[is_team & combined_data['distance'].isna(), 'distance'] = distance_value

# verify the changes
print(combined_data[combined_data['away_team'] == team_name])

Distance value to be used: 638.0
    spieltag        date   weekday   time           home_team  \
4         11  2022-10-16    Sunday  19.00  Grizzlys Wolfsburg   
13        30  2022-12-13   Tuesday  19.30  Grizzlys Wolfsburg   
36        19  2023-11-16  Thursday   18.0  Grizzlys Wolfsburg   
45        39  2024-01-19    Friday   19.3  Grizzlys Wolfsburg   
55         6  2024-10-04    Friday  19.30  Grizzlys Wolfsburg   
59        17  2024-11-17    Sunday  14.00  Grizzlys Wolfsburg   

                  away_team  distance season             datetime  \
4   Schwenninger Wild Wings     638.0  22-23  2022-10-16 19:00:00   
13  Schwenninger Wild Wings     638.0  22-23  2022-12-13 19:30:00   
36  Schwenninger Wild Wings     638.0  23-24  2023-11-16 18:00:00   
45  Schwenninger Wild Wings     638.0  23-24  2024-01-19 19:03:00   
55  Schwenninger Wild Wings     638.0  24-25  2024-10-04 19:30:00   
59  Schwenninger Wild Wings     638.0  24-25  2024-11-17 14:00:00   

    ticket_count  gross_rev

In [11]:
# Re-check which rows still contain a missing value after the previous fix.
still_missing = combined_data.isna().any(axis='columns')
print(combined_data[still_missing])

    spieltag        date  weekday   time           home_team  \
7         17  2022-10-30   Sunday  16.30  Grizzlys Wolfsburg   
11        26  2022-12-02   Friday  19.30  Grizzlys Wolfsburg   
50        45  2024-02-20  Tuesday   19.3  Grizzlys Wolfsburg   

                 away_team  distance season             datetime  \
7   SC Bietigheim Steelers       NaN  22-23  2022-10-30 16:30:00   
11  SC Bietigheim Steelers       NaN  22-23  2022-12-02 19:30:00   
50         Düsseldorfer EG     367.0  23-24  2024-02-20 19:03:00   

    ticket_count  gross_revenue  season_tickets_sold  season_tickets_revenue  \
7         1707.0        22918.0                  978                328504.0   
11        1078.0        17960.0                  978                328504.0   
50           NaN            NaN                 1093                372400.0   

    total_season_tickets  total_season_revenue  
7                  71796            1640176.90  
11                 71796            1640176.90  
50

In [12]:
# Set the distance for every SC Bietigheim Steelers game to 517 km by hand.
bietigheim_rows = combined_data['away_team'] == 'SC Bietigheim Steelers'
combined_data.loc[bietigheim_rows, 'distance'] = 517

# Print the affected rows to confirm the new distance.
print(combined_data[bietigheim_rows])

    spieltag        date weekday   time           home_team  \
7         17  2022-10-30  Sunday  16.30  Grizzlys Wolfsburg   
11        26  2022-12-02  Friday  19.30  Grizzlys Wolfsburg   

                 away_team  distance season             datetime  \
7   SC Bietigheim Steelers     517.0  22-23  2022-10-30 16:30:00   
11  SC Bietigheim Steelers     517.0  22-23  2022-12-02 19:30:00   

    ticket_count  gross_revenue  season_tickets_sold  season_tickets_revenue  \
7         1707.0        22918.0                  978                328504.0   
11        1078.0        17960.0                  978                328504.0   

    total_season_tickets  total_season_revenue  
7                  71796             1640176.9  
11                 71796             1640176.9  


In [13]:
# List whatever rows are left with a missing value in any column.
remaining_gaps = combined_data.isna().any(axis='columns')
print(combined_data[remaining_gaps])

    spieltag        date  weekday  time           home_team        away_team  \
50        45  2024-02-20  Tuesday  19.3  Grizzlys Wolfsburg  Düsseldorfer EG   

    distance season             datetime  ticket_count  gross_revenue  \
50     367.0  23-24  2024-02-20 19:03:00           NaN            NaN   

    season_tickets_sold  season_tickets_revenue  total_season_tickets  \
50                 1093                372400.0                 56215   

    total_season_revenue  
50            1568768.82  


In [14]:
# Remove every row that still has at least one missing value.
combined_data = combined_data.dropna(axis=0, how='any')

# Confirm that each column now reports zero missing entries.
remaining_missing = combined_data.isna().sum()
print(remaining_missing)

spieltag                  0
date                      0
weekday                   0
time                      0
home_team                 0
away_team                 0
distance                  0
season                    0
datetime                  0
ticket_count              0
gross_revenue             0
season_tickets_sold       0
season_tickets_revenue    0
total_season_tickets      0
total_season_revenue      0
dtype: int64


In [15]:
# Write the cleaned frame to the v3 CSV, leaving out the index column.
# NOTE(review): the `time` column still mixes spellings at this point
# ('19.30' vs '19.3', '18.00 VW'), and rows with '19.3' carry a 19:03:00
# `datetime` - confirm this is normalised before the file is consumed.
output_path = 'Data/data_v1/grizzlys_combined_schedule_sales_v3.csv'
combined_data.to_csv(
    output_path,
    index=False,
)
print(f"Cleaned data saved to {output_path}")

Cleaned data saved to Data/data_v1/grizzlys_combined_schedule_sales_v3.csv


In [16]:
# List the distinct values of the `time` column to inspect their formatting.
time_values = combined_data['time'].unique()
print(time_values)

['19.30' '16.30' '14.00' '19.00' '19.3' '16.3' '14.0' '19.0' '18.0'
 '18.00' '13.00' '18.00 VW']
