# Data Preprocessing - PDF to CSV Extraction

Extract game schedule tables from PDF files for Grizzlys Wolfsburg seasons.

In [14]:
# Install required library
# pip install pdfplumber pandas

In [15]:
import pdfplumber
import pandas as pd
import os

In [16]:
# Extract tables from PDFs - using appropriate method for each
import re

base_path = "Data/data_v1"

# 1. Extract 22-23 season using text extraction
pdf_path = os.path.join(base_path, "22-23/Spielplan 22-23_GW.pdf")
csv_path = os.path.join(base_path, "22-23/spielplan_22_23.csv")
print("Processing: 22-23/Spielplan 22-23_GW.pdf")

with pdfplumber.open(pdf_path) as pdf:
    text = ""
    for page in pdf.pages:
        text += page.extract_text() + "\n"

# Parse lines: pattern like "1. 1 Fri, Sep 16, 2022 19.30 Grizzlys Wolfsburg (GER) Löwen Frankfurt (GER)"
pattern_22_23 = r'(\d+)\.\s+(\d+)\s+(\w+,\s+\w+\s+\d+,\s+\d+)\s+(\d+\.\d+|tba)\s+(.+?)\s+\(GER\)\s+(.+?)\s+\(GER\)'
matches = re.findall(pattern_22_23, text)

data_22_23 = []
for m in matches:
    data_22_23.append({
        '#': m[0],
        'ST': m[1],
        'Date': m[2],
        'Bully': m[3],
        'Home': m[4] + ' (GER)',
        'Away': m[5] + ' (GER)'
    })

df_22_23 = pd.DataFrame(data_22_23)
df_22_23.to_csv(csv_path, index=False)
print(f"  Saved: {csv_path} ({len(df_22_23)} rows)")
print(f"  Columns: {list(df_22_23.columns)}\n")

# 2. Extract 23-24 season using text extraction
pdf_path = os.path.join(base_path, "23-24/GW_Spielplan_23-24.pdf")
csv_path = os.path.join(base_path, "23-24/spielplan_23_24.csv")
print("Processing: 23-24/GW_Spielplan_23-24.pdf")

with pdfplumber.open(pdf_path) as pdf:
    text = ""
    for page in pdf.pages:
        text += page.extract_text() + "\n"

# Parse lines: pattern like "1 Freitag, 15.09.23 19.30 Uhr Grizzlys vs. Augsburg"
pattern_23_24 = r'(\d+)\s+(\w+,\s+\d+\.\d+\.\d+)\s+(\d+\.\d+)\s+Uhr\s+(.+?)(?=\n\d+\s+\w+,|\n*$|Deutschland|Länderspiel)'
matches = re.findall(pattern_23_24, text, re.DOTALL)

data_23_24 = []
for m in matches:
    data_23_24.append({
        'SPIELTAG': m[0],
        'DATUM': m[1],
        'UHRZEIT': m[2] + ' Uhr',
        'SPIELPAARUNG': m[3].strip()
    })

df_23_24 = pd.DataFrame(data_23_24)
df_23_24.to_csv(csv_path, index=False)
print(f"  Saved: {csv_path} ({len(df_23_24)} rows)")
print(f"  Columns: {list(df_23_24.columns)}\n")

# 3. Extract 24-25 season using table extraction (works correctly)
pdf_path = os.path.join(base_path, "24-25/Spielplan GW 24-25 Final-1.pdf")
csv_path = os.path.join(base_path, "24-25/spielplan_24_25.csv")
print("Processing: 24-25/Spielplan GW 24-25 Final-1.pdf")

with pdfplumber.open(pdf_path) as pdf:
    all_rows = []
    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            if table:
                all_rows.extend(table)

# Skip title row, use row 1 as header
headers = all_rows[1]
data_rows = [row for row in all_rows[2:] if row[0] and row[0] not in ['#', '']]

df_24_25 = pd.DataFrame(data_rows, columns=headers)
df_24_25 = df_24_25.dropna(how='all')
df_24_25.to_csv(csv_path, index=False)
print(f"  Saved: {csv_path} ({len(df_24_25)} rows)")
print(f"  Columns: {list(df_24_25.columns)}\n")

print("Extraction complete!")

Processing: 22-23/Spielplan 22-23_GW.pdf
  Saved: Data/data_v1/22-23/spielplan_22_23.csv (49 rows)
  Columns: ['#', 'ST', 'Date', 'Bully', 'Home', 'Away']

Processing: 23-24/GW_Spielplan_23-24.pdf
  Saved: Data/data_v1/23-24/spielplan_23_24.csv (52 rows)
  Columns: ['SPIELTAG', 'DATUM', 'UHRZEIT', 'SPIELPAARUNG']

Processing: 24-25/Spielplan GW 24-25 Final-1.pdf
  Saved: Data/data_v1/24-25/spielplan_24_25.csv (52 rows)
  Columns: ['#', 'Date', 'Weekday', 'Face-Off', 'Home', 'Away', 'Distance']

Extraction complete!


In [17]:
# Display extracted data for verification
print("=" * 50)
print("Verification - First 5 rows of each file:")
print("=" * 50)

csv_files = [
    "22-23/spielplan_22_23.csv",
    "23-24/spielplan_23_24.csv",
    "24-25/spielplan_24_25.csv",
]

for csv_file in csv_files:
    csv_path = os.path.join(base_path, csv_file)
    df = pd.read_csv(csv_path)
    print(f"\n{csv_file}:")
    display(df.head())

Verification - First 5 rows of each file:

22-23/spielplan_22_23.csv:


Unnamed: 0,#,ST,Date,Bully,Home,Away
0,1,1,"Fri, Sep 16, 2022",19.3,Grizzlys Wolfsburg (GER),Löwen Frankfurt (GER)
1,2,2,"Wed, Sep 21, 2022",19.3,Kölner Haie (GER),Grizzlys Wolfsburg (GER)
2,3,3,"Fri, Sep 23, 2022",19.3,Eisbären Berlin (GER),Grizzlys Wolfsburg (GER)
3,4,4,"Sun, Sep 25, 2022",16.3,Grizzlys Wolfsburg (GER),Nürnberg Ice Tigers (GER)
4,5,5,"Tue, Sep 27, 2022",19.3,Grizzlys Wolfsburg (GER),Augsburger Panther (GER)



23-24/spielplan_23_24.csv:


Unnamed: 0,SPIELTAG,DATUM,UHRZEIT,SPIELPAARUNG
0,1,"Freitag, 15.09.23",19.30 Uhr,Grizzlys vs. Augsburg
1,2,"Sonntag, 17.09.23",14.00 Uhr,Bremerhaven vs. Grizzlys
2,3,"Freitag, 22.09.23",19.30 Uhr,Grizzlys vs. Straubing
3,4,"Sonntag, 24.09.23",19.00 Uhr,Schwenningen vs. Grizzlys
4,5,"Freitag, 29.09.23",19.30 Uhr,Ingolstadt vs. Grizzlys



24-25/spielplan_24_25.csv:


Unnamed: 0,#,Date,Weekday,Face-Off,Home,Away,Distance
0,1,20.09.2024,Fri,19.3,Pinguins Bremerhaven (GER),Grizzlys Wolfsburg (GER),252 km (D)
1,2,22.09.2024,Sun,16.3,Grizzlys Wolfsburg (GER),Düsseldorfer EG (GER),367 km
2,3,27.09.2024,Fri,19.3,Grizzlys Wolfsburg (GER),Iserlohn Roosters (GER),304 km
3,4,29.09.2024,Sun,14.0,Löwen Frankfurt (GER),Grizzlys Wolfsburg (GER),369 km
4,5,02.10.2024,Wed,19.3,EHC Red Bull München (GER),Grizzlys Wolfsburg (GER),600 km


In [18]:
# Standardize all CSV files to uniform schema
from datetime import datetime

base_path = "Data/data_v1"

# German to English weekday mapping
german_weekdays = {
    'Montag': 'Monday', 'Dienstag': 'Tuesday', 'Mittwoch': 'Wednesday',
    'Donnerstag': 'Thursday', 'Freitag': 'Friday', 'Samstag': 'Saturday', 'Sonntag': 'Sunday'
}

# Short to full weekday mapping
short_weekdays = {
    'Mon': 'Monday', 'Tue': 'Tuesday', 'Wed': 'Wednesday',
    'Thu': 'Thursday', 'Fri': 'Friday', 'Sat': 'Saturday', 'Sun': 'Sunday'
}

# 1. Standardize 22-23
print("Standardizing 22-23...")
df = pd.read_csv(os.path.join(base_path, "22-23/spielplan_22_23.csv"))

df_std_22_23 = pd.DataFrame()
df_std_22_23['spieltag'] = df['ST']

# Parse date like "Fri, Sep 16, 2022"
df_std_22_23['date'] = pd.to_datetime(df['Date'], format='%a, %b %d, %Y')
df_std_22_23['weekday'] = df_std_22_23['date'].dt.day_name()
df_std_22_23['time'] = df['Bully']
df_std_22_23['home_team'] = df['Home']
df_std_22_23['away_team'] = df['Away']
df_std_22_23['distance'] = None
df_std_22_23['season'] = '22-23'

df_std_22_23.to_csv(os.path.join(base_path, "22-23/spielplan_22_23_std.csv"), index=False)
print(f"  Saved: 22-23/spielplan_22_23_std.csv ({len(df_std_22_23)} rows)")

# 2. Standardize 23-24
print("Standardizing 23-24...")
df = pd.read_csv(os.path.join(base_path, "23-24/spielplan_23_24.csv"))

df_std_23_24 = pd.DataFrame()
df_std_23_24['spieltag'] = df['SPIELTAG']

# Parse date like "Freitag, 15.09.23" - extract just the date part
def parse_german_date(datum):
    # Split "Freitag, 15.09.23" into weekday and date
    parts = datum.split(', ')
    if len(parts) == 2:
        date_str = parts[1]
        return pd.to_datetime(date_str, format='%d.%m.%y')
    return None

df_std_23_24['date'] = df['DATUM'].apply(parse_german_date)
df_std_23_24['weekday'] = df_std_23_24['date'].dt.day_name()

# Extract time from "19.30 Uhr"
df_std_23_24['time'] = df['UHRZEIT'].str.replace(' Uhr', '')

# Split "Grizzlys vs. Augsburg" into home and away
def split_matchup(matchup):
    parts = matchup.split(' vs. ')
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return matchup, None

home_away = df['SPIELPAARUNG'].apply(split_matchup)
df_std_23_24['home_team'] = home_away.apply(lambda x: x[0])
df_std_23_24['away_team'] = home_away.apply(lambda x: x[1])
df_std_23_24['distance'] = None
df_std_23_24['season'] = '23-24'

df_std_23_24.to_csv(os.path.join(base_path, "23-24/spielplan_23_24_std.csv"), index=False)
print(f"  Saved: 23-24/spielplan_23_24_std.csv ({len(df_std_23_24)} rows)")

# 3. Standardize 24-25
print("Standardizing 24-25...")
df = pd.read_csv(os.path.join(base_path, "24-25/spielplan_24_25.csv"))

df_std_24_25 = pd.DataFrame()
df_std_24_25['spieltag'] = df['#']

# Parse date like "20.09.2024"
df_std_24_25['date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y')

# Convert short weekday to full
df_std_24_25['weekday'] = df['Weekday'].map(short_weekdays)
df_std_24_25['time'] = df['Face-Off']
df_std_24_25['home_team'] = df['Home']
df_std_24_25['away_team'] = df['Away']
df_std_24_25['distance'] = df['Distance']
df_std_24_25['season'] = '24-25'

df_std_24_25.to_csv(os.path.join(base_path, "24-25/spielplan_24_25_std.csv"), index=False)
print(f"  Saved: 24-25/spielplan_24_25_std.csv ({len(df_std_24_25)} rows)")

print("\nStandardization complete!")

Standardizing 22-23...
  Saved: 22-23/spielplan_22_23_std.csv (49 rows)
Standardizing 23-24...
  Saved: 23-24/spielplan_23_24_std.csv (52 rows)
Standardizing 24-25...
  Saved: 24-25/spielplan_24_25_std.csv (52 rows)

Standardization complete!


In [19]:
# Combine all seasons into one CSV
print("Combining all seasons...")

# Read standardized files
df_22_23 = pd.read_csv(os.path.join(base_path, "22-23/spielplan_22_23_std.csv"))
df_23_24 = pd.read_csv(os.path.join(base_path, "23-24/spielplan_23_24_std.csv"))
df_24_25 = pd.read_csv(os.path.join(base_path, "24-25/spielplan_24_25_std.csv"))

# Combine all dataframes
df_combined = pd.concat([df_22_23, df_23_24, df_24_25], ignore_index=True)

# Sort by date
df_combined['date'] = pd.to_datetime(df_combined['date'])
df_combined = df_combined.sort_values('date').reset_index(drop=True)

# Save combined file
combined_path = os.path.join(base_path, "spielplan_combined.csv")
df_combined.to_csv(combined_path, index=False)

print(f"Saved: {combined_path}")
print(f"Total rows: {len(df_combined)}")
print(f"  - 22-23: {len(df_22_23)} games")
print(f"  - 23-24: {len(df_23_24)} games")
print(f"  - 24-25: {len(df_24_25)} games")
print(f"\nColumns: {list(df_combined.columns)}")
print("\nFirst 10 rows:")
display(df_combined.head(10))

Combining all seasons...
Saved: Data/data_v1/spielplan_combined.csv
Total rows: 153
  - 22-23: 49 games
  - 23-24: 52 games
  - 24-25: 52 games

Columns: ['spieltag', 'date', 'weekday', 'time', 'home_team', 'away_team', 'distance', 'season']

First 10 rows:


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
0,1,2022-09-16,Friday,19.3,Grizzlys Wolfsburg (GER),Löwen Frankfurt (GER),,22-23
1,2,2022-09-21,Wednesday,19.3,Kölner Haie (GER),Grizzlys Wolfsburg (GER),,22-23
2,3,2022-09-23,Friday,19.3,Eisbären Berlin (GER),Grizzlys Wolfsburg (GER),,22-23
3,4,2022-09-25,Sunday,16.3,Grizzlys Wolfsburg (GER),Nürnberg Ice Tigers (GER),,22-23
4,5,2022-09-27,Tuesday,19.3,Grizzlys Wolfsburg (GER),Augsburger Panther (GER),,22-23
5,7,2022-10-02,Sunday,14.0,Iserlohn Roosters (GER),Grizzlys Wolfsburg (GER),,22-23
6,8,2022-10-07,Friday,19.3,Pinguins Bremerhaven (GER)Grizzlys Wolfsburg (...,"8. 9 Sun, Oct 09, 2022 14.00 Grizzlys Wolfsbur...",,22-23
7,10,2022-10-14,Friday,19.3,Straubing Tigers (GER),Grizzlys Wolfsburg (GER),,22-23
8,11,2022-10-16,Sunday,19.0,Grizzlys Wolfsburg (GER),Schwenninger Wild Wings (GER),,22-23
9,13,2022-10-20,Thursday,19.3,Augsburger Panther (GER),Grizzlys Wolfsburg (GER),,22-23


In [20]:
# did some manual cleaning in the combined data csv file(Data/data_v1/spielplan_combined.csv) after this step

In [21]:
# load the  cleaned combined csv file and display first 10 rows for verification
df_final = pd.read_csv(combined_path)
print("\nFinal Combined Data - First 10 rows:")
display(df_final.head(10))


Final Combined Data - First 10 rows:


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
0,1,2022-09-16,Friday,19.3,Grizzlys Wolfsburg (GER),Löwen Frankfurt (GER),,22-23
1,2,2022-09-21,Wednesday,19.3,Kölner Haie (GER),Grizzlys Wolfsburg (GER),,22-23
2,3,2022-09-23,Friday,19.3,Eisbären Berlin (GER),Grizzlys Wolfsburg (GER),,22-23
3,4,2022-09-25,Sunday,16.3,Grizzlys Wolfsburg (GER),Nürnberg Ice Tigers (GER),,22-23
4,5,2022-09-27,Tuesday,19.3,Grizzlys Wolfsburg (GER),Augsburger Panther (GER),,22-23
5,7,2022-10-02,Sunday,14.0,Iserlohn Roosters (GER),Grizzlys Wolfsburg (GER),,22-23
6,8,2022-10-07,Friday,19.3,Pinguins Bremerhaven (GER),Grizzlys Wolfsburg (GER),,22-23
7,9,2022-10-09,Sunday,14.0,Grizzlys Wolfsburg (GER),EHC Red Bull München (GER),,22-23
8,10,2022-10-14,Friday,19.3,Straubing Tigers (GER),Grizzlys Wolfsburg (GER),,22-23
9,11,2022-10-16,Sunday,19.0,Grizzlys Wolfsburg (GER),Schwenninger Wild Wings (GER),,22-23


In [22]:
# see the total unique home teams
unique_home_teams = df_final['home_team'].unique()
print(f"\nTotal Unique Home Teams: {len(unique_home_teams)}")
print("Home Teams:")
for team in unique_home_teams:
    print(f" - {team}")


Total Unique Home Teams: 29
Home Teams:
 - Grizzlys Wolfsburg (GER)
 - Kölner Haie (GER)
 - Eisbären Berlin (GER)
 - Iserlohn Roosters (GER)
 - Pinguins Bremerhaven (GER)
 - Straubing Tigers (GER)
 - Augsburger Panther (GER)
 - Düsseldorfer EG (GER)
 - Nürnberg Ice Tigers (GER)
 - Adler Mannheim (GER)
 - ERC Ingolstadt (GER)
 - Löwen Frankfurt (GER)
 - EHC Red Bull München (GER)
 - Schwenninger Wild Wings (GER)
 - SC Bietigheim Steelers (GER)
 - Grizzlys
 - Bremerhaven
 - Schwenningen
 - Ingolstadt
 - Düsseldorf
 - München
 - Nürnberg
 - Berlin
 - Mannheim
 - Köln
 - Straubing
 - Frankfurt
 - Iserlohn
 - Augsburg


In [23]:
# remove (GER) from team names
df_final['home_team'] = df_final['home_team'].str.replace(' (GER)', '', regex=False)
df_final['away_team'] = df_final['away_team'].str.replace(' (GER)', '', regex=False)

# preview final cleaned data
print("\nFinal Cleaned Data - First 10 rows after removing (GER):")
display(df_final.head(10))


Final Cleaned Data - First 10 rows after removing (GER):


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
0,1,2022-09-16,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,,22-23
1,2,2022-09-21,Wednesday,19.3,Kölner Haie,Grizzlys Wolfsburg,,22-23
2,3,2022-09-23,Friday,19.3,Eisbären Berlin,Grizzlys Wolfsburg,,22-23
3,4,2022-09-25,Sunday,16.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,,22-23
4,5,2022-09-27,Tuesday,19.3,Grizzlys Wolfsburg,Augsburger Panther,,22-23
5,7,2022-10-02,Sunday,14.0,Iserlohn Roosters,Grizzlys Wolfsburg,,22-23
6,8,2022-10-07,Friday,19.3,Pinguins Bremerhaven,Grizzlys Wolfsburg,,22-23
7,9,2022-10-09,Sunday,14.0,Grizzlys Wolfsburg,EHC Red Bull München,,22-23
8,10,2022-10-14,Friday,19.3,Straubing Tigers,Grizzlys Wolfsburg,,22-23
9,11,2022-10-16,Sunday,19.0,Grizzlys Wolfsburg,Schwenninger Wild Wings,,22-23


In [25]:
# similarly see the unique away teams
unique_away_teams = df_final['away_team'].unique()
print(f"\nTotal Unique Away Teams: {len(unique_away_teams)}")
print("Away Teams:")
for team in unique_away_teams:
    print(f" - {team}")


Total Unique Away Teams: 30
Away Teams:
 - Löwen Frankfurt
 - Grizzlys Wolfsburg
 - Nürnberg Ice Tigers
 - Augsburger Panther
 -  EHC Red Bull München
 - Schwenninger Wild Wings
 - Pinguins Bremerhaven
 - SC Bietigheim Steelers
 - Eisbären Berlin
 - Iserlohn Roosters
 - EHC Red Bull München
 - Adler Mannheim
 - Kölner Haie
 - ERC Ingolstadt
 - Straubing Tigers
 - Düsseldorfer EG
 - Augsburg
 - Grizzlys
 - Straubing
 - Iserlohn
 - Mannheim
 - Berlin
 - Frankfurt
 - Köln
 - Schwenningen
 - Bremerhaven
 - Nürnberg
 - Ingolstadt
 - München
 - Düsseldorf


In [26]:
# map teams name to their full names (Ex: Grizzlys - Grizzlys Wolfsburg, Bremerhaven -> Pinguins Bremerhaven,  etc.)
team_name_mapping = {
    'Grizzlys': 'Grizzlys Wolfsburg',
    'Bremerhaven': 'Pinguins Bremerhaven',
    'Augsburg': 'Augsburger Panther',
    'Nürnberg': 'Nürnberg Ice Tigers',
    'München': 'EHC Red Bull München',
    'Düsseldorf': 'Düsseldorfer EG',
    'Ingolstadt': 'ERC Ingolstadt',
    'Straubing': 'Straubing Tigers',
    'Köln': 'Kölner Haie',
    'Schwenninger': 'Schwenninger Wild Wings',
    'Bietigheim Steelers': 'SC Bietigheim Steelers',
    'Berlin': 'Eisbären Berlin',
    'Frankfurt': 'Löwen Frankfurt',
    'Mannheim': 'Adler Mannheim',
    'Iserlohn': 'Iserlohn Roosters',
    # Add more mappings as needed
}

df_final['home_team'] = df_final['home_team'].replace(team_name_mapping)
df_final['away_team'] = df_final['away_team'].replace(team_name_mapping)

# preview final cleaned data after mapping team names
print("\nFinal Cleaned Data - First 10 rows after mapping team names:")
display(df_final.head(10))


Final Cleaned Data - First 10 rows after mapping team names:


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
0,1,2022-09-16,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,,22-23
1,2,2022-09-21,Wednesday,19.3,Kölner Haie,Grizzlys Wolfsburg,,22-23
2,3,2022-09-23,Friday,19.3,Eisbären Berlin,Grizzlys Wolfsburg,,22-23
3,4,2022-09-25,Sunday,16.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,,22-23
4,5,2022-09-27,Tuesday,19.3,Grizzlys Wolfsburg,Augsburger Panther,,22-23
5,7,2022-10-02,Sunday,14.0,Iserlohn Roosters,Grizzlys Wolfsburg,,22-23
6,8,2022-10-07,Friday,19.3,Pinguins Bremerhaven,Grizzlys Wolfsburg,,22-23
7,9,2022-10-09,Sunday,14.0,Grizzlys Wolfsburg,EHC Red Bull München,,22-23
8,10,2022-10-14,Friday,19.3,Straubing Tigers,Grizzlys Wolfsburg,,22-23
9,11,2022-10-16,Sunday,19.0,Grizzlys Wolfsburg,Schwenninger Wild Wings,,22-23


In [27]:
# unique home teams after mapping
unique_home_teams_mapped = df_final['home_team'].unique()
print(f"\nTotal Unique Home Teams after mapping: {len(unique_home_teams_mapped)}")
print("Home Teams after mapping:")
for team in unique_home_teams_mapped:
    print(f" - {team}")


Total Unique Home Teams after mapping: 16
Home Teams after mapping:
 - Grizzlys Wolfsburg
 - Kölner Haie
 - Eisbären Berlin
 - Iserlohn Roosters
 - Pinguins Bremerhaven
 - Straubing Tigers
 - Augsburger Panther
 - Düsseldorfer EG
 - Nürnberg Ice Tigers
 - Adler Mannheim
 - ERC Ingolstadt
 - Löwen Frankfurt
 - EHC Red Bull München
 - Schwenninger Wild Wings
 - SC Bietigheim Steelers
 - Schwenningen


In [28]:
# unique away teams after mapping
unique_away_teams_mapped = df_final['away_team'].unique()
print(f"\nTotal Unique Away Teams after mapping: {len(unique_away_teams_mapped)}")
print("Away Teams after mapping:")
for team in unique_away_teams_mapped:
    print(f" - {team}")


Total Unique Away Teams after mapping: 17
Away Teams after mapping:
 - Löwen Frankfurt
 - Grizzlys Wolfsburg
 - Nürnberg Ice Tigers
 - Augsburger Panther
 -  EHC Red Bull München
 - Schwenninger Wild Wings
 - Pinguins Bremerhaven
 - SC Bietigheim Steelers
 - Eisbären Berlin
 - Iserlohn Roosters
 - EHC Red Bull München
 - Adler Mannheim
 - Kölner Haie
 - ERC Ingolstadt
 - Straubing Tigers
 - Düsseldorfer EG
 - Schwenningen


In [29]:
# remove leading and trailing spaces from team names
df_final['home_team'] = df_final['home_team'].str.strip()
df_final['away_team'] = df_final['away_team'].str.strip()

# print cleaned home and away team names
print("\nCleaned Home and Away Team Names:")
print("Home Teams after cleaning:")
for team in df_final['home_team'].unique():
    print(f" - {team}")
print("Away Teams after cleaning:")
for team in df_final['away_team'].unique():
    print(f" - {team}")


Cleaned Home and Away Team Names:
Home Teams after cleaning:
 - Grizzlys Wolfsburg
 - Kölner Haie
 - Eisbären Berlin
 - Iserlohn Roosters
 - Pinguins Bremerhaven
 - Straubing Tigers
 - Augsburger Panther
 - Düsseldorfer EG
 - Nürnberg Ice Tigers
 - Adler Mannheim
 - ERC Ingolstadt
 - Löwen Frankfurt
 - EHC Red Bull München
 - Schwenninger Wild Wings
 - SC Bietigheim Steelers
 - Schwenningen
Away Teams after cleaning:
 - Löwen Frankfurt
 - Grizzlys Wolfsburg
 - Nürnberg Ice Tigers
 - Augsburger Panther
 - EHC Red Bull München
 - Schwenninger Wild Wings
 - Pinguins Bremerhaven
 - SC Bietigheim Steelers
 - Eisbären Berlin
 - Iserlohn Roosters
 - Adler Mannheim
 - Kölner Haie
 - ERC Ingolstadt
 - Straubing Tigers
 - Düsseldorfer EG
 - Schwenningen


In [30]:
#  check if home and away team names are consistent
inconsistent_teams = set(df_final['home_team']).symmetric_difference(set(df_final['away_team']))
if inconsistent_teams:
    print("\nInconsistent team names found between home and away teams:")
    for team in inconsistent_teams:
        print(f" - {team}")
else:
    print("\nAll team names are consistent between home and away teams.")


All team names are consistent between home and away teams.


In [31]:
# remove all rows where home team is not "Grizzlys Wolfsburg"and save the final cleaned data to a new csv file
df_final_filtered = df_final[df_final['home_team'] == 'Grizzlys Wolfsburg'].reset_index(drop=True)

# print final cleaned data
print("\nFinal Cleaned Data - First 10 rows for Grizzlys Wolfsburg home games:")
display(df_final_filtered.head(10))

final_cleaned_path = os.path.join(base_path, "spielplan_Grizzlys_only_final_cleaned.csv")
df_final_filtered.to_csv(final_cleaned_path, index=False)
print(f"\nFinal cleaned data saved to: {final_cleaned_path}")


Final Cleaned Data - First 10 rows for Grizzlys Wolfsburg home games:


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
0,1,2022-09-16,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,,22-23
1,4,2022-09-25,Sunday,16.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,,22-23
2,5,2022-09-27,Tuesday,19.3,Grizzlys Wolfsburg,Augsburger Panther,,22-23
3,9,2022-10-09,Sunday,14.0,Grizzlys Wolfsburg,EHC Red Bull München,,22-23
4,11,2022-10-16,Sunday,19.0,Grizzlys Wolfsburg,Schwenninger Wild Wings,,22-23
5,12,2022-10-18,Wednesday,19.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,,22-23
6,14,2022-10-23,Sunday,14.0,Grizzlys Wolfsburg,Pinguins Bremerhaven,,22-23
7,17,2022-10-30,Sunday,16.3,Grizzlys Wolfsburg,SC Bietigheim Steelers,,22-23
8,19,2022-11-04,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,,22-23
9,21,2022-11-18,Friday,19.3,Grizzlys Wolfsburg,Eisbären Berlin,,22-23



Final cleaned data saved to: Data/data_v1/spielplan_Grizzlys_only_final_cleaned.csv


In [35]:
# load the final cleaned data for grizzlys only
df_grizzlys = pd.read_csv(final_cleaned_path)

In [36]:
print("\nFinal Grizzlys Wolfsburg Home Games Data - Last 10 rows:")    
display(df_grizzlys.tail(10))


Final Grizzlys Wolfsburg Home Games Data - Last 10 rows:


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
68,34,2025-01-05,Sunday,16.30,Grizzlys Wolfsburg,Straubing Tigers,560 km,24-25
69,36,2025-01-12,Sunday,14.00,Grizzlys Wolfsburg,Pinguins Bremerhaven,252 km (D),24-25
70,38,2025-01-19,Sunday,16.30,Grizzlys Wolfsburg,Augsburger Panther,588 km,24-25
71,40,2025-01-26,Sunday,19.00,Grizzlys Wolfsburg,EHC Red Bull München,600 km,24-25
72,41,2025-01-28,Tuesday,19.30,Grizzlys Wolfsburg,Löwen Frankfurt,369 km,24-25
73,43,2025-02-02,Sunday,16.30,Grizzlys Wolfsburg,Iserlohn Roosters,304 km,24-25
74,45,2025-02-16,Sunday,14.00,Grizzlys Wolfsburg,Pinguins Bremerhaven,252 km (D),24-25
75,47,2025-02-21,Friday,18.00 VW,Grizzlys Wolfsburg,ERC Ingolstadt,526 km,24-25
76,49,2025-02-28,Friday,19.30,Grizzlys Wolfsburg,Kölner Haie,376 km,24-25
77,51,2025-03-04,Tuesday,19.30,Grizzlys Wolfsburg,Straubing Tigers,560 km,24-25


In [37]:
# for the distance column, update distance from text type to numeric type(560 km  -> 560)

# Convert distance from text to numeric (e.g., "560 km" -> 560)
df_grizzlys['distance'] = df_grizzlys['distance'].str.extract(r'(\d+)').astype(float)

# preview final grizzlys data with numeric distance
print("\nFinal Grizzlys Wolfsburg Home Games Data - Last 10 rows:")    
display(df_grizzlys.tail(10))


Final Grizzlys Wolfsburg Home Games Data - Last 10 rows:


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
68,34,2025-01-05,Sunday,16.30,Grizzlys Wolfsburg,Straubing Tigers,560.0,24-25
69,36,2025-01-12,Sunday,14.00,Grizzlys Wolfsburg,Pinguins Bremerhaven,252.0,24-25
70,38,2025-01-19,Sunday,16.30,Grizzlys Wolfsburg,Augsburger Panther,588.0,24-25
71,40,2025-01-26,Sunday,19.00,Grizzlys Wolfsburg,EHC Red Bull München,600.0,24-25
72,41,2025-01-28,Tuesday,19.30,Grizzlys Wolfsburg,Löwen Frankfurt,369.0,24-25
73,43,2025-02-02,Sunday,16.30,Grizzlys Wolfsburg,Iserlohn Roosters,304.0,24-25
74,45,2025-02-16,Sunday,14.00,Grizzlys Wolfsburg,Pinguins Bremerhaven,252.0,24-25
75,47,2025-02-21,Friday,18.00 VW,Grizzlys Wolfsburg,ERC Ingolstadt,526.0,24-25
76,49,2025-02-28,Friday,19.30,Grizzlys Wolfsburg,Kölner Haie,376.0,24-25
77,51,2025-03-04,Tuesday,19.30,Grizzlys Wolfsburg,Straubing Tigers,560.0,24-25


In [38]:
# Create mapping of away_team to distance from 24-25 season
distance_map = df_grizzlys[df_grizzlys['season'] == '24-25'].groupby('away_team')['distance'].first().to_dict()

# print distance mapping for verification
print("\nDistance Mapping from 24-25 season:")
for team, dist in distance_map.items():
    print(f" - {team}: {dist} km")


Distance Mapping from 24-25 season:
 - Adler Mannheim: 442.0 km
 - Augsburger Panther: 588.0 km
 - Düsseldorfer EG: 367.0 km
 - EHC Red Bull München: 600.0 km
 - ERC Ingolstadt: 526.0 km
 - Eisbären Berlin: 228.0 km
 - Iserlohn Roosters: 304.0 km
 - Kölner Haie: 376.0 km
 - Löwen Frankfurt: 369.0 km
 - Nürnberg Ice Tigers: 463.0 km
 - Pinguins Bremerhaven: 252.0 km
 - Schwenninger Wild Wings: 638.0 km
 - Straubing Tigers: 560.0 km


In [39]:
# Fill NaN distances using the above mapping
df_grizzlys['distance'] = df_grizzlys.apply(
    lambda row: distance_map.get(row['away_team'], row['distance']) 
    if pd.isna(row['distance']) else row['distance'],
    axis=1
)

# preview final grizzlys data after filling distances
print("\nFinal Grizzlys Wolfsburg Home Games Data after filling distances - First 10 rows:")    
display(df_grizzlys.head(10))


Final Grizzlys Wolfsburg Home Games Data after filling distances - First 10 rows:


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
0,1,2022-09-16,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,369.0,22-23
1,4,2022-09-25,Sunday,16.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,463.0,22-23
2,5,2022-09-27,Tuesday,19.3,Grizzlys Wolfsburg,Augsburger Panther,588.0,22-23
3,9,2022-10-09,Sunday,14.0,Grizzlys Wolfsburg,EHC Red Bull München,600.0,22-23
4,11,2022-10-16,Sunday,19.0,Grizzlys Wolfsburg,Schwenninger Wild Wings,638.0,22-23
5,12,2022-10-18,Wednesday,19.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,463.0,22-23
6,14,2022-10-23,Sunday,14.0,Grizzlys Wolfsburg,Pinguins Bremerhaven,252.0,22-23
7,17,2022-10-30,Sunday,16.3,Grizzlys Wolfsburg,SC Bietigheim Steelers,,22-23
8,19,2022-11-04,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,369.0,22-23
9,21,2022-11-18,Friday,19.3,Grizzlys Wolfsburg,Eisbären Berlin,228.0,22-23


In [40]:
# save the final cleaned data with filled distances
final_grizzlys_path = os.path.join(base_path, "spielplan_Grizzlys_only_final_cleaned_v2.csv")
df_grizzlys.to_csv(final_grizzlys_path, index=False)
print(f"\nFinal Grizzlys data with filled distances saved to {final_grizzlys_path}")


Final Grizzlys data with filled distances saved to Data/data_v1/spielplan_Grizzlys_only_final_cleaned_v2.csv


In [41]:
# see the datatype of each column in the final grizzlys data
print("\nData Types in Final Grizzlys Data:")
print(df_grizzlys.dtypes)


Data Types in Final Grizzlys Data:
spieltag       int64
date          object
weekday       object
time          object
home_team     object
away_team     object
distance     float64
season        object
dtype: object


In [48]:
# create a column for date in datetime format by parsing the existing date column and time column
time_parts = df_grizzlys['time'].str.extract(r'^(\d{1,2})\.(\d{1,2})')
df_grizzlys['datetime'] = pd.to_datetime(
    df_grizzlys['date'].str.strip() + ' ' + 
    time_parts[0].str.zfill(2) + ':' + time_parts[1].str.zfill(2)
)

# preview final grizzlys data after adding date_time column
print("\nFinal Grizzlys Wolfsburg Home Games Data after adding date_time column - First 10 rows:")    
display(df_grizzlys.head(10))


Final Grizzlys Wolfsburg Home Games Data after adding date_time column - First 10 rows:


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season,date_time,datetime
0,1,2022-09-16,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,369.0,22-23,NaT,2022-09-16 19:30:00
1,4,2022-09-25,Sunday,16.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,463.0,22-23,NaT,2022-09-25 16:30:00
2,5,2022-09-27,Tuesday,19.3,Grizzlys Wolfsburg,Augsburger Panther,588.0,22-23,NaT,2022-09-27 19:30:00
3,9,2022-10-09,Sunday,14.0,Grizzlys Wolfsburg,EHC Red Bull München,600.0,22-23,NaT,2022-10-09 14:00:00
4,11,2022-10-16,Sunday,19.0,Grizzlys Wolfsburg,Schwenninger Wild Wings,638.0,22-23,NaT,2022-10-16 19:00:00
5,12,2022-10-18,Wednesday,19.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,463.0,22-23,NaT,2022-10-18 19:30:00
6,14,2022-10-23,Sunday,14.0,Grizzlys Wolfsburg,Pinguins Bremerhaven,252.0,22-23,NaT,2022-10-23 14:00:00
7,17,2022-10-30,Sunday,16.3,Grizzlys Wolfsburg,SC Bietigheim Steelers,,22-23,NaT,2022-10-30 16:30:00
8,19,2022-11-04,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,369.0,22-23,NaT,2022-11-04 19:30:00
9,21,2022-11-18,Friday,19.3,Grizzlys Wolfsburg,Eisbären Berlin,228.0,22-23,NaT,2022-11-18 19:30:00


In [49]:
# remove column date_time 
df_grizzlys = df_grizzlys.drop(columns=['date_time'])

# save the final cleaned data with filled distances
final_grizzlys_path = os.path.join(base_path, "spielplan_Grizzlys_only_final_cleaned_v3.sv")
df_grizzlys.to_csv(final_grizzlys_path, index=False)
print(f"\nFinal Grizzlys data with filled distances saved to {final_grizzlys_path}")


Final Grizzlys data with filled distances saved to Data/data_v1/spielplan_Grizzlys_only_final_cleaned_v3.sv
