In [30]:
import re
import pandas as pd

In [31]:
# Load the raw match data from the Excel workbook into a DataFrame
file_path = '/Users/youri/VSC Data/Data Analytics/Fussballdaten/Premierlegue_data_15_23.xlsx'
df = pd.read_excel(file_path)

# Show the first few rows of the table as a sanity check
df.head()

Unnamed: 0,matchday_raw,hometeam_raw,awayteam_raw,final_score_raw,goal_raw,goal_time_raw,extratime_raw
0,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:4,,
1,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:3,,
2,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:2,,
3,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,3:2,,
4,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,2:2,,


In [32]:
def clean_data(df):
    """
    Drop rows that carry no match information at all.

    A row is removed only when every one of 'final_score_raw', 'goal_raw',
    'goal_time_raw' and 'extratime_raw' is missing; rows with at least one
    of these values are kept. The input frame is not modified.

    :param df: The DataFrame to clean.
    :return: A new DataFrame without the all-empty rows.
    """
    payload_columns = [
        'final_score_raw',
        'goal_raw',
        'goal_time_raw',
        'extratime_raw',
    ]
    # how='all' -> a row is dropped only if *all* payload columns are NaN
    return df.dropna(how='all', subset=payload_columns)

# Example usage: drop the all-empty rows and preview the result
df_cleaned = clean_data(df)
df_cleaned.head()


Unnamed: 0,matchday_raw,hometeam_raw,awayteam_raw,final_score_raw,goal_raw,goal_time_raw,extratime_raw
0,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:4,,
1,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:3,,
2,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:2,,
3,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,3:2,,
4,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,2:2,,


In [33]:
# Functions for extracting the matchday (Spieltag) and the season (Saison)
def extract_spieltag(text):
    """
    Return the matchday number found in a matchday header string.

    Searches for one or more digits directly followed by ". Spieltag" and
    returns those digits as a string (e.g. '34' for
    "Bundesliga 2022/2023 - 34. Spieltag - ...").

    :param text: Header text to search. Non-string values (None, or the
                 float NaN pandas uses for empty cells) are treated as
                 "no match" instead of raising a TypeError in re.search.
    :return: The matchday digits as str, or None if nothing matches.
    """
    if not isinstance(text, str):
        return None
    match = re.search(r'(\d+)\. Spieltag', text)
    return match.group(1) if match else None

def extract_saison(text):
    """
    Return the season found in a matchday header string.

    Searches for two four-digit numbers separated by a slash
    (e.g. '2022/2023') and returns the matched text.

    :param text: Header text to search. Non-string values (None, or the
                 float NaN pandas uses for empty cells) are treated as
                 "no match" instead of raising a TypeError in re.search.
    :return: The season as str in 'YYYY/YYYY' form, or None if nothing matches.
    """
    if not isinstance(text, str):
        return None
    match = re.search(r'\b(\d{4}/\d{4})\b', text)
    return match.group(1) if match else None

# Test the functions with a sample header text
example_text = "Bundesliga 2022/2023 - 34. Spieltag - Sa., 27...."

test_spieltag = extract_spieltag(example_text)
test_saison = extract_saison(example_text)

# Display both extracted values as the cell output
test_spieltag, test_saison

('34', '2022/2023')

In [34]:
# Build the enriched frame as a NEW DataFrame. assign() never writes into
# df_cleaned, so the earlier frame stays exactly as it was displayed and
# re-running this cell gives the same result. (Wrapping df_cleaned in
# pd.DataFrame(...) does not copy the data, so adding columns to the wrapper
# can leak into df_cleaned depending on the pandas version.)
df_example = df_cleaned.assign(
    # Constant league label for every row
    League='Premierlegue',
    # Matchday number and season parsed from the matchday header text
    Spieltag=df_cleaned['matchday_raw'].apply(extract_spieltag),
    Saison=df_cleaned['matchday_raw'].apply(extract_saison),
)

# Show the first few rows of the enriched DataFrame
df_example.head()

Unnamed: 0,matchday_raw,hometeam_raw,awayteam_raw,final_score_raw,goal_raw,goal_time_raw,extratime_raw,League,Spieltag,Saison
0,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:4,,,Premierlegue,38,2022/2023
1,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:3,,,Premierlegue,38,2022/2023
2,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,4:2,,,Premierlegue,38,2022/2023
3,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,3:2,,,Premierlegue,38,2022/2023
4,"Premier League 2022/2023 - 38. Spieltag - So.,...",FC Southampton,FC Liverpool,,2:2,,,Premierlegue,38,2022/2023


In [35]:
def merge_goal_times(df):
    """
    Recode the 'extratime_raw' column to an integer flag, match by match.

    Every value equal to the string 'YES' becomes 1; everything else
    ('NO', NaN, any other value) becomes 0. Rows are grouped by
    ('matchday_raw', 'hometeam_raw', 'awayteam_raw') and returned in the
    group order produced by groupby, so rows of the same match stay together.
    The input frame is not modified.

    NOTE: a later cell defines another function with the same name; once that
    cell has run, this definition is no longer reachable in the kernel.

    :param df: DataFrame to be processed.
    :return: New DataFrame with 'extratime_raw' recoded to 0/1. An empty
             input yields an empty frame with the same columns.
    """
    match_keys = ['matchday_raw', 'hometeam_raw', 'awayteam_raw']

    recoded_matches = []
    for _, match_rows in df.groupby(match_keys):
        # Work on a copy: writing into the slice handed out by groupby
        # triggers SettingWithCopyWarning and may not behave consistently.
        match_rows = match_rows.copy()
        # 'YES' -> 1; 'NO', NaN and anything else -> 0
        match_rows['extratime_raw'] = match_rows['extratime_raw'].eq('YES').astype('int64')
        recoded_matches.append(match_rows)

    # pd.concat raises ValueError on an empty list, so handle "no matches" explicitly
    if not recoded_matches:
        return df.iloc[0:0].copy()

    return pd.concat(recoded_matches)

# Example of using the function: df_cleaned is rebound to the recoded
# version of df_example, then the first 10 rows are previewed
# df_example = pd.DataFrame(...)  # Replace with your DataFrame
df_cleaned = merge_goal_times(df_example)
df_cleaned.head(10)


Unnamed: 0,matchday_raw,hometeam_raw,awayteam_raw,final_score_raw,goal_raw,goal_time_raw,extratime_raw,League,Spieltag,Saison
19235,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,1:3,,0,Premierlegue,1,2014/2015
19236,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,1:2,,0,Premierlegue,1,2014/2015
19237,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,1:1,,0,Premierlegue,1,2014/2015
19238,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,1:0,,0,Premierlegue,1,2014/2015
19239,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,,34',0,Premierlegue,1,2014/2015
19240,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,,21',0,Premierlegue,1,2014/2015
19241,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,,17',0,Premierlegue,1,2014/2015
19242,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,,14',0,Premierlegue,1,2014/2015
19284,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,,1:2,,0,Premierlegue,1,2014/2015
19285,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,,1:1,,0,Premierlegue,1,2014/2015


In [36]:
def merge_goal_times(df):
    """
    Put goals, goal times and extra-time flags of a match on the same rows.

    Rows are grouped by ('matchday_raw', 'hometeam_raw', 'awayteam_raw').
    Within each match the non-missing values of 'goal_raw', 'goal_time_raw'
    and 'extratime_raw' are collected separately (in row order) and written
    back side by side: the i-th goal, the i-th time and the i-th flag share
    one row. The shorter lists are padded with None, so no value is dropped
    when a match has more times than goals (or vice versa). Only the first
    max(len(goals), len(times), len(flags)) rows of each match are kept.
    The input frame is not modified.

    NOTE: this definition reuses the name of a function defined in an earlier
    cell and replaces it once this cell has run.

    :param df: The DataFrame to process.
    :return: New DataFrame with the compacted rows of all matches. An empty
             input yields an empty frame with the same columns.
    """
    match_keys = ['matchday_raw', 'hometeam_raw', 'awayteam_raw']
    value_columns = ['goal_raw', 'goal_time_raw', 'extratime_raw']

    merged_matches = []
    for _, match_rows in df.groupby(match_keys):
        # Non-missing values per column, in their original row order
        compacted = {col: match_rows[col].dropna().tolist() for col in value_columns}

        # Number of rows needed to hold the longest of the three lists
        n_rows = max(len(values) for values in compacted.values())

        # Keep the first n_rows rows of the match as the carrier for the
        # remaining columns, then overwrite the three value columns
        match_out = match_rows.iloc[:n_rows].copy()
        for col, values in compacted.items():
            # Pad the shorter lists with None so all columns have n_rows entries
            match_out[col] = values + [None] * (n_rows - len(values))

        merged_matches.append(match_out)

    # pd.concat raises ValueError on an empty list, so handle "no matches" explicitly
    if not merged_matches:
        return df.iloc[0:0].copy()

    return pd.concat(merged_matches)

# Example usage: df_cleaned is rebound to the merged version of df_example
# (this calls the merge_goal_times defined in this cell), then the first
# 20 rows are previewed
# df_example = pd.DataFrame(...)  # Replace this with your DataFrame
df_cleaned = merge_goal_times(df_example)
df_cleaned.head(20)


Unnamed: 0,matchday_raw,hometeam_raw,awayteam_raw,final_score_raw,goal_raw,goal_time_raw,extratime_raw,League,Spieltag,Saison
19235,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,1:3,34',0.0,Premierlegue,1,2014/2015
19236,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,1:2,21',0.0,Premierlegue,1,2014/2015
19237,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,1:1,17',0.0,Premierlegue,1,2014/2015
19238,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,,1:0,14',0.0,Premierlegue,1,2014/2015
19284,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,,1:2,72',0.0,Premierlegue,1,2014/2015
19285,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,,1:1,53',0.0,Premierlegue,1,2014/2015
19286,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,,0:1,28',0.0,Premierlegue,1,2014/2015
19276,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Leicester City,FC Everton,,2:2,86',0.0,Premierlegue,1,2014/2015
19277,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Leicester City,FC Everton,,1:2,45',0.0,Premierlegue,1,2014/2015
19278,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Leicester City,FC Everton,,1:1,22',0.0,Premierlegue,1,2014/2015


In [37]:
def _goal_time_key(time_text):
    """Turn a goal-time label such as "9'" or "90'+3" into a sortable (minute, added) tuple."""
    numbers = re.findall(r'\d+', str(time_text))
    minute = int(numbers[0]) if numbers else -1  # unparseable labels sort first
    added = int(numbers[1]) if len(numbers) > 1 else 0
    return minute, added


def _goal_total(score_text):
    """Return the total number of goals in a score label such as '2:1' (0 if unparseable)."""
    numbers = re.findall(r'\d+', str(score_text))
    return sum(int(number) for number in numbers[:2])


def update_final_score(df):
    """
    Updates the final_score_raw based on the latest goal in each match.
    Matches without goals receive a final_score_raw of '0:0'.

    Rows are grouped by ('matchday_raw', 'hometeam_raw', 'awayteam_raw').
    Within a match the non-missing 'goal_raw' and 'goal_time_raw' values are
    paired by position, and the score of the goal with the latest time is
    written to 'final_score_raw' on every row of that match.

    Goal times are compared numerically. Comparing the raw strings would
    rank "9'" after "86'" and pick an early goal as the final score. Goals
    with the same time are ordered by the total number of goals in the score.
    The input frame is not modified.

    :param df: DataFrame to be processed.
    :return: Updated DataFrame with the new final_score_raw. An empty input
             yields an empty frame with the same columns.
    """
    match_keys = ['matchday_raw', 'hometeam_raw', 'awayteam_raw']

    updated_matches = []
    for _, match_rows in df.groupby(match_keys):
        # Non-missing goals and goal times, each in row order
        goals = match_rows['goal_raw'].dropna().tolist()
        times = match_rows['goal_time_raw'].dropna().tolist()

        if goals and times:
            # The chronologically last goal carries the final score
            _, final_score = max(
                zip(times, goals),
                key=lambda pair: (_goal_time_key(pair[0]), _goal_total(pair[1])),
            )
        else:
            # No goals scored, so the score is '0:0'
            final_score = '0:0'

        # Write the final score to every row of the match (on a copy)
        match_out = match_rows.copy()
        match_out['final_score_raw'] = final_score
        updated_matches.append(match_out)

    # pd.concat raises ValueError on an empty list, so handle "no matches" explicitly
    if not updated_matches:
        return df.iloc[0:0].copy()

    return pd.concat(updated_matches)

# Apply the function to the merged per-goal frame and preview the first 10 rows
df_updated = update_final_score(df_cleaned)
df_updated.head(10)


Unnamed: 0,matchday_raw,hometeam_raw,awayteam_raw,final_score_raw,goal_raw,goal_time_raw,extratime_raw,League,Spieltag,Saison
19235,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,1:3,1:3,34',0.0,Premierlegue,1,2014/2015
19236,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,1:3,1:2,21',0.0,Premierlegue,1,2014/2015
19237,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,1:3,1:1,17',0.0,Premierlegue,1,2014/2015
19238,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,1:3,1:0,14',0.0,Premierlegue,1,2014/2015
19284,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,1:2,1:2,72',0.0,Premierlegue,1,2014/2015
19285,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,1:2,1:1,53',0.0,Premierlegue,1,2014/2015
19286,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,1:2,0:1,28',0.0,Premierlegue,1,2014/2015
19276,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Leicester City,FC Everton,2:2,2:2,86',0.0,Premierlegue,1,2014/2015
19277,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Leicester City,FC Everton,2:2,1:2,45',0.0,Premierlegue,1,2014/2015
19278,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Leicester City,FC Everton,2:2,1:1,22',0.0,Premierlegue,1,2014/2015


In [38]:
def remove_empty_goals(df):
    """
    Drop every row whose 'goal_raw' value is missing.

    The input frame is not modified.

    :param df: The DataFrame to process.
    :return: A new DataFrame containing only rows with a 'goal_raw' value.
    """
    required_columns = ['goal_raw']
    return df.dropna(subset=required_columns)

# Apply the function: df_updated is rebound to the frame without
# rows that have no goal, then the first rows are previewed
df_updated = remove_empty_goals(df_updated)
df_updated.head()


Unnamed: 0,matchday_raw,hometeam_raw,awayteam_raw,final_score_raw,goal_raw,goal_time_raw,extratime_raw,League,Spieltag,Saison
19235,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,1:3,1:3,34',0.0,Premierlegue,1,2014/2015
19236,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,1:3,1:2,21',0.0,Premierlegue,1,2014/2015
19237,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,1:3,1:1,17',0.0,Premierlegue,1,2014/2015
19238,"Premier League 2014/2015 - 1. Spieltag - Mo., ...",FC Burnley,FC Chelsea,1:3,1:0,14',0.0,Premierlegue,1,2014/2015
19284,"Premier League 2014/2015 - 1. Spieltag - Sa., ...",Manchester United,Swansea City,1:2,1:2,72',0.0,Premierlegue,1,2014/2015


In [39]:
def rearrange_columns(df):
    """
    Remove the 'matchday_raw' column and put the remaining columns in a fixed order.

    The result contains exactly these columns, in this order: League, Saison,
    Spieltag, hometeam_raw, awayteam_raw, final_score_raw, goal_raw,
    goal_time_raw, extratime_raw. Any other column of the input is not part
    of the result. The input frame is not modified.

    :param df: The DataFrame to process.
    :return: A new DataFrame with the columns in the new order.
    """
    ordered_columns = [
        'League', 'Saison', 'Spieltag',
        'hometeam_raw', 'awayteam_raw',
        'final_score_raw', 'goal_raw', 'goal_time_raw', 'extratime_raw',
    ]

    # Drop the raw matchday header first (raises KeyError if it is absent),
    # then select the remaining columns in the target order
    without_matchday = df.drop(columns=['matchday_raw'])
    return without_matchday[ordered_columns]

# Apply the function to the cleaned frame and preview the reordered result
df_ord = rearrange_columns(df_updated)
df_ord.head()

Unnamed: 0,League,Saison,Spieltag,hometeam_raw,awayteam_raw,final_score_raw,goal_raw,goal_time_raw,extratime_raw
19235,Premierlegue,2014/2015,1,FC Burnley,FC Chelsea,1:3,1:3,34',0.0
19236,Premierlegue,2014/2015,1,FC Burnley,FC Chelsea,1:3,1:2,21',0.0
19237,Premierlegue,2014/2015,1,FC Burnley,FC Chelsea,1:3,1:1,17',0.0
19238,Premierlegue,2014/2015,1,FC Burnley,FC Chelsea,1:3,1:0,14',0.0
19284,Premierlegue,2014/2015,1,Manchester United,Swansea City,1:2,1:2,72',0.0


In [None]:
def save_to_excel(df, path="/Users/youri/VSC Data/Data Analytics/Premierlegue_Bereinigt.xlsx"):
    """
    Write the given DataFrame to an Excel file, without the index column.

    Errors are not raised to the caller: any exception from the write is
    caught and reported in the returned message instead.

    :param df: The DataFrame to save.
    :param path: Full path and file name of the Excel file to create.
    :return: A status message (str) describing success or the error that occurred.
    """
    try:
        df.to_excel(path, index=False)
    except Exception as error:
        # Report the failure as text rather than interrupting the notebook
        return f"Ein Fehler ist aufgetreten: {error}"
    else:
        return f"Datei '{path}' wurde erfolgreich gespeichert."

# Save the reordered DataFrame (df_ord) to an Excel file at the default path;
# the returned status message is shown as the cell output
save_to_excel(df_ord)

In [1]:
# Path to the Excel file
# NOTE(review): this path differs from the default path of save_to_excel
# (which has no 'Fussballdaten/Premierlegue/' folder) - confirm the workbook
# was moved here before this cell is run.
excel_path = '/Users/youri/VSC Data/Data Analytics/Fussballdaten/Premierlegue/Premierlegue_Bereinigt.xlsx'  # Update this to your Excel file path

# Read the Excel file
# NOTE(review): relies on `pd` having been imported by an earlier cell in the
# same kernel; also rebinds `df`, replacing the raw frame loaded earlier.
df = pd.read_excel(excel_path)

# Path where you want to save the CSV file
csv_path = '/Users/youri/VSC Data/Data Analytics/Fussballdaten/Premierlegue/Premierlegue_Bereinigt.csv'  # Update this to your desired CSV file path

# Write the DataFrame to a CSV file (without the index column)
df.to_csv(csv_path, index=False)