In [57]:
import numpy as np
import pandas as pd
import sqlite3
from scipy.stats import chi2_contingency
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression



# Open the SQLite file that holds every table read below. The connection is
# intentionally left open: other cells of the notebook may still query it.
cnx = sqlite3.connect('database.sqlite')

# Every table is loaded with an explicit column list. The long SELECT
# statements are split into adjacent string literals, which Python joins
# into one query string.
player_data = pd.read_sql(
    "SELECT id, player_api_id, player_name, birthday, height, weight FROM Player",
    cnx,
)
player_stats_data = pd.read_sql(
    "SELECT id, player_api_id, date, overall_rating, potential, preferred_foot, "
    "attacking_work_rate, defensive_work_rate, crossing, finishing, heading_accuracy, "
    "short_passing, volleys, dribbling, curve, free_kick_accuracy, long_passing, "
    "ball_control, acceleration, sprint_speed, agility, reactions, shot_power, "
    "jumping, stamina, strength, long_shots, aggression, interceptions, positioning, "
    "vision, penalties, marking, standing_tackle, sliding_tackle, gk_diving, "
    "gk_handling, gk_kicking, gk_positioning, gk_reflexes "
    "FROM Player_Attributes",
    cnx,
)
match_data = pd.read_sql(
    "SELECT id, country_id, league_id, season, stage, date, match_api_id, "
    "home_team_api_id, away_team_api_id, home_team_goal, away_team_goal, "
    "home_player_1, home_player_2, home_player_3, home_player_4, home_player_5, "
    "home_player_6, home_player_7, home_player_8, home_player_9, home_player_10, "
    "home_player_11, "
    "away_player_1, away_player_2, away_player_3, away_player_4, away_player_5, "
    "away_player_6, away_player_7, away_player_8, away_player_9, away_player_10, "
    "away_player_11, "
    "goal, shoton, shotoff, foulcommit, card, cross, corner, possession, "
    "B365H, B365D, B365A "
    "FROM Match",
    cnx,
)
league_data = pd.read_sql("SELECT id, country_id, name FROM League", cnx)
country_data = pd.read_sql("SELECT id, name FROM Country", cnx)
team_data = pd.read_sql(
    "SELECT id, team_api_id, team_long_name, team_short_name FROM Team",
    cnx,
)
# The original query selected chanceCreationShooting twice, which yields two
# identically named columns and never loads the matching *Class column.
# NOTE(review): assumes Team_Attributes has a chanceCreationShootingClass
# column like the other "<metric>Class" pairs — verify against the schema.
team_attributes_data = pd.read_sql(
    "SELECT id, team_api_id, date, "
    "buildUpPlaySpeed, buildUpPlaySpeedClass, "
    "buildUpPlayDribbling, buildUpPlayDribblingClass, "
    "buildUpPlayPassing, buildUpPlayPassingClass, "
    "buildUpPlayPositioningClass, "
    "chanceCreationPassing, chanceCreationPassingClass, "
    "chanceCreationCrossing, chanceCreationCrossingClass, "
    "chanceCreationShooting, chanceCreationShootingClass, "
    "chanceCreationPositioningClass, "
    "defencePressure, defencePressureClass, "
    "defenceAggression, defenceAggressionClass, "
    "defenceTeamWidth, defenceTeamWidthClass, "
    "defenceDefenderLineClass "
    "FROM Team_Attributes",
    cnx,
)


def null_counts(dataframe):
    """Print the percentage of null entries for every column of ``dataframe``.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        None. The per-column percentages are printed as a pandas Series.
    """
    missing_per_column = dataframe.isna().sum()
    # Same arithmetic as "count / number of rows * 100", written with methods.
    share_in_percent = missing_per_column.div(len(dataframe)).mul(100)
    print(share_in_percent)


# Print the per-column null percentages of every loaded table, in load order.
for loaded_frame in (
    player_data,
    player_stats_data,
    match_data,
    league_data,
    country_data,
    team_data,
    team_attributes_data,
):
    null_counts(loaded_frame)

id               0.0
player_api_id    0.0
player_name      0.0
birthday         0.0
height           0.0
weight           0.0
dtype: float64
id                     0.000000
player_api_id          0.000000
date                   0.000000
overall_rating         0.454402
potential              0.454402
preferred_foot         0.454402
attacking_work_rate    1.755645
defensive_work_rate    0.454402
crossing               0.454402
finishing              0.454402
heading_accuracy       0.454402
short_passing          0.454402
volleys                1.474633
dribbling              0.454402
curve                  1.474633
free_kick_accuracy     0.454402
long_passing           0.454402
ball_control           0.454402
acceleration           0.454402
sprint_speed           0.454402
agility                1.474633
reactions              0.454402
shot_power             0.454402
jumping                1.474633
stamina                0.454402
strength               0.454402
long_shots             0.45

# Datenaufbereitung
In diesem Code-Abschnitt werden die Daten bereinigt und für das maschinelle Lernen aufbereitet. Dazu werden die im vorherigen Abschnitt gewonnenen Erkenntnisse zu den Anteilen leerer Werte je Spalte genutzt, um Spalten mit mangelnder Aussagekraft vollständig zu entfernen (`drop()`). Anschließend werden die verbliebenen leeren Werte in allen Zeilen behandelt.

## Spielerratings
Die Tabelle Player_Attributes (DataFrame `player_stats_data`) enthält Spielerratings für alle Spieler zu verschiedenen Zeitpunkten. Dabei haben manche Spieler mehrere Ratings pro Jahr, andere nur ein Rating pro Jahr, wieder andere nur ein Rating in mehreren Jahren. Um diese unterschiedlichen Voraussetzungen für alle Spieler auf ein Niveau zu bringen, werden mehrere unterjährige Ratings zu einem einzelnen gemittelt. Im Folgenden wird dann für jeden Spieler das Rating aus dem benötigten Jahr verwendet. Gibt es kein Rating aus diesem Jahr, wird das zeitlich nächstliegende Rating herangezogen.

In [58]:
# Remove the listed event columns from the match table and the
# buildUpPlayDribbling column from the team attributes.
# Both frames are reassigned to their own name, so without errors="ignore" a
# second run of this cell would raise KeyError because the columns are already
# gone. The trade-off is that a misspelled column name is silently skipped.
match_data = match_data.drop(
    columns=["goal", "shoton", "shotoff", "foulcommit", "card", "cross", "corner", "possession"],
    errors="ignore",
)
team_attributes_data = team_attributes_data.drop(columns="buildUpPlayDribbling", errors="ignore")

# Matches without any missing value are kept under a separate name; match_data
# itself still contains the rows with nulls.
# NOTE(review): df_dropped is not referenced again in the visible part of the notebook.
df_dropped = match_data.dropna()
# Player ratings with any missing attribute are discarded entirely.
player_stats_data = player_stats_data.dropna()



In [60]:
# Names of the 22 lineup columns of a match: home_player_1..11 followed by
# away_player_1..11 (same order as the hand-written list this replaces).
player_cols = [f'{side}_player_{slot}' for side in ('home', 'away') for slot in range(1, 12)]

# Stack all lineup columns on top of each other into one long Series of
# player ids (pd.concat on Series appends them along the index).
combined_players = pd.concat([match_data[col] for col in player_cols])

# Keep every distinct value once.
unique_players = combined_players.drop_duplicates()
# Positional access to the first value: the index labels are inherited from
# match_data, so a label lookup with [0] only works while a label 0 survives.
print(unique_players.iloc[0])

# True where the id from the lineups exists in the Player table
mask = unique_players.isin(player_data['player_api_id'])

# Mark every lineup value without a matching player with the sentinel -1
unique_players.loc[~mask] = -1

# Index labels (match rows) at which an unmatched value was found
indices = unique_players.index[unique_players == -1]

# Number of occurrences of each distinct value, including the sentinel
counts = unique_players.value_counts()

# Number of unmatched values and where they are. .get() yields 0 instead of
# raising KeyError when every lineup id has a matching player.
print(counts.get(-1, 0))
print(indices)


nan
1
Int64Index([0], dtype='int64')


Hier muss noch die Zusammenführung der Daten in ein DataFrame `df` geschehen.

correlation

In [None]:
# Correlation matrix over the numeric (and boolean) columns only; text columns
# cannot be correlated and make DataFrame.corr fail on recent pandas versions.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# Keep only the strict upper triangle: this removes the diagonal (every column
# correlates perfectly with itself) and the mirrored duplicate of each pair.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# stack() turns the matrix into a Series indexed by (column, column) pairs.
pairwise_corr = corr_matrix.where(upper_triangle).stack()

# Filter on the stacked Series itself. Indexing the Series with the boolean
# DataFrame `abs(corr_matrix) > 0.9` is not supported and raises TypeError.
# The comparison is False for NaN, so masked-out cells never pass the filter.
high_corr_pairs = pairwise_corr[pairwise_corr.abs() > 0.9]

# Move the two index levels into regular columns
high_corr_pairs = high_corr_pairs.reset_index()

# Rename the columns for readability
high_corr_pairs.columns = ['Column 1', 'Column 2', 'Correlation']

print(high_corr_pairs)

Chi Squared

In [None]:
# Collect one record per ordered column pair and build the result frame once
# at the end. Appending to a DataFrame inside the loop copies the frame on
# every iteration, and DataFrame.append no longer exists in pandas 2.x.
chi2_records = []

# Outer loop: the column every other column is compared against
for comparison_col in df.columns:

    # Inner loop: the second column of the pair (its own name, so the outer
    # loop variable is not overwritten)
    for other_col in df.columns:

        # A column is not tested against itself
        if other_col == comparison_col:
            continue

        # Contingency table of the two columns
        contingency_table = pd.crosstab(df[comparison_col], df[other_col])

        # Chi-squared test of independence on that table
        chi2, pval, dof, expected = chi2_contingency(contingency_table)
        chi2_records.append({
            'Column 1': comparison_col,
            'Column 2': other_col,
            'Chi-Squared': chi2,
            'P-Value': pval,
        })

# Passing the column list keeps the expected columns even if no pair was tested.
# The original printed an undefined name (results_df); the frame is now built
# and printed under the one name chi2_results_df.
chi2_results_df = pd.DataFrame(chi2_records, columns=['Column 1', 'Column 2', 'Chi-Squared', 'P-Value'])

print(chi2_results_df)


TODO: Was machen wir mit dem Ergebnis

# Wrapper-Methode
Sucht die beste Untergruppe von Features für ein Modell, indem eine modellbasierte Bewertung der Feature-Untergruppen durchgeführt wird.

TODO: die Zielvariable whoWon muss noch definiert werden

In [None]:
# Split the frame into the outcome column and the remaining predictor columns
target = df['whoWon']
features = df.drop(columns='whoWon')

# Recursive feature elimination driven by a linear regression model,
# configured to keep 10 features
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=10)

# Fit the selector on the data
rfe.fit(features, target)

# Show the names of the columns the selector kept
selected_features = features.columns[rfe.support_]
print(selected_features)

# Random Forest

In [None]:
# Split the frame into the outcome column and the remaining predictor columns
target = df['whoWon']
features = df.drop(columns='whoWon')

# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Random forest with 100 trees, trained on the training split
# (fit() returns the estimator itself, so construction and training are chained)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(features_train, target_train)

# Predict the held-out rows and report the share of correct predictions
target_pred = rf.predict(features_test)
accuracy = accuracy_score(target_test, target_pred)
print(f"Genauigkeit: {accuracy}")

# Logistische Regression

In [None]:
# Split the frame into the outcome column and the remaining predictor columns
target = df['whoWon']
features = df.drop(columns='whoWon')

# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with default settings, trained on the training split
# (fit() returns the estimator itself, so construction and training are chained)
lr = LogisticRegression().fit(features_train, target_train)

# Predict the held-out rows and report the share of correct predictions
target_pred = lr.predict(features_test)
accuracy = accuracy_score(target_test, target_pred)
print(f"Genauigkeit: {accuracy}")