In [14]:
import pandas as pd

# Load your dataset
data_path = '2020.csv'  # Update with your actual data path
df = pd.read_csv(data_path)

# Inspect the first few rows of the dataset
df.head()


Unnamed: 0,home_team,away_team,home_team_code,away_team_code,home_score,away_score,home_penalty,away_penalty,home_score_total,away_score_total,...,penalties_missed,penalties,red_cards,game_referees,stadium_city,stadium_name,stadium_name_media,stadium_name_official,stadium_name_event,stadium_name_sponsor
0,Italy,England,ITA,ENG,1.0,1.0,3.0,2.0,1.0,1.0,...,,"[{'phase': 'PENALTY', 'time': {}, 'internation...",,"[{'name': 'Myrsini Psarropoulou', 'role': 'UEF...",London,Wembley Stadium,Wembley Stadium,Wembley Stadium,Wembley Stadium,Wembley Stadium
1,England,Denmark,ENG,DEN,1.0,1.0,,,2.0,1.0,...,"[{'phase': 'EXTRA_TIME_FIRST_HALF', 'time': {'...",,,"[{'name': 'Hessel Steegstra', 'role': 'ASSISTA...",London,Wembley Stadium,Wembley Stadium,Wembley Stadium,Wembley Stadium,Wembley Stadium
2,Italy,Spain,ITA,ESP,1.0,1.0,4.0,2.0,1.0,1.0,...,,"[{'phase': 'PENALTY', 'time': {}, 'internation...",,"[{'name': 'Christian Dingert', 'role': 'ASSIST...",London,Wembley Stadium,Wembley Stadium,Wembley Stadium,Wembley Stadium,Wembley Stadium
3,Ukraine,England,UKR,ENG,0.0,4.0,,,0.0,4.0,...,,,,"[{'name': 'Mark Borsch', 'role': 'ASSISTANT_RE...",Rome,Olimpico in Rome,Stadio Olimpico,Stadio Olimpico,Olimpico in Rome,Stadio Olimpico
4,Czechia,Denmark,CZE,DEN,1.0,2.0,,,1.0,2.0,...,,,,"[{'name': 'Massimiliano Irrati', 'role': 'ASSI...",Baku,Baku Olympic Stadium,Baku Olympic Stadium,Baku Olympic Stadium,Baku Olympic Stadium,Baku Olympic Stadium


In [15]:
# Function to inspect and remove non-numeric values from specified columns
def inspect_and_clean_non_numeric(df, columns):
    for column in columns:
        non_numeric_values = df[pd.to_numeric(df[column], errors='coerce').isna()][column].unique()
        if len(non_numeric_values) > 0:
            print(f"Non-numeric values found in column {column}: {non_numeric_values}")
            # Convert to numeric and remove rows with NaN
            df[column] = pd.to_numeric(df[column], errors='coerce')
    # Drop rows with NaN values
    df.dropna(inplace=True)
    return df

# Columns to inspect and clean
numeric_columns = ['stadium_capacity', 'year', 'stadium_longitude', 'stadium_latitude', 'home_score', 'away_score']

# Clean the dataframe
df = inspect_and_clean_non_numeric(df, numeric_columns)

# Verify the dataframe after cleaning
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 46 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   home_team              0 non-null      object 
 1   away_team              0 non-null      object 
 2   home_team_code         0 non-null      object 
 3   away_team_code         0 non-null      object 
 4   home_score             0 non-null      float64
 5   away_score             0 non-null      float64
 6   home_penalty           0 non-null      float64
 7   away_penalty           0 non-null      float64
 8   home_score_total       0 non-null      float64
 9   away_score_total       0 non-null      float64
 10  winner                 0 non-null      object 
 11  winner_reason          0 non-null      object 
 12  year                   0 non-null      int64  
 13  date                   0 non-null      object 
 14  date_time              0 non-null      object 
 15  utc_offset_hours       

In [17]:
# Select features and target
features = ['stadium_longitude', 'stadium_latitude', 'stadium_capacity', 'year', 'home_team', 'away_team', 'condition_weather', 'stadium_name']
target = 'winner'

# Prepare the dataset
X = df[features]
y = df[target]

# Convert categorical features to numerical values
X = pd.get_dummies(X, columns=['home_team', 'away_team', 'condition_weather', 'stadium_name'], drop_first=True)

# Normalize 'stadium_capacity' and 'year'
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[['stadium_capacity', 'year']] = scaler.fit_transform(X[['stadium_capacity', 'year']])

# Verify the feature preparation
X.head()


ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by StandardScaler.

In [11]:
# Split data into train and test sets
split_index = int(0.7 * len(df))
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Verify the split
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Training set size: 35
Test set size: 16


In [12]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Initialize the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Define the parameter grid for GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

# Fit model
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")


TypeError: '<' not supported between instances of 'str' and 'float'

In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Plot results of GridSearchCV
results = grid_search.cv_results_
results_df = pd.DataFrame(results)

# Plot heatmap of parameter importance
sns.heatmap(results_df.corr(), annot=True, cmap='coolwarm')
plt.title('Parameter Correlation Heatmap')
plt.show()

# Optional: Plot feature importances
importances = best_model.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 8))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices], align='center')
plt.xticks(range(X.shape[1]), feature_names[indices], rotation=90)
plt.xlim([-1, X.shape[1]])
plt.show()
