In [49]:
import csv
import os
from datetime import date
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [50]:
# Output file for processed data
output_file = 'redacted_file.csv'
# Columns to keep in the new DataFrame
columns_to_keep = ['Date', 'HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A', 'LBH', 'LBD', 'LBA', 'FTR', 'FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC']

# Folder path for CSV files
data_folder_path = r'C:\Users\marku\Documents\GitHub\Odds\Odds\data'


def process_csv_files(data_folder_path, output_file, columns_to_keep):

    with open(output_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns_to_keep)  # Write column names at the top
        
        for filename in os.listdir(data_folder_path):
            if filename.endswith(".csv"):
                file_path = os.path.join(data_folder_path, filename)
                
                with open(file_path, 'r', encoding='utf-8', errors='replace') as csv_file:
                    csv_reader = csv.reader(csv_file)
                    header = next(csv_reader)  # Get the header
                    
                    # Identify the indices of the columns to keep
                    column_indices = [header.index(col) if col in header else None for col in columns_to_keep]
                    
                    for row in csv_reader:
                        # Handle missing values and replace invalid characters
                        for idx, value in enumerate(row):
                            if value == '�':
                                row[idx] = ''
  
                        # Select only the desired columns using identified indices
                        selected_row = [row[idx] if idx is not None else '' for idx in column_indices]
                        
                        # Handle missing values
                        for col_bet365, col_ladbrokes in zip(['B365H', 'B365D', 'B365A'], ['LBH', 'LBD', 'LBA']):
                            idx_bet365 = columns_to_keep.index(col_bet365)
                            idx_ladbrokes = columns_to_keep.index(col_ladbrokes)
                            
                            if selected_row[idx_bet365] == '':
                                selected_row[idx_bet365] = selected_row[idx_ladbrokes]
                       
                        # Append the selected row to the output file
                        writer.writerow(selected_row)

    print(f"Processed data saved to '{output_file}'.")

# Call the function to process CSV files
process_csv_files(data_folder_path, output_file, columns_to_keep)

df = pd.read_csv(output_file)

def process_csv_files2(df):
    # Remove LBH, LBD and LBA columns (not needed)
    df = df.drop(['LBH', 'LBD', 'LBA'], axis=1)

    # Create a column showing shot accuracy (shots on target / total shots, as percentage)
    df['HomeShotsAcc'] = (df['HST'] / df['HS']).astype(float).map("{:.2%}".format)

    # Shot accuracy for away team (as percentage)
    df['AwayShotsAcc'] = (df['AST'] / df['AS']).astype(float).map("{:.2%}".format)

    # Remove rows with missing values
    df = df.dropna()

    return df

df = process_csv_files2(df)

Processed data saved to 'redacted_file.csv'.


In [51]:
# Compute simple aggregates on the odds, goal difference, goal totals, shots, and shots on target for each team in the Premier League
def stats1 (df):
    print(df[['B365H']].describe())
    print('')
    print(df[['B365D']].describe())
    print('')
    print(df[['B365A']].describe())
    print('')
    return df
    
#stats1(df)

# Compute stats for each team in the Premier League
def stats2(df):
    # Group by 'HomeTeam' and calculate statistics
    home_team_stats = df.groupby('HomeTeam')[['B365H', 'B365D', 'B365A']].describe()
    
    # Group by 'AwayTeam' and calculate statistics
    away_team_stats = df.groupby('AwayTeam')[['B365H', 'B365D', 'B365A']].describe()
    
    # Return the stats
    df = home_team_stats, away_team_stats
    return df

#stats2(df)

# Compute rolling averages on performance metrics for each team in the Premier League over the past 5 matches
def stats3(df):
    # Compute rolling averages for home team performance metrics (shots, shots on target, goals, goal difference)
    df['HomeShotsRolling'] = df.groupby('HomeTeam', group_keys=False)['HS'].apply(lambda x: x.rolling(center=False, window=5).mean())
    df['HomeShotsOnTargetRolling'] = df.groupby('HomeTeam', group_keys=False)['HST'].apply(lambda x: x.rolling(center=False, window=5).mean())
    df['HomeGoalsRolling'] = df.groupby('HomeTeam', group_keys=False)['FTHG'].apply(lambda x: x.rolling(center=False, window=5).mean())
    
    # Compute rolling averages for away team performance metrics (shots, shots on target, goals, goal difference)
    df['AwayShotsRolling'] = df.groupby('AwayTeam', group_keys=False)['AS'].apply(lambda x: x.rolling(center=False, window=5).mean())
    df['AwayShotsOnTargetRolling'] = df.groupby('AwayTeam', group_keys=False)['AST'].apply(lambda x: x.rolling(center=False, window=5).mean())
    df['AwayGoalsRolling'] = df.groupby('AwayTeam', group_keys=False)['FTAG'].apply(lambda x: x.rolling(center=False, window=5).mean())
    
    # Drop missing values from Rolling columns
    df = df.dropna(subset=['HomeShotsRolling', 'HomeShotsOnTargetRolling', 'HomeGoalsRolling', 'AwayShotsRolling', 'AwayShotsOnTargetRolling', 'AwayGoalsRolling'])

    return df

df = stats3(df)

In [52]:
def format_dtypes(df):
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
    min_date = df['Date'].min()
    df['Date'] = (df['Date'] - min_date).dt.days
    df['Date'] = df['Date'].astype('float64')
    
    # Convert HomeTeam and AwayTeam to category codes
    df['HomeTeamCode'] = df['HomeTeam'].astype('category').cat.codes
    df['AwayTeamCode'] = df['AwayTeam'].astype('category').cat.codes
    team_mapping = dict(enumerate(df['HomeTeam'].astype('category').cat.categories))
    
    # Convert FTR to category codes
    df['FTR'] = df['FTR'].astype('category').cat.codes

    # Convert numeric columns
    numerical_columns = ['B365H', 'B365D', 'B365A', 'FTHG', 'FTAG',
                         'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HomeShotsAcc',
                         'AwayShotsAcc', 'HomeShotsRolling', 'HomeShotsOnTargetRolling', 'HomeGoalsRolling',
                         'AwayShotsRolling', 'AwayShotsOnTargetRolling',
                         'AwayGoalsRolling']

    for col in numerical_columns:
        if df[col].dtype == 'object':
            # Handle columns with percentage signs
            if 'Acc' in col or 'Rate' in col:
                df[col] = df[col].str.replace('%', '').astype('float').fillna(0) / 100.0
            else:
                try:
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                except ValueError:
                    print(f"Error converting column: {col}")
                    pass
        else:
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            except ValueError:
                print(f"Error converting column: {col}")
                pass
        
    # Convert HomeTeamCode and AwayTeamCode to float64 and FTR to float64
    df['HomeTeamCode'] = df['HomeTeamCode'].astype('float64')
    df['AwayTeamCode'] = df['AwayTeamCode'].astype('float64')
    df['FTR'] = df['FTR'].astype('float64')

    # Remove HomeTeam and AwayTeam columns
    df = df.drop(['HomeTeam', 'AwayTeam'], axis=1)
    
    return df, team_mapping

# Call the format_dtypes function to get the modified DataFrame and team_mapping dictionary
df, team_mapping = format_dtypes(df)

df.to_csv('test.csv', index=False)

# Print the team_mapping dictionary
print("Team Mapping:")
for code, team in team_mapping.items():
    print(f"Code: {code}, Team: {team}")

# Print FTR encodings
print("FTR encodings: ", df.FTR.unique())

# Print their mappings to original values
print("FTR mappings: ", df.FTR.map({0: 'A', 1: 'D', 2: 'H'}).unique())

# Save as a csv file called team_mapping.csv
team_mapping_df = pd.DataFrame.from_dict(team_mapping, orient='index', columns=['Team'])
team_mapping_df.index.name = 'Code'
team_mapping_df.reset_index(inplace=True)
team_mapping_df.to_csv('team_mapping.csv', index=False)

Team Mapping:
Code: 0, Team: Arsenal
Code: 1, Team: Aston Villa
Code: 2, Team: Birmingham
Code: 3, Team: Blackburn
Code: 4, Team: Blackpool
Code: 5, Team: Bolton
Code: 6, Team: Bournemouth
Code: 7, Team: Brentford
Code: 8, Team: Brighton
Code: 9, Team: Burnley
Code: 10, Team: Cardiff
Code: 11, Team: Charlton
Code: 12, Team: Chelsea
Code: 13, Team: Crystal Palace
Code: 14, Team: Derby
Code: 15, Team: Everton
Code: 16, Team: Fulham
Code: 17, Team: Huddersfield
Code: 18, Team: Hull
Code: 19, Team: Leeds
Code: 20, Team: Leicester
Code: 21, Team: Liverpool
Code: 22, Team: Man City
Code: 23, Team: Man United
Code: 24, Team: Middlesbrough
Code: 25, Team: Newcastle
Code: 26, Team: Norwich
Code: 27, Team: Nott'm Forest
Code: 28, Team: Portsmouth
Code: 29, Team: QPR
Code: 30, Team: Reading
Code: 31, Team: Sheffield United
Code: 32, Team: Southampton
Code: 33, Team: Stoke
Code: 34, Team: Sunderland
Code: 35, Team: Swansea
Code: 36, Team: Tottenham
Code: 37, Team: Watford
Code: 38, Team: West Brom

In [53]:
def correlation_matrix(df):
    # Exclude columns that don't make sense for correlation matrix
    excluded_columns = ['HomeTeamCode', 'AwayTeamCode']
    
    # Select only numeric columns for correlation matrix
    numeric_columns = df.select_dtypes(include=['number']).columns
    valid_columns = [col for col in numeric_columns if col not in excluded_columns]
    correlation_matrix = df[valid_columns].corr()
    
    # Plot the correlation heatmap
    plt.figure(figsize=(20, 15))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5)
    plt.title("Correlation Heatmap")
    plt.show()
    
    # Create a list of coefficient pairs and sort in descending order
    correlations = correlation_matrix.unstack().sort_values(ascending=False)
    
    # Filter out pairs where both variables are the same or have a correlation of 1
    filtered_correlations = correlations[
        (correlations.index.get_level_values(0) != correlations.index.get_level_values(1)) &
        (correlations != 1.0)
    ]
    
    # Print the sorted list of coefficient pairs
    print("Top Correlations:")
    print(filtered_correlations)

# correlation_matrix(df)

def pairplot2(df):
    sns.set(style="white")
    sns.pairplot(df, diag_kind="kde")
    plt.show()

# pairplot2(df)

In [54]:
def data_split(df):
    # Features
    X = df.drop(columns=['FTR', 'FTHG', 'FTAG'])

    # Target variable (label 1: Home Win, label 0: Draw or Away Win)
    y = df['FTR'].apply(lambda x: 1.0 if x == 2.0 else 0.0)

    # Splitting the data into train, test and validation sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

    return X_train, X_test, X_val, y_train, y_test, y_val

X_train, X_test, X_val, y_train, y_test, y_val = data_split(df)

In [None]:
def RandomForest(X_train, X_val, y_train, y_val):
    """Random Forest Classifier with 100 estimators and feature importance analysis"""
    # Initialize Random Forest Classifier
    clf = RandomForestClassifier()

    # Fit the model
    clf.fit(X_train, y_train)

    # Predict the results
    y_pred = clf.predict(X_val)

    # Print the accuracy
    print("Accuracy:", accuracy_score(y_val, y_pred))

    # Print classification report
    print(classification_report(y_val, y_pred))

    return clf

RandomForest(X_train, X_val, y_train, y_val)

In [None]:
# Feature importance analysis
feature_importance = RandomForest(X_train, X_val, y_train, y_val)
feature_importance = pd.DataFrame(clf.feature_importances_, index=X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

In [None]:
def LinearSVM(X_train, X_val, y_train, y_val):
    """Linear SVM classifier"""
    # Create a classifier
    svm = LinearSVC()

    # Train the classifier
    svm.fit(X_train, y_train)

    # Predict the outcome of a game
    prediction = svm.predict(X_val)

    # Print the accuracy score
    print(accuracy_score(y_val, prediction))

    # Print the classification report
    print(classification_report(y_val, prediction))

    # Compute confusion matrix
    cm = confusion_matrix(y_val, prediction)
    plt.figure(figsize=(10,7))
    sns.heatmap(cm, annot=True, fmt='d')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

LinearSVM(X_train, X_val, y_train, y_val)

In [None]:
def SVC_classifier(X_train, y_train, X_val, y_val):
    """SVC classifier with linear kernel and C=0.01"""
    # Create SVM classifier
    svm_clf = SVC()

    # Fit classifier to training set
    svm_clf.fit(X_train, y_train)

    # Predict FTR for test set
    y_pred = svm_clf.predict(X_val)

    # Evaluate performance of classifier
    print('Accuracy:', accuracy_score(y_val, y_pred))

    # classification report
    print(classification_report(y_val, y_pred))

SVC_classifier(X_train, y_train, X_val, y_val)

In [None]:
def GB_classifier(X_train, X_val, y_train, y_val):
    """Gradient Boosting Classifier with log loss function, 20000 estimators, learning rate of 0.1, and max depth of 1"""
    # Create a Gradient Boosting Classifier
    GB = GradientBoostingClassifier()

    # Fit the model
    GB.fit(X_train, y_train)

    # Predict the model
    y_pred = GB.predict(X_val)

    # Print the accuracy score
    print("Accuracy score:", accuracy_score(y_val, y_pred))

    # Print classification report
    print("Classification Report: \n {}\n".format(classification_report(y_val, y_pred)))
    
    return GB

GB_classifier(X_train, X_val, y_train, y_val)

In [None]:
def logistic_regression(X_train, X_val, y_train, y_val):
    """Logistic Regression with 1000 iterations"""
    # Create a logistic regression classifier
    log_reg = LogisticRegression()

    # Fit the model
    log_reg.fit(X_train, y_train)

    # Predict the model
    y_pred = log_reg.predict(X_val)

    # Print the accuracy score
    print("Accuracy score:", accuracy_score(y_val, y_pred))

    # Print classification report
    print("Classification Report: \n {}\n".format(classification_report(y_val, y_pred)))

logistic_regression(X_train, X_val, y_train, y_val)

In [55]:
param_grid_rf = {
    'random_state': [0],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'max_depth': [None, 1, 2, 3, 4, 5],
    'criterion': ['gini', 'log_loss']
}

param_grid_linSVM = {
    'random_state': [0],
    'max_iter': [1000, 2000, 5000, 10000, 20000, 50000, 100000],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2']
}

param_grid_SVC = {
    'random_state': [0],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'max_iter': [1000, 2000, 5000, 10000, 20000, 50000]
}

param_grid_GB = {
    'random_state': [0],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'learning_rate': [0.01, 0.1, 1, 10, 100],
    'loss': ['log_loss'],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

param_grid_log_reg = {
    'random_state': [0],
    'max_iter': [1000, 2000, 5000, 10000, 20000, 50000],
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100]
}

# Combine all parameter grids into a dictionary
param_grids = {
    RandomForestClassifier: param_grid_rf,
    LinearSVC: param_grid_linSVM,
    SVC: param_grid_SVC,
    GradientBoostingClassifier: param_grid_GB,
    LogisticRegression: param_grid_log_reg
}

# Dictionary of model functions
models = {
    RandomForestClassifier,
    LinearSVC,
    SVC,
    GradientBoostingClassifier,
    LogisticRegression
}

def grid_search(models, param_grids, X_train, y_train):
    """GridSearchCV to find the best parameters and models"""
    for model_class in models:
        model_name = model_class.__name__
        print(f"Model: {model_name}")
        param_grid = param_grids[model_class]
        model = model_class()  # Instantiate the model class
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best score: {grid_search.best_score_}")
        print('')

grid_search(models, param_grids, X_train, y_train)

Model: LogisticRegression


300 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\marku\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\marku\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\marku\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

----------------------

Best parameters: {'C': 10, 'max_iter': 2000, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs'}
Best score: 0.7558252427184465

Model: RandomForestClassifier
Best parameters: {'criterion': 'gini', 'max_depth': None, 'n_estimators': 200, 'random_state': 0}
Best score: 0.737378640776699

Model: SVC




Best parameters: {'C': 1000, 'kernel': 'rbf', 'max_iter': 50000, 'random_state': 0}
Best score: 0.693042071197411

Model: LinearSVC


245 fits failed out of a total of 490.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
245 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\marku\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\marku\anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "c:\Users\marku\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1223, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "c:\Users\marku\anaconda3\lib\site-packages\sklearn\svm\_base.py", line

Best parameters: {'C': 0.001, 'max_iter': 100000, 'penalty': 'l2', 'random_state': 0}
Best score: 0.7323624595469255

Model: GradientBoostingClassifier
Best parameters: {'learning_rate': 0.1, 'loss': 'log_loss', 'max_features': 'sqrt', 'n_estimators': 200, 'random_state': 0}
Best score: 0.7529126213592232

