In [35]:
import pandas as pd
import requests
import io
import time
import os # Import the os module for path manipulation
from datetime import datetime
import glob



def download_historical_results(divisions=['E0', 'E1', 'E2', 'E3'], start_year=1993, end_year=2025):
    """
    Downloads historical English football results and saves them to a local folder.
    
    Args:
        divisions (list): A list of division codes (e.g., 'E0' for Premier League).
        start_year (int): The starting year for the data.
        end_year (int): The current year, for which data will be downloaded up to.
    """
        
    # Generate a list of seasons to scrape
    seasons = [f'{str(year)[2:]}{str(year+1)[2:]}' for year in range(start_year, end_year + 1)]
    
    print(f"Downloading data for seasons from {seasons[0]} to {seasons[-1]}...")
    
    for division in divisions:
        for season in seasons:
            url = f'https://www.football-data.co.uk/mmz4281/{season}/{division}.csv'
            
            try:
                response = requests.get(url)
                response.raise_for_status()
                df = pd.read_csv(io.StringIO(response.text),usecols=[   'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR'])
                df['Season'] = season
                df['Division'] = division
                df.dropna(subset=['Date'], inplace=True)
                df.to_csv(f"data/{division}/{division}_{season}.csv", index=False)

                print(f'Successfully downloaded {division} data for season {season}.')
                time.sleep(1)
            
            except requests.exceptions.RequestException as e:
                print(f'Error downloading {division} data for season {season}: {e}')
                continue
                

# Example usage:
if __name__ == '__main__':
    download_historical_results(start_year=1993, end_year=2025)

Downloading data for seasons from 9394 to 2526...
Successfully downloaded E0 data for season 9394.
Successfully downloaded E0 data for season 9495.
Successfully downloaded E0 data for season 9596.
Successfully downloaded E0 data for season 9697.
Successfully downloaded E0 data for season 9798.
Successfully downloaded E0 data for season 9899.
Successfully downloaded E0 data for season 9900.
Successfully downloaded E0 data for season 0001.
Successfully downloaded E0 data for season 0102.
Successfully downloaded E0 data for season 0203.
Successfully downloaded E0 data for season 0304.
Successfully downloaded E0 data for season 0405.
Successfully downloaded E0 data for season 0506.
Successfully downloaded E0 data for season 0607.
Successfully downloaded E0 data for season 0708.
Successfully downloaded E0 data for season 0809.
Successfully downloaded E0 data for season 0910.
Successfully downloaded E0 data for season 1011.
Successfully downloaded E0 data for season 1112.
Successfully downlo

In [36]:
def read_in_data_from_folder(divisions=['E0', 'E1', 'E2', 'E3'],start_year=1993, end_year=2025, columns_to_keep = ['Division','Season','Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'filename'], print=False):
    """
    Read in all csv files from a folder and concatonate it into one pandas dataframe.
    """

    pd_base = pd.DataFrame()

    seasons = [f'{str(year)[2:]}{str(year+1)[2:]}' for year in range(start_year, end_year + 1)]


    pd_base = pd.DataFrame()

    for Division in divisions:
        all_files = glob.glob(os.path.join(f"data/{Division}" , "*.csv"))
        for filename in all_files:
            df = pd.read_csv(filename)
            df['filename'] = filename # add filename path as column
            pd_base = pd.concat([pd_base, df], ignore_index=True)
            
    # Keep only some columns
    # If columns_to_keep is not empty, keep only those columns. If empty, keep all columns:
    # if columns_to_keep:
    #     pd_base = pd_base[columns_to_keep]

    # If there is a 'Date' column then convert it to datetime with format '%d/%m/%Y':
    if 'Date' in pd_base.columns:
        date1 = pd.to_datetime(pd_base['Date'], errors='coerce', format='%d/%m/%Y')
        date2 = pd.to_datetime(pd_base['Date'], errors='coerce', format='%d/%m/%y')
        pd_base['Date'] = date1.fillna(date2)
    
    pd_base = pd_base.sort_values(by=['Date'],ascending=True)

    pd_base.to_csv('data/all_data.csv', index=False)
    return pd_base

read_in_data_from_folder()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,Season,Division,filename
0,1993-08-14,Arsenal,Coventry,0.0,3.0,A,9394,E0,data/E0/E0_9394.csv
47699,1993-08-14,Preston,Crewe,0.0,2.0,A,9394,E3,data/E3/E3_9394.csv
47698,1993-08-14,Mansfield,Shrewsbury,1.0,0.0,H,9394,E3,data/E3/E3_9394.csv
47697,1993-08-14,Hereford,Scarborough,0.0,1.0,A,9394,E3,data/E3/E3_9394.csv
47696,1993-08-14,Gillingham,Chesterfield,0.0,2.0,A,9394,E3,data/E3/E3_9394.csv
...,...,...,...,...,...,...,...,...,...
30079,2025-09-13,Preston,Middlesbrough,2.0,2.0,D,2526,E1,data/E1/E1_2526.csv
30080,2025-09-13,Coventry,Norwich,1.0,1.0,D,2526,E1,data/E1/E1_2526.csv
30087,2025-09-14,Southampton,Portsmouth,0.0,0.0,D,2526,E1,data/E1/E1_2526.csv
12363,2025-09-14,Man City,Man United,3.0,0.0,H,2526,E0,data/E0/E0_2526.csv
