In [44]:
import pandas as pd

def format_files(file_folder, match_file, stats_file):
    """Join a competition's match list to its per-match scoring statistics.

    Two CSV files are read from ``file_folder``: a match file with one row
    per game (identified by its ``url`` column) and a statistics file in long
    format with the columns ``url``, ``Attribute``, ``Home Result`` and
    ``Away Result``. The statistics ``url`` is the match ``url`` followed by
    ``/match-statistics/0``; stripping that suffix gives the shared ``key``.

    For each scoring attribute the home/away values are pivoted into their
    own pair of columns (``Home_<suffix>`` / ``Away_<suffix>``) and
    inner-joined onto the matches, so a game missing any one of the
    attributes is dropped from the result. The number of games before and
    after the joins is printed.

    Parameters
    ----------
    file_folder : str
        Folder prefix. It is concatenated directly with the file names, so it
        must end with a path separator.
    match_file : str
        File name of the match list CSV.
    stats_file : str
        File name of the match statistics CSV.

    Returns
    -------
    pandas.DataFrame
        The match columns (with ``url`` renamed to ``key``) followed by the
        home/away columns for tries, conversions, penalty goals and dropped
        goals, plus the attempts for each of the three goal types.
    """
    # Statistic name in the 'Attribute' column -> suffix of the output columns.
    # The order here is the order of the joins and of the resulting columns.
    stat_suffixes = {
        'Tries': 'tries',
        'Conversion Goals': 'conv',
        'Conversion Goal Attempts': 'conv_att',
        'Penalty Goals': 'pg',
        'Penalty Goal Attempts': 'pg_att',
        'Dropped Goals': 'dg',
        'Dropped Goal Attempts': 'dg_att',
    }

    all_matches = pd.read_csv(file_folder + match_file)
    all_stats = pd.read_csv(file_folder + stats_file)

    print('starting games: ',len(all_matches.url))

    # Rename url column to key
    all_matches = all_matches.rename(columns={"url": "key"})

    # Create key column to join on
    all_stats['key'] = all_stats['url'].str.replace('/match-statistics/0', '', regex=False)

    # Drop the index column left behind by a previous to_csv, when present
    all_matches = all_matches.drop(columns=['Unnamed: 0'], errors='ignore')
    all_stats = all_stats.drop(columns=['Unnamed: 0'], errors='ignore')

    # Join one attribute at a time - keep this as Inner Join.
    # NOTE: more than one stats row per (key, attribute) would multiply rows.
    merge_df = all_matches
    for attribute, suffix in stat_suffixes.items():
        attribute_stats = (
            all_stats[all_stats['Attribute'] == attribute]
            .drop(columns=['url', 'Attribute'])
            .rename(columns={"Home Result": f"Home_{suffix}",
                             "Away Result": f"Away_{suffix}"})
        )
        merge_df = pd.merge(merge_df, attribute_stats, on='key', how='inner')

    print('final games: ',len(merge_df.key))

    return merge_df

In [45]:
# Folder holding the scraped flashscore CSV files. The trailing slash matters:
# format_files concatenates the folder and the file name directly.
set_folder = 'C:/Users/killi/KF_Repo/PGA_Golf/Python_Scripts/Rugby_Union/1_Scrape_Data/Data/'

# Build one formatted frame per competition; each call prints the number of
# games before and after the statistics are joined.
internationals_formatted = format_files(
    file_folder=set_folder,
    match_file='flashscore_internationals_2024.csv',
    stats_file='flashscore_internationals_2024_all_stats.csv',
)
six_nations_formatted = format_files(
    file_folder=set_folder,
    match_file='flashscore_six_nations_2024.csv',
    stats_file='flashscore_six_nations_2024_all_stats.csv',
)
rugby_championship_formatted = format_files(
    file_folder=set_folder,
    match_file='flashscore_rugby_championship_2024.csv',
    stats_file='flashscore_rugby_championship_2024_all_stats.csv',
)
pacifc_nations_formatted = format_files(
    file_folder=set_folder,
    match_file='flashscore_pacific_nations_2024.csv',
    stats_file='flashscore_pacific_nations_2024_all_stats.csv',
)
world_cup_formatted = format_files(
    file_folder=set_folder,
    match_file='flashscore_world_cup_2023.csv',
    stats_file='flashscore_world_cup_2023_all_stats.csv',
)



starting games:  84
final games:  25
starting games:  15
final games:  15
starting games:  12
final games:  12
starting games:  11
final games:  0
starting games:  54
final games:  48


In [57]:
# Find the world cup games that are missing from the formatted data

file = 'flashscore_world_cup_2023.csv'
# Read from set_folder, the data folder defined at notebook level.
# `file_folder` is only a parameter of format_files, so using it here raises
# NameError on a fresh kernel.
orig_file = pd.read_csv(set_folder+file)
# Rename url column to key
orig_file = orig_file.rename(columns={"url": "key"})

# These files are from the world cup qualification so can be ignored
world_cup_missing = orig_file[~orig_file['key'].isin(world_cup_formatted['key'])]

In [69]:
# Create list of games that we need to gather statistics from

file = 'flashscore_pacific_nations_2024.csv'
# Read from set_folder, the data folder defined at notebook level.
# `file_folder` is only a parameter of format_files, so using it here raises
# NameError on a fresh kernel.
orig_file = pd.read_csv(set_folder+file)
# Rename url column to key
orig_file = orig_file.rename(columns={"url": "key"})

# No Pacific Nations game survived the statistics join (11 starting games,
# 0 final games), so every game in the file still needs statistics
pacific_missing = orig_file

# Find international missing

file = 'flashscore_internationals_2024.csv'
orig_file = pd.read_csv(set_folder+file)
# Rename url column to key
orig_file = orig_file.rename(columns={"url": "key"})

# Keep the internationals that are absent from the formatted data
international_missing = orig_file[~orig_file['key'].isin(internationals_formatted['key'])]

# Combine missing games
all_missing = pd.concat([pacific_missing,international_missing],axis=0, ignore_index=True )

# Save as csv
all_missing.to_csv('all_missing.csv',index=False)


In [79]:
def convert_date(date_str, year):
    """Turn a 'day.month.<rest>' string into a Timestamp in the given year.

    Only the first two dot-separated fields of ``date_str`` are used (day,
    then month); anything after the second dot is ignored.
    """
    fields = date_str.split('.')
    day, month = fields[0], fields[1]
    # Prepend the year so the string parses as year.day.month
    return pd.to_datetime(f'{year}.{day}.{month}', format='%Y.%d.%m')


# Add a full datetime column (date_2) to every competition that has
# statistics; the year is passed to convert_date as a keyword argument
world_cup_formatted['date_2'] = world_cup_formatted['date'].apply(convert_date, year=2023)
internationals_formatted['date_2'] = internationals_formatted['date'].apply(convert_date, year=2024)
six_nations_formatted['date_2'] = six_nations_formatted['date'].apply(convert_date, year=2024)
rugby_championship_formatted['date_2'] = rugby_championship_formatted['date'].apply(convert_date, year=2024)

# Stack the competitions for which we do have statistics
frames_with_statistics = [
    internationals_formatted,
    six_nations_formatted,
    rugby_championship_formatted,
    world_cup_formatted,
]
all_have_statistics = pd.concat(frames_with_statistics)

print(len(all_have_statistics))

# Write the combined data to csv
all_have_statistics.to_csv('all_have_statistics.csv',index=False)


100
