In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/new-parsed-scorecards/SCORECARDS.csv
/kaggle/input/scraped-stats/STATS.csv


# Data preprocessing

## Reading stats & scorecards CSVs

In [26]:
# Both files are read with ';' as the field separator.
stats = pd.read_csv("/kaggle/input/scraped-stats/STATS.csv", sep=";")
scorecards = pd.read_csv("/kaggle/input/new-parsed-scorecards/SCORECARDS.csv", sep=";")

# Last expression of the cell: preview the first rows of the scorecards frame.
scorecards.head()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_total_pts,blue_fighter_total_pts
0,POLIANA BOTELHO,LUANA CAROLINA,01/05/2021,29 28 28,28 29 29
1,KAI KAMAKA,TJ BROWN,01/05/2021,28 28 30,29 29 27
2,MERAB DVALISHVILI,CODY STAMANN,01/05/2021,30 29 29,27 28 28
3,RANDA MARKOS,LUANA PINHEIRO,01/05/2021,- - -,- - -
4,ANDREAS MICHAILIDIS,KB BHULLAR,01/05/2021,29 30 30,28 27 27


In [27]:
# Preview the first rows of the stats frame as the cell output.
stats.head()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,red_fighter_sig_str_body_pct,blue_fighter_sig_str_body_pct,red_fighter_sig_str_leg_pct,blue_fighter_sig_str_leg_pct,red_fighter_sig_str_distance_pct,blue_fighter_sig_str_distance_pct,red_fighter_sig_str_clinch_pct,blue_fighter_sig_str_clinch_pct,red_fighter_sig_str_ground_pct,blue_fighter_sig_str_ground_pct
0,ILIA TOPURIA,MAX HOLLOWAY,26/10/2024,El Matador,Blessed,W,L,KO/TKO,3,1:34,...,14,16,20,24,94,100,0,0,5,0
1,ROBERT WHITTAKER,KHAMZAT CHIMAEV,26/10/2024,The Reaper,Borz,L,W,Submission,1,3:34,...,0,33,100,0,100,0,0,0,0,100
2,MAGOMED ANKALAEV,ALEKSANDAR RAKIC,26/10/2024,-,Rocket,W,L,Decision - Unanimous,3,5:00,...,40,16,23,64,90,94,9,5,0,0
3,LERONE MURPHY,DAN IGE,26/10/2024,The Miracle,50K,W,L,Decision - Unanimous,3,5:00,...,23,10,7,13,71,69,23,13,5,17
4,SHARA MAGOMEDOV,ARMEN PETROSYAN,26/10/2024,Bullet,Superman,W,L,KO/TKO,2,4:52,...,44,12,18,58,96,97,3,2,0,0


## Function that swaps American date format into European date format

In [37]:
def date_month_swap(date):
    """Swap the first two '/'-separated fields of a three-field date string.

    Used to turn a month/day/year string into day/month/year. The fields are
    not validated or reformatted, only reordered, so applying the function
    twice gives back the original string.

    Parameters
    ----------
    date : str
        Date string with exactly three '/'-separated fields.

    Returns
    -------
    str
        The same fields with the first and second exchanged.

    Raises
    ------
    ValueError
        If the string does not split into exactly three fields.
    """
    first, second, year = date.split("/")
    return "/".join((second, first, year))


# NOTE(review): the swap is its own inverse, so running this cell a second time
# puts the fields back in their previous order. The scorecards.head() preview
# in this notebook shows values such as 01/05/2021 - confirm the raw column is
# really month-first before relying on this step.
scorecards['event_date'] = scorecards['event_date'].apply(date_month_swap)

## Function that converts alphabetical month to numerical

In [29]:
# Lookup from English month name to its zero-padded two-digit number.
months = {
    "January": '01', "February": '02', "March": '03',
    "April": '04', "May": '05', "June": '06',
    "July": '07', "August": '08', "September": '09',
    "October": '10', "November": '11', "December": '12'
}

def convert_month_to_numerical(date):
    """Convert a 'Month D, YYYY' date string to 'DD/MM/YYYY'.

    Parameters
    ----------
    date : str
        Either a spelled-out date such as "October 26, 2024", or a string
        that already contains two '/' separators.

    Returns
    -------
    str
        The date as day/month/year with a two-digit day and month. A string
        that already contains two '/' separators is returned unchanged, so
        the conversion can be re-applied to an already-converted column
        without raising.

    Raises
    ------
    ValueError
        If a spelled-out date does not split into exactly three tokens.
    KeyError
        If the month token is not a key of ``months``.
    """
    # Already in slash-separated form: nothing to convert.
    if date.count("/") == 2:
        return date

    month, day, year = date.replace(",", "").split()
    # Pad single-digit days so the result compares equal to zero-padded
    # dd/mm/yyyy strings when used as a merge key.
    return f"{day.zfill(2)}/{months[month]}/{year}"

# Rewrite every stats event date with convert_month_to_numerical, in place.
stats['event_date'] = stats['event_date'].apply(convert_month_to_numerical)

'months = {\n    "January": \'01\', "February": \'02\', "March": \'03\',\n    "April": \'04\', "May": \'05\', "June": \'06\',\n    "July": \'07\', "August": \'08\', "September": \'09\',\n    "October": \'10\', "November": \'11\', "December": \'12\'\n}\n\ndef convert_month_to_numerical(date):\n    month, day, year = date.replace(",", "").split()\n    return f"{day}/{months[month]}/{year}"\n\nstats[\'event_date\'] = stats[\'event_date\'].apply(lambda x: convert_month_to_numerical(x))'

## Function that zero-pads days and months (d/m/yyyy → dd/mm/yyyy)

In [30]:
def month_conversion(date):
    """Zero-pad the day and month of a day/month/year date string.

    All spaces are removed first, so "1 / 5 / 2021" is accepted. Despite the
    name, both the day and the month fields are padded; the year is passed
    through untouched.

    Parameters
    ----------
    date : str
        Date string with three '/'-separated fields (day, month, year).

    Returns
    -------
    str
        The date as "day/month/year" with single-digit day and month
        prefixed by "0".

    Raises
    ------
    ValueError
        If the string does not have exactly three fields, or the day or
        month field is not an integer.
    """
    def _pad(part):
        # Prefix a zero only when the value is below 10 and not already padded.
        needs_zero = int(part) < 10 and not part.startswith("0")
        return f"0{part}" if needs_zero else part

    day, month, year = date.replace(" ", "").split("/")
    return "/".join([_pad(day), _pad(month), year])

# Zero-pad the day and month of every scorecards event date, in place.
scorecards['event_date'] = scorecards['event_date'].apply(month_conversion)

In [33]:
# Spot-check a positional slice of the stats dates after conversion.
stats['event_date'].iloc[1043:1050]

1043    15/10/2022
1044    15/10/2022
1045    15/10/2022
1046    15/10/2022
1047    15/10/2022
1048    15/10/2022
1049    01/10/2022
Name: event_date, dtype: object

In [34]:
# Spot-check the same positional slice of the scorecards dates for comparison.
scorecards['event_date'].iloc[1043:1050]

1043    14/11/2020
1044    14/11/2020
1045    14/11/2020
1046    14/11/2020
1047    14/11/2020
1048    14/11/2020
1049    14/11/2020
Name: event_date, dtype: object

# Merging stats and scorecards

In [38]:
# Left join: every stats row is kept; scorecard columns are filled in only where
# both fighter names and the event date match exactly.
merged_stats_scorecards = stats.merge(
    scorecards,
    how='left',
    on=['red_fighter_name', 'blue_fighter_name', 'event_date'],
)

# Persist the joined table to the working directory, without the index column.
merged_stats_scorecards.to_csv('merged_stats_n_scorecards.csv', index=False)

# Per-column null counts. NOTE(review): the recorded output shows 2120 rows with
# no scorecard values. From here it is not possible to tell whether those fights
# simply have no scorecard row or whether the string keys failed to match (date
# format, name spelling, red/blue order) - TODO confirm.
merged_stats_scorecards.isna().sum()

red_fighter_name                      0
blue_fighter_name                     0
event_date                            0
red_fighter_nickname                  0
blue_fighter_nickname                 0
                                   ... 
blue_fighter_sig_str_clinch_pct       0
red_fighter_sig_str_ground_pct        0
blue_fighter_sig_str_ground_pct       0
red_fighter_total_pts              2120
blue_fighter_total_pts             2120
Length: 61, dtype: int64