In [1]:
#import libraries
import pandas as pd

In [3]:
#import the data
ufc_data = pd.read_csv('UFC_Fighter_Statistics.csv')

In [5]:
#count null values
print(ufc_data.isnull().sum())

name                                               0
nickname                                        1854
wins                                               0
losses                                             0
draws                                              0
height_cm                                        298
weight_in_kg                                      87
reach_in_cm                                     1927
stance                                           823
date_of_birth                                   1135
significant_strikes_landed_per_minute              0
significant_striking_accuracy                      0
significant_strikes_absorbed_per_minute            0
significant_strike_defence                         0
average_takedowns_landed_per_15_minutes            0
takedown_accuracy                                  0
takedown_defense                                   0
average_submissions_attempted_per_15_minutes       0
dtype: int64


In [7]:
ufc_data.shape

(4111, 18)

In [9]:
# Fill missing values in reach_in_cm with corresponding height_cm values
ufc_data = ufc_data.assign(reach_in_cm=ufc_data['reach_in_cm'].fillna(ufc_data['height_cm']))

In [11]:
print(ufc_data.isnull().sum())

name                                               0
nickname                                        1854
wins                                               0
losses                                             0
draws                                              0
height_cm                                        298
weight_in_kg                                      87
reach_in_cm                                      297
stance                                           823
date_of_birth                                   1135
significant_strikes_landed_per_minute              0
significant_striking_accuracy                      0
significant_strikes_absorbed_per_minute            0
significant_strike_defence                         0
average_takedowns_landed_per_15_minutes            0
takedown_accuracy                                  0
takedown_defense                                   0
average_submissions_attempted_per_15_minutes       0
dtype: int64


In [13]:
# Drop the nickname column
ufc_data = ufc_data.drop(columns=['nickname'])

In [15]:
print(ufc_data.isnull().sum())

name                                               0
wins                                               0
losses                                             0
draws                                              0
height_cm                                        298
weight_in_kg                                      87
reach_in_cm                                      297
stance                                           823
date_of_birth                                   1135
significant_strikes_landed_per_minute              0
significant_striking_accuracy                      0
significant_strikes_absorbed_per_minute            0
significant_strike_defence                         0
average_takedowns_landed_per_15_minutes            0
takedown_accuracy                                  0
takedown_defense                                   0
average_submissions_attempted_per_15_minutes       0
dtype: int64


In [17]:
# Convert date_of_birth to datetime format
ufc_data['date_of_birth'] = pd.to_datetime(ufc_data['date_of_birth'], errors='coerce')

# Calculate age
today = pd.to_datetime("today")
ufc_data['age'] = (today - ufc_data['date_of_birth']).dt.days // 365

# Drop the original date_of_birth column
ufc_data = ufc_data.drop(columns=['date_of_birth'])

In [26]:
# Fill missing values in age with the average age (rounded to nearest integer)
ufc_data['age'] = ufc_data['age'].fillna(ufc_data['age'].mean()).astype(int)

In [28]:
ufc_data.head

<bound method NDFrame.head of                    name  wins  losses  draws  height_cm  weight_in_kg  \
0       Robert Drysdale     7       0      0     190.50         92.99   
1     Daniel McWilliams    15      37      0     185.42         83.91   
2            Dan Molina    13       9      0     177.80         97.98   
3             Paul Ruiz     7       4      0     167.64         61.23   
4       Collin Huckbody     8       2      0     190.50         83.91   
...                 ...   ...     ...    ...        ...           ...   
4106    John Campetella     0       1      0     175.26        106.59   
4107  Andre Pederneiras     1       1      2     172.72         70.31   
4108      Bryson Kamaka    12      20      1     180.34         77.11   
4109        Matej Penaz     6       1      0     190.50         83.91   
4110     Pauline Macias     4       1      0     162.56         52.16   

      reach_in_cm    stance  significant_strikes_landed_per_minute  \
0          190.50  Orth

In [30]:
print(ufc_data.isnull().sum())

name                                              0
wins                                              0
losses                                            0
draws                                             0
height_cm                                       298
weight_in_kg                                     87
reach_in_cm                                     297
stance                                          823
significant_strikes_landed_per_minute             0
significant_striking_accuracy                     0
significant_strikes_absorbed_per_minute           0
significant_strike_defence                        0
average_takedowns_landed_per_15_minutes           0
takedown_accuracy                                 0
takedown_defense                                  0
average_submissions_attempted_per_15_minutes      0
age                                               0
dtype: int64


In [34]:
# Fill missing values in stance with 'Orthodox' because 80% of all fighters are orthodox
ufc_data['stance'] = ufc_data['stance'].fillna('Orthodox')

In [36]:
print(ufc_data.isnull().sum())

name                                              0
wins                                              0
losses                                            0
draws                                             0
height_cm                                       298
weight_in_kg                                     87
reach_in_cm                                     297
stance                                            0
significant_strikes_landed_per_minute             0
significant_striking_accuracy                     0
significant_strikes_absorbed_per_minute           0
significant_strike_defence                        0
average_takedowns_landed_per_15_minutes           0
takedown_accuracy                                 0
takedown_defense                                  0
average_submissions_attempted_per_15_minutes      0
age                                               0
dtype: int64


In [40]:
# Fill missing values in numeric columns with their respective column means
ufc_data = ufc_data.fillna(ufc_data.mean(numeric_only=True))

In [42]:
print(ufc_data.isnull().sum())

name                                            0
wins                                            0
losses                                          0
draws                                           0
height_cm                                       0
weight_in_kg                                    0
reach_in_cm                                     0
stance                                          0
significant_strikes_landed_per_minute           0
significant_striking_accuracy                   0
significant_strikes_absorbed_per_minute         0
significant_strike_defence                      0
average_takedowns_landed_per_15_minutes         0
takedown_accuracy                               0
takedown_defense                                0
average_submissions_attempted_per_15_minutes    0
age                                             0
dtype: int64


In [44]:
ufc_data.head

<bound method NDFrame.head of                    name  wins  losses  draws  height_cm  weight_in_kg  \
0       Robert Drysdale     7       0      0     190.50         92.99   
1     Daniel McWilliams    15      37      0     185.42         83.91   
2            Dan Molina    13       9      0     177.80         97.98   
3             Paul Ruiz     7       4      0     167.64         61.23   
4       Collin Huckbody     8       2      0     190.50         83.91   
...                 ...   ...     ...    ...        ...           ...   
4106    John Campetella     0       1      0     175.26        106.59   
4107  Andre Pederneiras     1       1      2     172.72         70.31   
4108      Bryson Kamaka    12      20      1     180.34         77.11   
4109        Matej Penaz     6       1      0     190.50         83.91   
4110     Pauline Macias     4       1      0     162.56         52.16   

      reach_in_cm    stance  significant_strikes_landed_per_minute  \
0          190.50  Orth

In [46]:
ufc_data.to_csv('ufc_data_cleaned.csv', index=False)

In [48]:
git init


SyntaxError: invalid syntax (3277417328.py, line 1)