#### Goal: Build a model that predicts the winner of UFC fights using this data
- XGBoost
- Target Feature: **`winner`**
- Classification (Winner or Loser)

In [1]:
import pandas as pd

- Not critical right now; this just indicates which columns will need to be dropped from the modeling dataset

In [56]:
# Load the cleaned fight-level dataset (one row per bout, red corner vs. blue corner)
df_clean: pd.DataFrame = pd.read_csv('../data/ufc-clean.csv')

# Keep the full column listing around; later cells filter it by name
columns: list[str] = df_clean.columns.tolist()
columns

['r_fighter',
 'b_fighter',
 'r_odds',
 'b_odds',
 'r_ev',
 'b_ev',
 'date',
 'location',
 'country',
 'winner',
 'title_bout',
 'weight_class',
 'gender',
 'no_of_rounds',
 'b_current_lose_streak',
 'b_current_win_streak',
 'b_draw',
 'b_avg_sig_str_landed',
 'b_avg_sig_str_pct',
 'b_avg_sub_att',
 'b_avg_td_landed',
 'b_avg_td_pct',
 'b_longest_win_streak',
 'b_losses',
 'b_total_rounds_fought',
 'b_total_title_bouts',
 'b_win_by_decision_majority',
 'b_win_by_decision_split',
 'b_win_by_decision_unanimous',
 'b_win_by_ko/tko',
 'b_win_by_submission',
 'b_win_by_tko_doctor_stoppage',
 'b_wins',
 'b_stance',
 'b_height_cms',
 'b_reach_cms',
 'b_weight_lbs',
 'r_current_lose_streak',
 'r_current_win_streak',
 'r_draw',
 'r_avg_sig_str_landed',
 'r_avg_sig_str_pct',
 'r_avg_sub_att',
 'r_avg_td_landed',
 'r_avg_td_pct',
 'r_longest_win_streak',
 'r_losses',
 'r_total_rounds_fought',
 'r_total_title_bouts',
 'r_win_by_decision_majority',
 'r_win_by_decision_split',
 'r_win_by_decision_un

In [None]:
# Groups of columns that are to be left out of the modeling dataset.
# Matching is by substring of the column name.
dif_columns: list[str] = [name for name in columns if 'dif' in name]
betting_columns: list[str] = [
    name for name in columns
    if any(marker in name for marker in ('odds', 'ev'))
]
rank_columns: list[str] = [name for name in columns if 'rank' in name]
info_columns: list[str] = ['date', 'location', 'country']

# Same concatenation order as before: betting, rank, info, then difference columns
drop_columns: list[str] = [
    *betting_columns,
    *rank_columns,
    *info_columns,
    *dif_columns,
]

In [3]:
# Target state: model on every column that survives the drop list.
# Until then, `basic_features` is the feature set actually being used.
# NOTE: `winner` (the target) is not in drop_columns, so it is still part of this list.
features: list[str] = list(
    filter(lambda name: name not in drop_columns, df_clean.columns)
)

In [4]:
# Minimal starter feature set: who fought, fight-level context, and a handful of
# physical attributes listed as red/blue pairs (r_<stat> directly followed by b_<stat>).
basic_features: list[str] = [
    # The two fighters
    'r_fighter',
    'b_fighter',
    # Fight-level information shared by both corners
    'title_bout',
    'weight_class',
    'gender',
    # Per-corner attributes, red first then blue for each stat
    *(
        f'{corner}_{stat}'
        for stat in ('stance', 'reach_cms', 'height_cms', 'weight_lbs', 'age')
        for corner in ('r', 'b')
    ),
]

# The target column ('winner') is intentionally not listed here;
# customize_dataframe appends the target to the selected features itself.

#### Customizing Dataset
- Avoiding the word *transforming* so as not to be confused with sklearn transformers
- Goal is to have each fighter in their own row and to determine whether that fighter won
- Going to expand each row into two rows, one per fighter
- Will change `winner` to be either 0 or 1
- ***Going to start with just `basic_features`***

In [49]:
def customize_dataframe(df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
    """Expand a fight-level frame into a fighter-level frame (two rows per fight).

    Each input row describes one bout through red-corner (``r_*``) and
    blue-corner (``b_*``) columns. The result has one row per fighter: the
    fighter's own values under the un-prefixed name, the opponent's values
    under ``opp_<name>``, and a 0/1 target telling whether that fighter won.
    Rows are interleaved, so the red-corner row of a fight is immediately
    followed by the blue-corner row of the same fight.

    Numeric corner stats are then collapsed into a single difference column,
    ``diff_<name> = <name> - opp_<name>``. Non-numeric corner features (for
    example ``stance``) are kept as the ``<name>`` / ``opp_<name>`` pair.
    ``"Women's "`` is stripped from ``weight_class`` when that column is present.

    Parameters
    ----------
    df : pd.DataFrame
        Fight-level data containing every column in ``features`` plus ``target``.
    *args
        Unused; accepted for backward compatibility.
    **kwargs
        features : list[str], optional
            Columns to carry over. Defaults to the module-level
            ``basic_features``. Columns without an ``r_``/``b_`` prefix are
            treated as fight-level information and copied to both rows.
        target : str, optional
            Name of the outcome column (default ``'winner'``). The value
            ``'Red'`` marks a red-corner win; any other value is counted as a
            blue-corner win.

    Returns
    -------
    pd.DataFrame
        Columns in this order: ``fighter``, ``opp_fighter``, fight-level
        features, non-numeric corner features followed by their ``opp_``
        counterparts, ``diff_`` columns, and finally the target.

    Raises
    ------
    AssertionError
        If the ``r_`` and ``b_`` features do not pair up one-to-one by name.
    KeyError
        If a requested column is missing from ``df``.
    """
    features = kwargs.get('features')
    if features is None:
        # Resolved lazily so a caller that passes `features` never needs the global
        features = basic_features
    target: str = kwargs.get('target', 'winner')

    # De-duplicate (keeping order) and never treat the target as a feature;
    # otherwise the selection below would contain the target column twice
    features = [name for name in dict.fromkeys(features) if name != target]

    # Slice the prefix off instead of using str.replace: names such as
    # 'r_avg_sig_str_landed' or 'b_avg_sub_att' also contain 'r_' / 'b_'
    # in the middle, and replace() would mangle them
    r_roots: list[str] = [name[2:] for name in features if name.startswith('r_')]
    b_roots: list[str] = [name[2:] for name in features if name.startswith('b_')]

    # Check that every red feature has a blue counterpart and vice versa
    assert set(r_roots) == set(b_roots), (
        'every r_ feature needs a matching b_ feature (unpaired: '
        f'{sorted(set(r_roots) ^ set(b_roots))})'
    )
    root_features: list[str] = r_roots

    # Fight-level information relevant to both fighters. The three historical
    # columns keep their fixed order; any other un-prefixed feature follows.
    unprefixed = [name for name in features if not name.startswith(('r_', 'b_'))]
    legacy_info = ('title_bout', 'weight_class', 'gender')
    info_features: list[str] = (
        [name for name in legacy_info if name in unprefixed]
        + [name for name in unprefixed if name not in legacy_info]
    )

    fights = df.loc[:, features + [target]].reset_index(drop=True)

    # NOTE: anything other than 'Red' (including draws / missing values)
    # is scored as a blue-corner win, matching the previous behaviour
    red_won = fights[target].eq('Red').astype('int64')

    # Build the red-perspective and blue-perspective frames in one shot each.
    # Red rows get even positions and blue rows odd positions, so sorting the
    # index interleaves them fight by fight with red first.
    corner_frames = []
    perspectives = (('r', 'b', red_won), ('b', 'r', 1 - red_won))
    for offset, (own, other, outcome) in enumerate(perspectives):
        renames = {
            **{f'{own}_{root}': root for root in root_features},
            **{f'{other}_{root}': f'opp_{root}' for root in root_features},
        }
        view = fights.drop(columns=[target]).rename(columns=renames)
        view[target] = outcome.to_numpy()
        view.index = pd.RangeIndex(offset, 2 * len(fights), 2)
        corner_frames.append(view)

    clean_df: pd.DataFrame = (pd
                              .concat(corner_frames, sort=False)
                              .sort_index()
                              .reset_index(drop=True)
                             )

    # Remove "Women's " from weight_class so both genders share class names;
    # non-string entries (e.g. missing values) are passed through untouched
    if 'weight_class' in clean_df.columns:
        clean_df['weight_class'] = clean_df['weight_class'].map(
            lambda s: s.replace("Women's ", "") if isinstance(s, str) else s
        )

    # Simple feature engineering: reduce numeric stats to the difference
    # Example: diff_reach_cms = reach_cms - opp_reach_cms
    kept_roots: list[str] = []
    diffs: dict[str, pd.Series] = {}
    for root in root_features:
        own_values = clean_df[root]
        opp_values = clean_df[f'opp_{root}']
        subtractable = all(
            pd.api.types.is_numeric_dtype(values)
            and not pd.api.types.is_bool_dtype(values)
            for values in (own_values, opp_values)
        )
        if root not in ('fighter', 'stance') and subtractable:
            diffs[f'diff_{root}'] = own_values - opp_values
        else:
            kept_roots.append(root)

    clean_df = clean_df.assign(**diffs)

    # Aesthetics: fighter names first, target last
    first_cols: list[str] = [
        name for name in ('fighter', 'opp_fighter') if name in clean_df.columns
    ]
    middle_cols: list[str] = (
        info_features
        + kept_roots
        + [f'opp_{root}' for root in kept_roots]
        + list(diffs)
    )
    feature_order: list[str] = (
        first_cols
        + [name for name in middle_cols if name not in first_cols]
        + [target]
    )

    return clean_df.loc[:, feature_order]
    
    

In [50]:
# Reshape the fight-level data into the fighter-level modeling dataset
df_model: pd.DataFrame = customize_dataframe(df_clean)

# Sanity checks on the reshape: every fight contributes a red row and a blue row,
# the target is binary, and exactly one of the two rows is marked as the winner
assert len(df_model) == 2 * len(df_clean), 'expected exactly two rows per fight'
assert df_model['winner'].isin([0, 1]).all(), 'winner should be encoded as 0/1'
assert df_model['winner'].sum() == len(df_clean), 'expected exactly one winner per fight'

In [52]:
# Persist the fighter-level dataset; index=False leaves the row index out of the CSV
df_model.to_csv('../data/model_dataset.csv', index=False)

In [51]:
# Column-by-column overview (non-null counts and dtypes) of the reshaped dataset
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9792 entries, 0 to 9791
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   fighter          9792 non-null   object 
 1   opp_fighter      9792 non-null   object 
 2   title_bout       9792 non-null   uint8  
 3   weight_class     9792 non-null   object 
 4   gender           9792 non-null   object 
 5   stance           9790 non-null   object 
 6   opp_stance       9790 non-null   object 
 7   diff_reach_cms   9792 non-null   float64
 8   diff_height_cms  9792 non-null   float64
 9   diff_weight_lbs  9792 non-null   int64  
 10  diff_age         9792 non-null   int64  
 11  winner           9792 non-null   uint8  
dtypes: float64(2), int64(2), object(6), uint8(2)
memory usage: 784.3+ KB


- As one can see, this is a much more understandable dataset from a modeling perspective
- In the next notebook, I will be transforming this dataset into an optimal form for XGBoost

In [44]:
# Peek at ten random fighter rows; the fixed seed keeps the displayed sample
# identical across Restart & Run All
df_model.sample(10, random_state=42)

Unnamed: 0,fighter,opp_fighter,title_bout,weight_class,gender,stance,opp_stance,diff_reach_cms,diff_height_cms,diff_weight_lbs,diff_age,winner
2719,Nina Ansaroff,Claudia Gadelha,0,Strawweight,FEMALE,Orthodox,Orthodox,2.54,2.54,0,3,1
6043,Urijah Faber,Frankie Edgar,0,Featherweight,MALE,Orthodox,Orthodox,-2.54,0.0,-10,3,0
1552,Jussier Formiga,Joseph Benavidez,0,Flyweight,MALE,Orthodox,Southpaw,5.08,2.54,0,0,0
3720,Zabit Magomedsharipov,Sheymon Moraes,0,Featherweight,MALE,Orthodox,Orthodox,2.54,12.7,0,-1,1
5819,Dominique Steele,Zak Cummings,0,Middleweight,MALE,Orthodox,Southpaw,-2.54,-5.08,-15,-3,0
2787,Sergei Pavlovich,Alistair Overeem,0,Heavyweight,MALE,Southpaw,Orthodox,10.16,-2.54,-8,-12,0
8041,Renee Forte,Terry Etim,0,Lightweight,MALE,Orthodox,Orthodox,-5.08,-15.24,0,-2,1
4855,Charles Oliveira,Anthony Pettis,0,Featherweight,MALE,Orthodox,Orthodox,5.08,0.0,-15,-3,0
1366,Alonzo Menifield,Devin Clark,0,Light Heavyweight,MALE,Orthodox,Orthodox,2.54,0.0,0,2,0
3932,Luis Henrique,Arjan Bhullar,0,Heavyweight,MALE,Orthodox,Orthodox,-2.54,2.54,-2,-7,0


In [55]:
# Number of fighter rows per weight class
df_model['weight_class'].value_counts()

9792

In [54]:
# Number of fighter rows per stance (value_counts leaves out missing values by default)
df_model['stance'].value_counts()

9790

In [40]:
# Per-fighter summary: how many rows (fights) each fighter has and the mean of
# the 0/1 winner flag. Note that 'win %' is therefore a fraction in [0, 1].
(
    df_model
    .groupby('fighter')['winner']
    .agg(**{'num-fights': 'count', 'win %': 'mean'})
    .sort_values('num-fights', ascending=False)
    .round(2)
)

Unnamed: 0_level_0,num-fights,win %
fighter,Unnamed: 1_level_1,Unnamed: 2_level_1
Donald Cerrone,36,0.64
Jim Miller,30,0.53
Charles Oliveira,27,0.70
Edson Barboza,26,0.62
Rafael Dos Anjos,26,0.65
...,...,...
Charlie Valencia,1,0.00
Nate Quarry,1,0.00
Chase Gormley,1,0.00
Nandor Guelmino,1,0.00
