#### Goal: Build a model that predicts the winner of UFC fights using this data
- XGBoost
- Target Feature: **`winner`**
- Classification (Winner or Loser)

In [1]:
import pandas as pd

- Not critical right now; this only identifies which columns will need to be dropped from the modeling dataset

In [2]:
# Load the cleaned, fight-level dataset (path is relative to this notebook).
df_clean: pd.DataFrame = pd.read_csv('../data/ufc-clean.csv')

# Every column name in the cleaned data; the cells below filter this list
# to decide which columns must be dropped before modeling.
columns: list[str] = list(df_clean.columns)

#### If the goal is to predict fight outcomes, no column containing information from the fight itself can be used
- Only information known before the fight occurs may be included
- Weight, reach, height, etc. --> *Good*
- Number of rounds, strikes landed, type of KO --> *Bad*

In [3]:
# Groups of columns that must not reach the model.
# NOTE(review): these are plain substring matches, so 'ev' and 'dif' also catch
# any column that merely contains those letters -- confirm against the full
# column list that nothing useful is removed by accident.
dif_columns: list[str] = [column for column in columns if 'dif' in column]
betting_columns: list[str] = [column for column in columns if 'odds' in column or 'ev' in column]
rank_columns: list[str] = [column for column in columns if 'rank' in column]
info_columns: list[str] = ['date', 'location', 'country']

# Combined list of every column excluded from the modeling dataset.
drop_columns: list[str] = betting_columns + rank_columns + info_columns + dif_columns

In [4]:
# Long-term goal: model on ALL columns that survive the drop list.
# Until then, `basic_features` (defined in the next cell) is the feature set actually used.
features: list[str] = [
    column for column in df_clean.columns
    if column not in drop_columns
]

In [5]:
# Hand-picked starting feature set. Corner-specific columns come in
# r_/b_ (red/blue) pairs; the remaining columns describe the fight itself.
# The target column ('winner') is deliberately NOT listed here: it is passed
# separately to `customize_dataframe` via its `target` keyword.
basic_features: list[str] = [
    # Fighter names
    'r_fighter',
    'b_fighter',

    # Fight-level info shared by both fighters
    'title_bout',
    'weight_class',
    'gender',

    # Physical attributes
    'r_stance',
    'b_stance',
    'r_reach_cms',
    'b_reach_cms',
    'r_height_cms',
    'b_height_cms',
    'r_weight_lbs',
    'b_weight_lbs',
    'r_age',
    'b_age',

    # Record
    'r_wins',
    'b_wins',
    'r_losses',
    'b_losses',

    # Average actions
    'r_avg_sig_str_landed',
    'b_avg_sig_str_landed',
    'r_avg_sig_str_pct',
    'b_avg_sig_str_pct',
    'r_avg_sub_att',
    'b_avg_sub_att',
    'r_avg_td_landed',
    'b_avg_td_landed',
    'r_avg_td_pct',
    'b_avg_td_pct',
]

#### Customizing Dataset
- Avoiding the word *transforming* so as not to be confused with sklearn transformers
- Goal is to give each fighter their own row and predict whether that fighter wins
- Each fight row is expanded into two rows, one per fighter
- `winner` becomes 1 on the row of the fighter who won and 0 on the row of the fighter who lost

In [18]:
def customize_dataframe(df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
    """Expand fight-level rows into fighter-level rows for modeling.

    Each input row (one fight) yields two output rows: the red corner's view
    followed by the blue corner's view. In a view, the fighter's own ``r_``/``b_``
    columns lose their prefix and the opponent's columns get an ``opp_`` prefix.
    Rows containing any null are then dropped, a few columns are tidied, and
    every paired statistic is collapsed into ``diff_<stat> = <stat> - opp_<stat>``.

    Positional ``*args`` are accepted but not used.

    Keyword Args:
        features: Columns of ``df`` to carry over. Defaults to the notebook-level
            ``basic_features``. ``r_``/``b_`` columns must come in matching pairs.
            Of the non-prefixed columns only ``title_bout``, ``weight_class`` and
            ``gender`` are kept (when present in ``features``).
        target: Name of the outcome column (default ``'winner'``). A value of
            ``'Red'`` labels the red row 1 and the blue row 0; any other value
            labels the blue row 1.
            NOTE(review): draws / no-contests / missing outcomes, if the data has
            any, are therefore counted as blue wins -- confirm the label set.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Columns: ``fighter`` and
        ``opp_fighter`` first (when present), then the remaining columns, then
        the target as a 0/1 integer.

    Raises:
        AssertionError: If the ``r_`` and ``b_`` features do not pair up by name.
        KeyError: If a requested feature or the target is missing from ``df``.
    """
    # Resolve the default lazily so an explicit `features` never touches the global.
    features = kwargs.get('features')
    if features is None:
        features = basic_features
    features: list[str] = list(features)
    target: str = kwargs.get('target', 'winner')

    # Positional 0..n-1 index keeps the two corner views aligned row-for-row.
    fights = df.loc[:, features + [target]].reset_index(drop=True)

    # Fight-level info shared by both fighters (only those actually requested).
    info_features: list[str] = [
        feature for feature in ('title_bout', 'weight_class', 'gender')
        if feature in features
    ]

    # Corner-specific features, reduced to their un-prefixed root names.
    r_features: list[str] = [feature for feature in features if feature.startswith('r_')]
    b_features: list[str] = [feature for feature in features if feature.startswith('b_')]
    root_features: list[str] = [feature[2:] for feature in r_features]

    # Every red feature needs its blue twin (and vice versa), matched by name.
    assert sorted(root_features) == sorted(feature[2:] for feature in b_features), (
        "r_/b_ features must come in matching pairs"
    )

    column_order: list[str] = (
        info_features
        + [target]
        + root_features
        + [f'opp_{root}' for root in root_features]
    )

    red_won = fights[target] == 'Red'

    def _corner_view(own: str, opp: str, won: pd.Series) -> pd.DataFrame:
        """Return all fights seen from one corner: own_* -> *, opp_* -> opp_*."""
        view = fights.rename(columns={
            **{f'{own}_{root}': root for root in root_features},
            **{f'{opp}_{root}': f'opp_{root}' for root in root_features},
        })
        view[target] = won.astype('int64')
        return view.loc[:, column_order]

    # Stack all red views on top of all blue views, then interleave so that each
    # fight contributes its red row immediately followed by its blue row.
    n_fights = len(fights)
    stacked = pd.concat(
        [_corner_view('r', 'b', red_won), _corner_view('b', 'r', ~red_won)],
        ignore_index=True,
    )
    interleaved = [position for i in range(n_fights) for position in (i, i + n_fights)]

    clean_df: pd.DataFrame = (
        stacked
        .iloc[interleaved]
        # A handful of rows have missing values (e.g. stance); drop them outright.
        .dropna()
        .reset_index(drop=True)
    )

    if 'gender' in clean_df.columns:
        # Only two gender classes, so replace the column with a binary flag.
        clean_df['male'] = (clean_df['gender'] == 'MALE').astype('int64')
        clean_df = clean_df.drop(columns='gender')

    if 'weight_class' in clean_df.columns:
        # Remove "Women's " so both genders share the same weight-class labels.
        clean_df['weight_class'] = clean_df['weight_class'].map(
            lambda name: name.replace("Women's ", "")
        )

    # Trim stray whitespace from text values; non-string values pass through untouched.
    for text_column in clean_df.select_dtypes(exclude=['number', 'bool']).columns:
        clean_df[text_column] = clean_df[text_column].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # Simple feature engineering: reduce each paired stat to a single difference.
    # Computed here (rather than using the source '*_dif' columns) so the sign
    # always means "this row's fighter minus the opponent".
    # Example: diff_reach_cms = reach_cms - opp_reach_cms
    opposing_stats: list[str] = [
        root for root in root_features if root not in ('fighter', 'stance')
    ]
    differences = {
        f'diff_{stat}': clean_df[stat] - clean_df[f'opp_{stat}']
        for stat in opposing_stats
    }
    clean_df = (
        clean_df
        .drop(columns=opposing_stats + [f'opp_{stat}' for stat in opposing_stats])
        .assign(**differences)
    )

    # Aesthetics: names first, target last.
    first_cols: list[str] = [
        column for column in ('fighter', 'opp_fighter') if column in clean_df.columns
    ]
    middle_cols: list[str] = [
        column for column in clean_df.columns
        if column not in first_cols and column != target
    ]
    return clean_df.loc[:, first_cols + middle_cols + [target]]
    
    

In [19]:
# Build the modeling dataset using the function's default feature list and target.
df_model: pd.DataFrame = customize_dataframe(df_clean)

In [20]:
# Save the modeling dataset to disk; index=False leaves the row index out of the file.
df_model.to_csv('../data/model_dataset.csv', index=False)

In [21]:
# Sanity check: column names, dtypes and non-null counts of the modeling dataset.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9788 entries, 0 to 9787
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   fighter                  9788 non-null   object 
 1   opp_fighter              9788 non-null   object 
 2   title_bout               9788 non-null   bool   
 3   weight_class             9788 non-null   object 
 4   stance                   9788 non-null   object 
 5   opp_stance               9788 non-null   object 
 6   male                     9788 non-null   int64  
 7   diff_reach_cms           9788 non-null   float64
 8   diff_height_cms          9788 non-null   float64
 9   diff_weight_lbs          9788 non-null   int64  
 10  diff_age                 9788 non-null   int64  
 11  diff_wins                9788 non-null   int64  
 12  diff_losses              9788 non-null   int64  
 13  diff_avg_sig_str_landed  9788 non-null   float64
 14  diff_avg_sig_str_pct    

- As one can see, this is a much more understandable dataset from a modeling perspective
- In the next notebook, I will be transforming this dataset into an optimal form for XGBoost

In [22]:
# Peek at a few random rows; the fixed seed keeps the preview identical on every re-run.
df_model.sample(10, random_state=42)

Unnamed: 0,fighter,opp_fighter,title_bout,weight_class,stance,opp_stance,male,diff_reach_cms,diff_height_cms,diff_weight_lbs,diff_age,diff_wins,diff_losses,diff_avg_sig_str_landed,diff_avg_sig_str_pct,diff_avg_sub_att,diff_avg_td_landed,diff_avg_td_pct,winner
1964,Sara McMann,Lina Lansberg,False,Bantamweight,Orthodox,Orthodox,0,2.54,-2.54,0,2,5,2,0.0,-0.13,0.3,3.84,0.31,1
7412,Takeya Mizugaki,Nam Phan,False,Bantamweight,Orthodox,Orthodox,1,-5.08,2.54,0,-1,3,-2,-34.214286,0.056429,-0.02381,1.690476,0.390476,1
2881,Jonathan Martinez,Andre Soukhamthath,False,Bantamweight,Southpaw,Orthodox,1,0.0,-2.54,0,2,-1,-3,-33.5,-0.4,0.0,-1.0,-0.1425,0
8794,Jacob Volkmann,Efrain Escudero,False,Lightweight,Southpaw,Orthodox,1,2.54,0.0,0,6,1,0,-11.166667,0.009333,1.433333,1.066667,0.131,1
6905,Nate Marquardt,James Te Huna,False,Middleweight,Orthodox,Orthodox,1,-2.54,-5.08,0,3,5,3,-4.25,-0.030625,0.6875,0.3125,0.311875,1
4183,Misha Cirkunov,Volkan Oezdemir,False,Light Heavyweight,Orthodox,Orthodox,1,5.08,2.54,0,3,3,0,-55.0,0.105,0.75,2.25,0.56,0
200,Julio Arce,Andre Ewell,False,Bantamweight,Southpaw,Southpaw,1,-12.7,-2.54,10,-2,0,-1,-0.04,-0.05,0.4,0.25,-0.36,1
4745,Joachim Christensen,Henrique da Silva,False,Light Heavyweight,Orthodox,Orthodox,1,0.0,0.0,0,10,-1,0,-81.0,-0.59,0.0,0.0,0.0,0
852,Alex Morono,Rhys McKee,False,Welterweight,Orthodox,Orthodox,1,-15.24,-7.62,0,5,6,2,4.42,0.39,0.6,0.0,0.0,1
6383,Antonio Dos Santos,Daniel Sarafian,False,Middleweight,Orthodox,Orthodox,1,-2.54,2.54,0,-6,-1,-3,-17.5,-0.62,-0.5,-0.25,-0.25,0


In [11]:
# Number of fighter-rows per weight class.
df_model['weight_class'].value_counts()

Lightweight          1710
Welterweight         1660
Bantamweight         1298
Middleweight         1186
Featherweight        1166
Light Heavyweight     788
Heavyweight           768
Flyweight             740
Strawweight           406
Catch Weight           66
Name: weight_class, dtype: int64

In [12]:
# Whitespace check: if stray spaces had survived, the same stance would show up
# as several distinct entries (the raw data originally contained 'Switch ').
df_model['stance'].value_counts()

Orthodox       7391
Southpaw       1970
Switch          422
Open Stance       5
Name: stance, dtype: int64

In [13]:
# Per-fighter summary: number of fights and share of them won, busiest fighters first.
(
    df_model
    .groupby('fighter')['winner']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'num-fights', 'mean': 'win %'})
    .sort_values(by='num-fights', ascending=False)
    .round(2)
)

Unnamed: 0_level_0,num-fights,win %
fighter,Unnamed: 1_level_1,Unnamed: 2_level_1
Donald Cerrone,36,0.64
Jim Miller,30,0.53
Charles Oliveira,27,0.70
Rafael Dos Anjos,26,0.65
Demian Maia,26,0.62
...,...,...
Jimmy Flick,1,1.00
Jimmy Quinlan,1,0.00
Sako Chivitchian,1,0.00
Saidyokub Kakhramonov,1,1.00
