#### Goal: Build a model to predict the winner of UFC fights using this data
- XGBoost
- Target Feature: **`winner`**
- Classification (Winner or Loser)

In [1]:
import pandas as pd

- Not critical right now; this cell just identifies which columns will need to be dropped from the modeling dataset

In [2]:
# Load the cleaned UFC fight data from disk.
df_clean: pd.DataFrame = pd.read_csv('../data/ufc-clean.csv')

columns: list[str] = df_clean.columns.tolist()

# Columns are grouped purely by substring match on their names.
# A column that matches more than one group simply appears more than once in
# `drop_columns`, which is harmless for membership tests.
dif_columns: list[str] = list(filter(lambda name: 'dif' in name, columns))
betting_columns: list[str] = [
    name for name in columns
    if any(marker in name for marker in ('odds', 'ev'))
]
rank_columns: list[str] = list(filter(lambda name: 'rank' in name, columns))
info_columns: list[str] = ['date', 'location', 'country']

# Everything that should be excluded from the modeling dataset.
drop_columns: list[str] = [
    *betting_columns,
    *rank_columns,
    *info_columns,
    *dif_columns,
]

In [9]:
# Long-term target: model on every column that survives the drop list.
# For now the smaller `basic_features` list is what actually gets used.
features: list[str] = (
    df_clean.columns[~df_clean.columns.isin(drop_columns)].tolist()
)

In [4]:
# Minimal starting feature set: fight-level context plus matching
# red-corner (r_) / blue-corner (b_) attributes for each fighter.
# The target column ('winner') is intentionally not part of this list.
basic_features: list[str] = [
    # Who is fighting
    'r_fighter', 'b_fighter',
    # Fight-level context shared by both fighters
    'title_bout', 'weight_class', 'gender',
    # Per-fighter attributes, red corner then blue corner
    'r_stance', 'b_stance',
    'r_reach_cms', 'b_reach_cms',
    'r_height_cms', 'b_height_cms',
    'r_weight_lbs', 'b_weight_lbs',
    'r_age', 'b_age',
]

#### Customizing Dataset
- Avoiding the word *transforming* so as not to be confused with sklearn transformers
- Goal is to have each fighter in their own row, with the model trying to determine the winner
- Going to expand each row into two rows, one per fighter
- Will change `winner` to be either 0 or 1
- ***Going to start with just `basic_features`***

In [5]:
def customize_dataframe(df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
    """Expand each fight into two rows, one from each fighter's point of view.

    Every input row describes one fight using red-corner (``r_``) and
    blue-corner (``b_``) columns. The result holds two consecutive rows per
    fight: the red fighter's row first, then the blue fighter's row. In each
    row the fighter's own attributes lose their corner prefix and the
    opponent's attributes are prefixed with ``opp_``.

    Parameters
    ----------
    df : pd.DataFrame
        One row per fight. Must contain every requested feature and the target.
    *args
        Unused; accepted only for backward compatibility.
    **kwargs
        features : sequence of str, optional
            Columns to carry over. Defaults to the notebook-level
            ``basic_features``. Names starting with ``r_`` / ``b_`` are treated
            as paired per-fighter columns; ``title_bout``, ``weight_class`` and
            ``gender`` are copied to both rows when listed; any other name is
            ignored.
        target : str, optional
            Column naming the winning corner (default ``'winner'``). The output
            column of the same name is 1 for the winner's row and 0 otherwise.

    Returns
    -------
    pd.DataFrame
        ``2 * len(df)`` rows with a fresh 0..N-1 index. Column order is:
        ``fighter``/``opp_fighter`` (when present), shared fight info, own
        attributes, opponent attributes, target.

    Raises
    ------
    AssertionError
        If the ``r_`` and ``b_`` features do not pair up one-to-one by name.
    KeyError
        If a requested feature or the target is missing from ``df``.

    Notes
    -----
    Any target value other than ``'Red'`` is scored as a blue-corner win.
    NOTE(review): confirm the cleaned data only contains 'Red'/'Blue' (no
    draws or missing values), otherwise those fights are mislabeled.
    """
    features = kwargs.get('features')
    if features is None:
        # Looked up lazily so an explicit `features` never touches the global.
        features = basic_features
    target: str = kwargs.get('target', 'winner')

    # De-duplicate while preserving order; the target may also be listed as a
    # feature (e.g. when passing "all remaining columns").
    features = list(dict.fromkeys(features))
    selected = list(dict.fromkeys([*features, target]))
    df = df.loc[:, selected].reset_index(drop=True)

    # Fight-level columns shared by both fighters (only those requested).
    info_features = [
        name for name in ('title_bout', 'weight_class', 'gender')
        if name in features
    ]

    # Strip only the leading corner prefix. str.replace would also rewrite
    # inner matches (e.g. the 'r_' inside 'r_avg_str_landed').
    r_roots = [name[2:] for name in features if name.startswith('r_')]
    b_roots = [name[2:] for name in features if name.startswith('b_')]

    # Every red-corner column needs its blue-corner twin, and vice versa.
    assert len(r_roots) == len(b_roots) and set(r_roots) == set(b_roots), (
        'r_/b_ features must come in matching pairs; unmatched: '
        f'{sorted(set(r_roots) ^ set(b_roots))}'
    )

    # Fighter names lead the output when available; the rest keep input order.
    identity = ['fighter'] if 'fighter' in r_roots else []
    stats = [root for root in r_roots if root != 'fighter']
    feature_order = [
        *identity,
        *[f'opp_{root}' for root in identity],
        *info_features,
        *stats,
        *[f'opp_{root}' for root in stats],
        target,
    ]

    red_won = df[target] == 'Red'

    # Build one frame per corner: own columns un-prefixed, opponent's as opp_*.
    perspectives = []
    for own, opp, won in (('r_', 'b_', red_won), ('b_', 'r_', ~red_won)):
        side = {name: df[name] for name in info_features}
        for root in r_roots:
            side[root] = df[own + root]
            side[f'opp_{root}'] = df[opp + root]
        side[target] = won.astype('int64')
        perspectives.append(pd.DataFrame(side, columns=feature_order))

    # Stack red rows above blue rows, then interleave so each fight's red row
    # is immediately followed by its blue row.
    n_fights = len(df)
    stacked = pd.concat(perspectives, ignore_index=True)
    interleaved = [
        position
        for fight in range(n_fights)
        for position in (fight, n_fights + fight)
    ]
    return stacked.iloc[interleaved].reset_index(drop=True)
    
    

In [6]:
# Build the two-rows-per-fight modeling frame using the default feature set.
df_model: pd.DataFrame = df_clean.pipe(customize_dataframe)

In [7]:
# Persist the modeling dataset; the index is dropped from the output file.
df_model.to_csv(path_or_buf='../data/model_dataset.csv', index=False)

- As one can see, this is a much more understandable dataset from a modeling perspective
- In the next notebook, I will be transforming this dataset into an optimal form for XGBoost

In [22]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_model.sample(n=10, random_state=42)

Unnamed: 0,fighter,opp_fighter,title_bout,weight_class,gender,stance,reach_cms,height_cms,weight_lbs,age,opp_stance,opp_reach_cms,opp_height_cms,opp_weight_lbs,opp_age,winner
9080,Johny Hendricks,Mike Pierce,0,Welterweight,MALE,Southpaw,175.26,175.26,185,27,Orthodox,177.8,172.72,170,30,1
3865,Cody Stamann,Tom Duquesnoy,0,Bantamweight,MALE,Orthodox,162.56,167.64,135,27,Orthodox,172.72,170.18,135,24,1
3339,Dan Hooker,Jim Miller,0,Lightweight,MALE,Switch,190.5,182.88,155,28,Southpaw,180.34,172.72,155,34,1
7526,Al Iaquinta,Piotr Hallmann,0,Lightweight,MALE,Orthodox,177.8,177.8,155,26,Orthodox,180.34,175.26,155,26,1
4966,Tim Boetsch,Josh Samman,0,Middleweight,MALE,Orthodox,187.96,182.88,185,35,Orthodox,200.66,190.5,185,28,1
5847,Tony Ferguson,Josh Thomson,0,Lightweight,MALE,Orthodox,193.04,180.34,155,31,Orthodox,180.34,177.8,155,36,1
5570,Efrain Escudero,Leandro Silva,0,Lightweight,MALE,Orthodox,177.8,175.26,155,29,Orthodox,182.88,175.26,155,30,0
5434,Michael McDonald,Masanori Kanehara,0,Bantamweight,MALE,Orthodox,177.8,175.26,135,24,Orthodox,170.18,170.18,135,33,1
4147,Ross Pearson,Dan Hooker,0,Lightweight,MALE,Orthodox,175.26,172.72,155,32,Switch,190.5,182.88,155,27,0
5597,Akbarh Arreola,Jake Matthews,0,Lightweight,MALE,Southpaw,180.34,177.8,155,32,Orthodox,185.42,180.34,170,21,0


In [27]:
# Per-fighter record: number of winner entries (fights) and the mean of the
# 0/1 winner flag. Note 'win %' is a 0-1 fraction despite its label.
(
    df_model
    .groupby('fighter')['winner']
    .agg(**{'num-fights': 'count', 'win %': 'mean'})
    .sort_values('num-fights', ascending=False)
    .round(2)
)

Unnamed: 0_level_0,num-fights,win %
fighter,Unnamed: 1_level_1,Unnamed: 2_level_1
Donald Cerrone,36,0.64
Jim Miller,30,0.53
Charles Oliveira,27,0.70
Edson Barboza,26,0.62
Rafael Dos Anjos,26,0.65
...,...,...
Charlie Valencia,1,0.00
Nate Quarry,1,0.00
Chase Gormley,1,0.00
Nandor Guelmino,1,0.00
