#### Goal: Build a model to predict the winner of UFC fights using this data
- XGBoost
- Target Feature: **`winner`**
- Classification (Winner or Loser)

In [1]:
import pandas as pd

- Not super important right now; this just indicates which columns will need to be dropped from the modeling dataset

In [2]:
# Load the cleaned fight-level dataset (relative to the notebook's directory).
df_clean: pd.DataFrame = pd.read_csv('../data/ufc-clean.csv')

# Group columns by a substring of their name so they can be excluded from
# modeling later on.
columns: list[str] = list(df_clean.columns)
dif_columns: list[str] = [column for column in columns if 'dif' in column]
# NOTE(review): 'ev' is a plain substring match, so any column whose name
# merely contains "ev" is also treated as a betting column -- confirm that
# this is intended for the columns in ufc-clean.csv.
betting_columns: list[str] = [column for column in columns if 'odds' in column or 'ev' in column]
rank_columns: list[str] = [column for column in columns if 'rank' in column]
info_columns: list[str] = ['date', 'location', 'country']

# A column matching several groups appears more than once here; that is
# harmless for the membership checks this list is used for.
drop_columns: list[str] = betting_columns + rank_columns + info_columns + dif_columns

In [9]:
# Ideally would like to get to this point where using ALL features that have not been dropped
# Until then, going to be setting basic_features as the features
# Hash the drop list once so each membership check is O(1) instead of a
# linear scan over `drop_columns`.
_dropped: set[str] = set(drop_columns)

# Every column of the cleaned dataset that is not flagged for dropping,
# in the dataset's original column order.
features: list[str] = [
    column for column in df_clean.columns
    if column not in _dropped
]

In [4]:
# Minimal feature set used as the default for the modeling dataset.
# Columns prefixed with `r_` / `b_` describe the red / blue corner; the
# remaining columns describe the fight itself.
# The target column (`winner`) is intentionally NOT listed here.
basic_features: list[str] = [
    'r_fighter',
    'b_fighter',
    'title_bout',
    'weight_class',
    'gender',
    'r_stance',
    'b_stance',
    'r_reach_cms',
    'b_reach_cms',
    'r_height_cms',
    'b_height_cms',
    'r_weight_lbs',
    'b_weight_lbs',
    'r_age',
    'b_age',
]

#### Customizing Dataset
- Avoiding the word *transforming* so as not to be confused with sklearn transformers
- Goal is to give each fighter their own row and predict whether that fighter won
- Going to expand each row into two rows, one per fighter
- Will change `winner` to be either 0 or 1 (1 = the fighter in that row won)
- ***Going to start with just `basic_features`***

In [5]:
def customize_dataframe(df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
    """Expand each fight row into two rows, one per fighter.

    Every input row describes one fight with red-corner (`r_*`) and
    blue-corner (`b_*`) columns. The output has two consecutive rows per
    fight: first the red fighter's view, then the blue fighter's view. In
    each view the fighter's own attributes lose their corner prefix and the
    opponent's attributes get an `opp_` prefix.

    Keyword Args:
        features: list of column names to use. Defaults to the notebook-level
            `basic_features`. Columns starting with `r_` / `b_` are treated
            as per-fighter columns; every other column is treated as
            fight-level info and copied to both rows. The target column is
            ignored if it is listed here.
        target: name of the column holding the winning corner
            (default `'winner'`).

    Returns:
        A new DataFrame with a fresh 0..2n-1 index and columns ordered as:
        `fighter`, `opp_fighter` (when `r_fighter` is among the features),
        fight-level info columns, own attributes, opponent attributes, and
        finally the target encoded as 1 (this row's fighter won) or 0.
        A target value of `'Red'` marks the red row as the winner; any other
        value marks the blue row as the winner.

    Raises:
        AssertionError: if the `r_*` and `b_*` features do not describe the
            same set of attributes.
        KeyError: if a requested column is missing from `df`.
    """
    # Only fall back to the notebook-level default when no list was supplied.
    features = kwargs.get('features')
    if features is None:
        features = basic_features
    target: str = kwargs.get('target', 'winner')

    # The target is handled on its own; tolerate it being listed as a feature
    # so it is not selected twice.
    features = [feature for feature in features if feature != target]

    # Split into per-corner columns and fight-level columns.
    r_features: list[str] = [feature for feature in features if feature.startswith('r_')]
    b_features: list[str] = [feature for feature in features if feature.startswith('b_')]
    info_features: list[str] = [
        feature for feature in features if not feature.startswith(('r_', 'b_'))
    ]

    # Strip ONLY the leading corner prefix; `str.replace` would also rewrite
    # an 'r_' / 'b_' that appears later in the name (e.g. '..._Doctor_Stoppage').
    root_features: list[str] = [feature[2:] for feature in r_features]
    b_roots: list[str] = [feature[2:] for feature in b_features]

    # Both corners must provide exactly the same attributes.
    assert sorted(root_features) == sorted(b_roots), (
        'r_ and b_ features must describe the same attributes; '
        f'unmatched: {sorted(set(root_features) ^ set(b_roots))}'
    )

    # Column order for the customized dataframe: names first (when present),
    # regardless of where `r_fighter` sits in the feature list.
    attribute_roots = [root for root in root_features if root != 'fighter']
    name_columns = ['fighter', 'opp_fighter'] if 'fighter' in root_features else []
    feature_order = (
        name_columns
        + info_features
        + attribute_roots
        + [f'opp_{root}' for root in attribute_roots]
        + [target]
    )

    # Column renames for each fighter's point of view.
    red_names = {
        **{f'r_{root}': root for root in root_features},
        **{f'b_{root}': f'opp_{root}' for root in root_features},
    }
    blue_names = {
        **{f'b_{root}': root for root in root_features},
        **{f'r_{root}': f'opp_{root}' for root in root_features},
    }

    fights = df.loc[:, features]
    n_fights = len(fights)

    # 1 where the red corner won, 0 otherwise (anything that is not 'Red'
    # counts as a blue win).
    red_won = (df[target] == 'Red').to_numpy().astype('int64')

    # Red rows take the even positions and blue rows the odd ones, so sorting
    # by index interleaves them as red, blue, red, blue, ...
    red_rows = fights.rename(columns=red_names)
    red_rows[target] = red_won
    red_rows.index = pd.RangeIndex(0, 2 * n_fights, 2)

    blue_rows = fights.rename(columns=blue_names)
    blue_rows[target] = 1 - red_won
    blue_rows.index = pd.RangeIndex(1, 2 * n_fights, 2)

    return (pd
            .concat([red_rows, blue_rows])
            .sort_index()
            .loc[:, feature_order]
            .reset_index(drop=True)
           )
    
    

In [6]:
# Build the one-row-per-fighter modeling frame using the function's defaults
# (the `basic_features` columns and the `winner` target).
df_model: pd.DataFrame = customize_dataframe(df_clean)

In [7]:
# Persist the modeling frame to disk; index=False keeps the row index out of
# the CSV.
df_model.to_csv('../data/model_dataset.csv', index=False)

- As one can see, this is a much more understandable dataset from a modeling perspective
- In the next notebook, I will be transforming this dataset into an optimal form for XGBoost

In [10]:
# Preview the first 10 rows (i.e. the first five fights, two rows each).
df_model.head(10)

Unnamed: 0,fighter,opp_fighter,title_bout,weight_class,gender,stance,reach_cms,height_cms,weight_lbs,age,opp_stance,opp_reach_cms,opp_height_cms,opp_weight_lbs,opp_age,winner
0,Thiago Santos,Johnny Walker,0,Light Heavyweight,MALE,Orthodox,193.04,187.96,205,37,Orthodox,208.28,198.12,205,29,1
1,Johnny Walker,Thiago Santos,0,Light Heavyweight,MALE,Orthodox,208.28,198.12,205,29,Orthodox,193.04,187.96,205,37,0
2,Alex Oliveira,Niko Price,0,Welterweight,MALE,Orthodox,193.04,180.34,170,33,Orthodox,193.04,182.88,170,32,0
3,Niko Price,Alex Oliveira,0,Welterweight,MALE,Orthodox,193.04,182.88,170,32,Orthodox,193.04,180.34,170,33,1
4,Misha Cirkunov,Krzysztof Jotko,0,Middleweight,MALE,Orthodox,195.58,190.5,205,34,Southpaw,195.58,185.42,185,32,0
5,Krzysztof Jotko,Misha Cirkunov,0,Middleweight,MALE,Southpaw,195.58,185.42,185,32,Orthodox,195.58,190.5,205,34,1
6,Alexander Hernandez,Mike Breeden,0,Lightweight,MALE,Orthodox,182.88,175.26,155,29,Orthodox,177.8,177.8,155,32,1
7,Mike Breeden,Alexander Hernandez,0,Lightweight,MALE,Orthodox,177.8,177.8,155,32,Orthodox,182.88,175.26,155,29,0
8,Joe Solecki,Jared Gordon,0,Lightweight,MALE,Orthodox,177.8,175.26,155,28,Orthodox,172.72,175.26,145,33,0
9,Jared Gordon,Joe Solecki,0,Lightweight,MALE,Orthodox,172.72,175.26,145,33,Orthodox,177.8,175.26,155,28,1
