In [2]:
import pandas as pd
import os
from pandas import DataFrame
from typing import List
from src.scrape.utils import load_yaml
from src.transform.results_utils import wide_to_long_results
from src.transform.utils import add_all_cumsum_columns, subset_most_recent_fight

In [3]:
def read_data(config: dict) -> pd.DataFrame:
    """Load the cleaned parquet outputs and join them into a single frame.

    The results table is reshaped with ``wide_to_long_results`` (imported from
    ``src.transform.results_utils``), then fighter attributes are joined on
    ``fighter_url`` and event attributes on ``event_url``.

    Args:
        config: Nested mapping holding the parquet locations under
            ``config['output_files']['clean']`` with the keys ``'results'``,
            ``'fighters'`` and ``'events'``.

    Returns:
        The merged frame, sorted by ``date``, ``fight_url`` and
        ``fighter_url``, with the index reset to a fresh ``RangeIndex``.

    Raises:
        KeyError: If any of the expected configuration keys is missing.

    Note:
        Both merges rely on pandas' default ``how='inner'``, so result rows
        without a matching fighter or event are dropped from the output.
    """
    clean_paths = config['output_files']['clean']
    results = pd.read_parquet(clean_paths['results'])
    fighters = pd.read_parquet(clean_paths['fighters'])
    events = pd.read_parquet(clean_paths['events'])

    results_long = wide_to_long_results(results)
    with_fighters = results_long.merge(fighters, on='fighter_url')
    with_events = with_fighters.merge(events, on='event_url')

    # Sort on the full key so the row order is reproducible between runs.
    ordered = with_events.sort_values(by=['date', 'fight_url', 'fighter_url'])
    return ordered.reset_index(drop=True)


In [4]:
# Load the project configuration (the path is relative to the current working
# directory) and build the merged results / fighters / events frame.
df_clean = read_data(
    config=load_yaml(os.path.join('config', 'config.yaml'))
)

In [5]:
# Derive a combined lowercase label "<result>_<method_type>" (for example
# 'Loss' + 'Decision' -> 'loss_decision'); it is one-hot encoded further down.
df_clean = df_clean.assign(
    result_method=lambda frame: (
        frame['result'].str.lower() + '_' + frame['method_type'].str.lower()
    )
)

In [14]:
# Compare the raw clock columns with the derived durations. In the displayed
# rows `time_seconds` is the clock time within the final round, while
# `fight_duration_seconds` is the total elapsed time (round 3 at 5:00 -> 900.0).
df_clean.loc[:, ['round', 'time', 'time_seconds', 'fight_duration_seconds']]

Unnamed: 0,round,time,time_seconds,fight_duration_seconds
0,1,1:17,77.0,77.0
1,1,1:17,77.0,77.0
2,1,9:51,591.0,591.0
3,1,9:51,591.0,591.0
4,1,2:50,170.0,170.0
...,...,...,...,...
16043,3,5:00,300.0,900.0
16044,5,5:00,300.0,1500.0
16045,5,5:00,300.0,1500.0
16046,3,5:00,300.0,900.0


In [13]:
# List every column available on the merged frame before the cumulative
# features are added.
df_clean.columns

Index(['fight_url', 'event_url', 'weight_class', 'method', 'round', 'time',
       'title_fight', 'perf_bonus', 'fight_of_the_night', 'time_seconds',
       'fight_duration_seconds', 'method_short', 'method_detail',
       'result_type', 'method_type', 'fighter_url', 'result', 'role',
       'opp_url', 'full_name', 'nickname', 'height', 'reach', 'stance',
       'date_of_birth', 'record', 'height_cm', 'reach_cm', 'event', 'date',
       'location', 'city', 'state', 'country', 'result_method'],
      dtype='object')

In [6]:
# Add the `total_*` feature columns: one per category of each dummy column
# (e.g. `total_welterweight`), one per numerical column, plus the
# `total_fights` row counter.
# NOTE(review): presumably these are running totals within each `fighter_url`
# group — confirm against add_all_cumsum_columns in src.transform.utils.
df = add_all_cumsum_columns(
    df=df_clean,
    group_col='fighter_url',
    row_count_col='total_fights',
    dummy_cols=['result', 'result_method', 'weight_class'],
    numerical_cols=[
        'title_fight',
        'perf_bonus',
        'fight_of_the_night',
        'fight_duration_seconds',
    ],
)

In [7]:
# Inspect the frame with the added `total_*` columns; pandas truncates the
# middle rows and columns in the rendered output.
df

Unnamed: 0,fight_url,event_url,weight_class,method,round,time,title_fight,perf_bonus,fight_of_the_night,time_seconds,...,total_welterweight,total_womens_bantamweight,total_womens_featherweight,total_womens_flyweight,total_womens_strawweight,total_title_fight,total_perf_bonus,total_fight_of_the_night,total_fight_duration_seconds,total_fights
0,http://ufcstats.com/fight-details/00835554f95f...,http://ufcstats.com/event-details/a6a9ab5a824e...,Open Weight,KO/TKO\n\n \n\n Punches,1,1:17,True,False,False,77.0,...,0,0,0,0,0,1,0,0,77.0,1
1,http://ufcstats.com/fight-details/00835554f95f...,http://ufcstats.com/event-details/a6a9ab5a824e...,Open Weight,KO/TKO\n\n \n\n Punches,1,1:17,True,False,False,77.0,...,0,0,0,0,0,1,0,0,77.0,1
2,http://ufcstats.com/fight-details/17ee4caf0698...,http://ufcstats.com/event-details/a6a9ab5a824e...,Open Weight,SUB\n\n \n\n Ezekiel Choke,1,9:51,False,False,False,591.0,...,0,0,0,0,0,0,0,0,591.0,1
3,http://ufcstats.com/fight-details/17ee4caf0698...,http://ufcstats.com/event-details/a6a9ab5a824e...,Open Weight,SUB\n\n \n\n Ezekiel Choke,1,9:51,False,False,False,591.0,...,0,0,0,0,0,0,0,0,591.0,1
4,http://ufcstats.com/fight-details/3b020d4914b4...,http://ufcstats.com/event-details/a6a9ab5a824e...,Open Weight,KO/TKO,1,2:50,False,False,False,170.0,...,0,0,0,0,0,0,0,0,170.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16043,http://ufcstats.com/fight-details/c7fadbdfdc14...,http://ufcstats.com/event-details/e6015889f500...,Bantamweight,U-DEC,3,5:00,False,False,False,300.0,...,0,0,0,0,0,0,0,0,900.0,1
16044,http://ufcstats.com/fight-details/d13849f49f99...,http://ufcstats.com/event-details/e6015889f500...,Middleweight,U-DEC,5,5:00,True,False,False,300.0,...,0,0,0,0,0,3,2,2,6549.0,9
16045,http://ufcstats.com/fight-details/d13849f49f99...,http://ufcstats.com/event-details/e6015889f500...,Middleweight,U-DEC,5,5:00,True,False,False,300.0,...,8,0,0,0,0,3,4,1,21728.0,23
16046,http://ufcstats.com/fight-details/de85ef2fd476...,http://ufcstats.com/event-details/e6015889f500...,Light Heavyweight,M-DEC,3,5:00,False,False,False,300.0,...,0,0,0,0,0,0,0,1,1414.0,2


In [8]:
# Reduce the history to each fighter's current state.
# NOTE(review): presumably keeps the latest row per `fighter_url` ordered by
# `date` — confirm against subset_most_recent_fight in src.transform.utils.
df_current = subset_most_recent_fight(
    df=df,
    date_col='date',
    fighter_col='fighter_url',
)

In [9]:
# Show the first five rows as plain dicts so that no column is hidden by the
# DataFrame display truncation.
(
    df_current
    .head(n=5)
    .to_dict(orient='records')
)

[{'fight_url': 'http://ufcstats.com/fight-details/0e3af9b5e1b3ae80',
  'event_url': 'http://ufcstats.com/event-details/fc9a9559a05f2704',
  'weight_class': 'Bantamweight',
  'method': 'U-DEC',
  'round': 3,
  'time': '5:00',
  'title_fight': False,
  'perf_bonus': False,
  'fight_of_the_night': True,
  'time_seconds': 300.0,
  'fight_duration_seconds': 900.0,
  'method_short': 'U-DEC',
  'method_detail': None,
  'result_type': 'Win',
  'method_type': 'Decision',
  'fighter_url': 'http://ufcstats.com/fighter-details/002ca196477ce572',
  'result': 'Loss',
  'role': 'fighter2',
  'opp_url': 'http://ufcstats.com/fighter-details/60425d07ef4b91a7',
  'full_name': 'Gabriel Silva',
  'nickname': '',
  'height': '5\' 6"',
  'reach': '71"',
  'stance': 'Orthodox',
  'date_of_birth': Timestamp('1994-08-26 00:00:00'),
  'record': '8-2-0',
  'height_cm': 167.64000000000001,
  'reach_cm': 180.34,
  'event': 'UFC Fight Night: Benavidez vs. Figueiredo',
  'date': Timestamp('2020-02-29 00:00:00'),
  'l