In [10]:
import pandas as pd
import os
from pandas import DataFrame
from typing import List
from google.cloud import storage
from pipeline.src.utils import load_yaml, load_parquet_from_gcs
from pipeline.src.transform.results_utils import wide_to_long_results
from pipeline.src.utils import load_yaml, load_parquet_from_gcs
from pipeline.src.transform.results_utils import wide_to_long_results
from pipeline.src.transform.utils import add_all_cumsum_columns, subset_most_recent_fight



In [11]:
# Point the Google Cloud client libraries at a service-account key file.
# `setdefault` keeps any credentials path already configured in the environment
# (e.g. by CI or the hosting platform) and only falls back to the local key file.
# The fallback is a relative path, so it resolves against the kernel's working directory.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'gcs-key.json')

In [21]:
def read_data(config: dict) -> pd.DataFrame:
    """Build one merged results frame from the cleaned results, fighters and events tables.

    Args:
        config: Pipeline configuration. The entries read here are
            ``config['output_files']['clean'][<'results' | 'fighters' | 'events'>]``
            (blob names) and ``config['gcs']['bucket']`` (bucket name).

    Returns:
        The reshaped results joined with fighter attributes, the opponent's
        ``opp_full_name`` and event attributes, sorted by ``date``, ``fight_url``
        and ``fighter_url`` with a fresh 0..n-1 index.

    Note:
        All three merges use pandas' default inner join, so rows without a match
        in the fighters or events table are dropped.
    """

    def _load_clean(table: str) -> pd.DataFrame:
        # Each clean table is addressed by its blob name within the same bucket.
        return load_parquet_from_gcs(
            blob_name=config['output_files']['clean'][table],
            bucket_name=config['gcs']['bucket'])

    results = _load_clean('results')
    fighters = _load_clean('fighters')
    events = _load_clean('events')

    # Slim copy of the fighters table, renamed so it can be joined on the opponent's URL
    # without colliding with the fighter's own columns.
    opponents = fighters[['fighter_url', 'full_name']].rename(
        columns={"fighter_url": "opp_url", "full_name": "opp_full_name"})

    # presumably yields one row per fighter per fight (keyed by fighter_url / opp_url) — verify
    # against wide_to_long_results
    long_results = wide_to_long_results(results)

    merged = long_results.merge(fighters, on='fighter_url')
    merged = merged.merge(opponents, on='opp_url')
    merged = merged.merge(events, on='event_url')

    ordered = merged.sort_values(by=['date', 'fight_url', 'fighter_url'])
    return ordered.reset_index(drop=True)

In [22]:
# Load the pipeline configuration from pipeline/config/config.yaml. The path is relative,
# so it resolves against the kernel's working directory. read_data expects the
# `gcs.bucket` and `output_files.clean.*` entries of a config shaped like this one.
config = load_yaml(os.path.join('pipeline', 'config', 'config.yaml'))

In [23]:
# Full DataFrame with computed stats.
# Uses the `config` object loaded in the cell above instead of re-reading the YAML file,
# so the notebook works from a single configuration.
df = (
    read_data(config=config)
    # Combined outcome label built from result + method_type, lower-cased (e.g. 'win_knockout').
    .assign(result_method=lambda x: x['result'].str.lower() + '_' + x['method_type'].str.lower())
    # presumably adds running `total_*` columns per fighter_url (see the df.columns output);
    # confirm exact semantics against add_all_cumsum_columns.
    .pipe(
        add_all_cumsum_columns,
        dummy_cols=['result', 'result_method', 'weight_class'],
        numerical_cols=['title_fight', 'perf_bonus', 'fight_of_the_night', 'fight_duration_seconds'],
        group_col='fighter_url',
        row_count_col='total_fights'
    )
)


In [24]:
# Column inventory of the enriched frame: the merged fight/fighter/event columns plus the
# `result_method` label and the cumulative `total_*` columns.
df.columns


Index(['fight_url', 'event_url', 'weight_class', 'method', 'round', 'time',
       'title_fight', 'perf_bonus', 'fight_of_the_night', 'time_seconds',
       'fight_duration_seconds', 'method_short', 'method_detail',
       'result_type', 'method_type', 'fighter_url', 'result', 'role',
       'opp_url', 'full_name', 'nickname', 'height', 'reach', 'stance',
       'date_of_birth', 'record', 'height_cm', 'reach_cm', 'opp_full_name',
       'event', 'date', 'location', 'city', 'state', 'country',
       'result_method', 'total_draw', 'total_loss', 'total_nc', 'total_win',
       'total_draw_decision', 'total_draw_other', 'total_loss_decision',
       'total_loss_knockout', 'total_loss_other', 'total_loss_submission',
       'total_nc_other', 'total_win_decision', 'total_win_knockout',
       'total_win_other', 'total_win_submission', 'total_bantamweight',
       'total_catch_weight', 'total_featherweight', 'total_flyweight',
       'total_heavyweight', 'total_light_heavyweight', 'total_lig

In [19]:
# Peek at the last five rows as a list of per-row dicts (one key per column).
df.tail().to_dict(orient='records')

[{'fight_url': 'http://ufcstats.com/fight-details/ce1436a9b58902a7',
  'event_url': 'http://ufcstats.com/event-details/ce7871949b0ed2bf',
  'weight_class': 'Lightweight',
  'method': 'KO/TKO',
  'round': 1,
  'time': '5:00',
  'title_fight': False,
  'perf_bonus': False,
  'fight_of_the_night': False,
  'time_seconds': 300.0,
  'fight_duration_seconds': 300.0,
  'method_short': 'KO/TKO',
  'method_detail': None,
  'result_type': 'Win',
  'method_type': 'Knockout',
  'fighter_url': 'http://ufcstats.com/fighter-details/ff62013d2fce6d13',
  'result': 'Win',
  'role': 'fighter1',
  'opp_url': 'http://ufcstats.com/fighter-details/eb393afdbe3293d5',
  'full_name': 'Nazim Sadykhov',
  'nickname': 'Black Wolf',
  'height': '5\' 10"',
  'reach': '69"',
  'stance': 'Southpaw',
  'date_of_birth': Timestamp('1994-05-16 00:00:00'),
  'record': '10-1-1',
  'height_cm': 177.8,
  'reach_cm': 175.26,
  'event': 'UFC Fight Night: Cannonier vs. Rodrigues',
  'date': Timestamp('2025-02-15 00:00:00'),
  'l