In [16]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 

from collections import defaultdict

import sys
sys.path.append(r'C:\Users\jcmar\my_files\SportsBetting\FeatureEngineering\ufc_features')
sys.path.append(r'C:\Users\jcmar\my_files\SportsBetting\FeatureEngineering')

from ufc_features import single_event_features, apply_rolling_stats, non_rolling_stats, upcoming_event_features

In [15]:
# Load the fight-stats CSV that the preprocessing pipeline below consumes.
# NOTE(review): absolute, machine-specific Windows path -- the notebook will not
# run on another machine without editing it.
test_csv = pd.read_csv(r'C:\Users\jcmar\my_files\SportsBetting\data\all_stats_v2.csv')


In [18]:
def standardize_dates(stats, odds):
    """Snap odds dates that are off by exactly one day onto the stats dates.

    For every fight in ``stats``, each ``odds`` row with the same
    ``red_clean`` / ``blue_clean`` pair whose ``date`` is exactly one day
    before or after the stats ``date`` has its ``date`` overwritten with the
    stats value, so both frames can afterwards be joined on an exact date.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain ``red_clean``, ``blue_clean`` and ``date`` columns.
    odds : pandas.DataFrame
        Must contain the same three columns. It is modified in place. Rows
        are addressed with ``.at`` by index label, so labels should be unique.

    Returns
    -------
    pandas.DataFrame
        The same ``odds`` object that was passed in.
    """
    one_day = pd.Timedelta(days=1)

    # Index the odds rows by matchup once, instead of rebuilding a boolean
    # mask over the whole odds frame for every stats row.
    odds_labels_by_matchup = {}
    for odds_label, red, blue in zip(odds.index, odds['red_clean'], odds['blue_clean']):
        # Missing names never compare equal in pandas, so they can never be
        # part of a matchup; keep them out of the lookup table.
        if pd.isna(red) or pd.isna(blue):
            continue
        odds_labels_by_matchup.setdefault((red, blue), []).append(odds_label)

    # Walk the stats fights in order, exactly like the row-wise scan did.
    for red, blue, stats_date in zip(stats['red_clean'], stats['blue_clean'], stats['date']):
        for odds_label in odds_labels_by_matchup.get((red, blue), ()):
            # Read the current value: an earlier stats row may already have
            # rewritten this odds date.
            target_date = odds.at[odds_label, 'date']

            # Only an exact +/- 1 day difference is treated as the same event.
            if (stats_date - one_day == target_date) or (stats_date + one_day == target_date):
                odds.at[odds_label, 'date'] = stats_date
    return odds

def clean_col(col):
    """Normalise a string Series of fighter names for matching.

    Lower-cases every value, turns hyphens into spaces, strips periods and
    apostrophes, and abbreviates the whole word "saint" to "st".
    """
    cleaned = col.str.lower()

    # Literal (non-regex) substitutions, applied in this order.
    for literal, substitute in (('-', ' '), ('.', ''), ("'", '')):
        cleaned = cleaned.str.replace(literal, substitute, regex=False)

    # Whole-word match only, so names merely containing "saint" are untouched.
    return cleaned.str.replace(r'\bsaint\b', 'st', case=False, regex=True)

def standardized_merge(stats_df, odds_df):
    """Left-join odds onto stats using cleaned fighter names and aligned dates.

    Both inputs must provide ``red_fighter``, ``blue_fighter`` and
    ``event_date`` columns. Neither input frame is modified: the function
    works on index-reset copies.

    Steps:
      1. Drop ``Unnamed*`` columns from the odds frame.
      2. Parse ``event_date`` into a ``date`` column on both frames and drop
         odds rows whose date is missing.
      3. Add cleaned name keys (``red_clean`` / ``blue_clean``) and keep the
         original spellings in ``*_fighter_stats`` / ``*_fighter_odds``.
      4. Shift odds dates that are one day off onto the stats date.
      5. Left-merge on the cleaned names and date, rename the cleaned keys to
         ``red_fighter`` / ``blue_fighter``, drop the leftover merge columns
         and exact duplicate rows.

    Returns the merged frame with a fresh ``RangeIndex``.
    """
    def _without_unnamed(frame):
        # Columns whose name starts with "Unnamed" are discarded.
        return frame.loc[:, ~frame.columns.str.contains('^Unnamed')]

    stats = stats_df.reset_index(drop=True).copy()
    odds = _without_unnamed(odds_df).reset_index(drop=True).copy()

    odds['date'] = pd.to_datetime(odds['event_date'])
    stats['date'] = pd.to_datetime(stats['event_date'])

    # Odds rows without a usable date can never be matched.
    odds = odds.dropna(subset=['date'])

    # Cleaned join keys first, then the untouched original spellings.
    stats = stats.assign(
        red_clean=clean_col(stats['red_fighter']),
        blue_clean=clean_col(stats['blue_fighter']),
        red_fighter_stats=stats['red_fighter'],
        blue_fighter_stats=stats['blue_fighter'],
    )
    odds = odds.assign(
        red_clean=clean_col(odds['red_fighter']),
        blue_clean=clean_col(odds['blue_fighter']),
        red_fighter_odds=odds['red_fighter'],
        blue_fighter_odds=odds['blue_fighter'],
    )

    odds = standardize_dates(stats, odds)

    # Stats is the left side, so every stats row survives even without odds.
    merged = stats.merge(odds, on=['red_clean', 'blue_clean', 'date'], how='left')
    merged = _without_unnamed(merged)

    leftover_columns = [
        'red_fighter_x', 'red_fighter_y',
        'blue_fighter_x', 'blue_fighter_y',
        'event_date_y', 'event_date_x',
        'og_red_fighter', 'og_blue_name',
    ]
    return (
        merged
        .rename(columns={'red_clean': 'red_fighter', 'blue_clean': 'blue_fighter'})
        .drop(columns=leftover_columns)
        .drop_duplicates()
        .reset_index(drop=True)
    )

def stats_preprocessing(stats_df):
    """Drop ``Unnamed*`` columns and run ``single_event_features`` on a copy.

    Returns whatever ``single_event_features`` returns for the trimmed frame.
    """
    keep_column = ~stats_df.columns.str.contains('^Unnamed')
    trimmed = stats_df.loc[:, keep_column]

    # A copy is handed over so the helper never receives a slice of the caller's frame.
    return single_event_features(trimmed.copy())

def apply_rolling(df):
    """Thin wrapper: return ``apply_rolling_stats(df)``."""
    return apply_rolling_stats(df)

def merge_with_odds(odds_df, stats_df):
    """Merge odds onto stats via ``standardized_merge``.

    Note the argument order here is (odds, stats), the reverse of
    ``standardized_merge``.
    """
    return standardized_merge(stats_df=stats_df, odds_df=odds_df)

# Run the feature pipeline on the loaded stats: per-event features first,
# then the rolling statistics computed from them.
test_pre = stats_preprocessing(test_csv)
test_rolling = apply_rolling(test_pre)

# NOTE(review): final_test is not used by the merge below (which takes
# test_rolling) nor by any other visible cell -- confirm whether the merge
# should use final_test instead.
final_test = non_rolling_stats(test_rolling)

# Load the odds history and join it onto the rolling-stats frame.
# NOTE(review): absolute, machine-specific Windows path.
odds_df = pd.read_csv(r'C:\Users\jcmar\my_files\SportsBetting\data\entire_odds_history.csv')
odds_stats = merge_with_odds(odds_df, test_rolling)


OLD Index(['control_pr_red', 'td_pr_red', 'sub_att_pr_red', 'sigstrikes_pm_red',
       'head_strikes_pm_red', 'leg_strikes_pm_red', 'body_strikes_pm_red',
       'clinch_strikes_pm_red', 'ground_strikes_pm_red', 'kd_pr_red',
       'reverse_pr_red', 'sigstrikes_absorbed_pm_red', 'red_kd',
       'red_sig_str_landed', 'red_td_landed', 'red_td_attempted',
       'red_sig_str_attempted', 'control_pr_blue', 'td_pr_blue',
       'sub_att_pr_blue', 'sigstrikes_pm_blue', 'head_strikes_pm_blue',
       'leg_strikes_pm_blue', 'body_strikes_pm_blue', 'clinch_strikes_pm_blue',
       'ground_strikes_pm_blue', 'kd_pr_blue', 'reverse_pr_blue',
       'sigstrikes_absorbed_pm_blue', 'blue_kd', 'blue_sig_str_landed',
       'blue_sig_str_attempted', 'blue_td_landed', 'blue_td_attempted',
       'weight_class', 'reach_blue', 'reach_red', 'height_red', 'height_blue',
       'red_age', 'blue_age', 'event_country', 'event_age', 'event_date',
       'winner_name', 'performance_bonus_winner', 'fight_otn_bo

  df = df.replace({None: np.nan})


In [27]:
# Matchup/date keys that occur more than once after the odds merge.
duplicates = (
    odds_stats
    .groupby(['red_fighter', 'blue_fighter', 'date'])
    .size()
    .reset_index(name='count')
    .query('count>1')
)
print(duplicates)
dup_keys = duplicates[['red_fighter', 'blue_fighter', 'date']]

# Pull every full row that belongs to one of the duplicated keys.
duplicate_rows = odds_stats.merge(dup_keys, on=['red_fighter', 'blue_fighter', 'date'], how='inner')

for name, group in duplicate_rows.groupby(['red_fighter', 'blue_fighter', 'date']):
    print(f"\nGroup: {name}")

    group_no_index = group.drop(columns=['red_fighter', 'blue_fighter', 'date'])

    # A column "differs" when it holds more than one distinct non-null value
    # within the group.
    unique_counts = group_no_index.nunique()
    diff_cols = unique_counts[unique_counts > 1].index.tolist()

    if diff_cols:
        print("⚠️ These columns differ between rows:", diff_cols)
    else:
        print("✅ All rows in this group are exactly the same.")

           red_fighter        blue_fighter       date  count
1239     chris barnett        martin buday 2022-04-16      2
1595    curtis blaydes       rizvan kuniev 2025-06-21      2
2884  hamdy abdelwahab      mohammed usman 2025-06-21      2
3020   irina alekseeva      klaudia sygula 2025-06-21      2
5146      max holloway      dustin poirier 2025-07-19      2
6030     rafael fiziev  ignacio bahamondes 2025-06-21      2
6379  robert whittaker   reinier de ridder 2025-07-26      2
6735      seokhyeon ko        oban elliott 2025-06-21      2
6982   tagir ulanbekov         azat maksum 2025-06-21      2
7284     tofiq musayev    myktybek orolbai 2025-06-21      2

Group: ('chris barnett', 'martin buday', Timestamp('2022-04-16 00:00:00'))
⚠️ These columns differ between rows: ['open_blue', 'close1_blue', 'close2_blue', 'open_red', 'close1_red', 'close2_red']

Group: ('curtis blaydes', 'rizvan kuniev', Timestamp('2025-06-21 00:00:00'))
⚠️ These columns differ between rows: ['close1_blue',

In [28]:
# Spot-check: merged rows whose blue-corner name contains "poirier"
# (case-insensitive; missing names count as no match).
poirier_mask = odds_stats['blue_fighter'].str.contains('Poirier', case=False, na=False)
result = odds_stats[poirier_mask]

result.tail()

Unnamed: 0,control_pr_red,td_pr_red,sub_att_pr_red,sigstrikes_pm_red,head_strikes_pm_red,leg_strikes_pm_red,body_strikes_pm_red,clinch_strikes_pm_red,ground_strikes_pm_red,kd_pr_red,...,red_fighter_stats,blue_fighter_stats,open_blue,close1_blue,close2_blue,open_red,close1_red,close2_red,red_fighter_odds,blue_fighter_odds
4865,147.122727,1.510606,0.162121,4.044848,3.558788,0.186364,0.299697,0.159091,2.500909,0.121212,...,Khabib Nurmagomedov,Dustin Poirier,205.0,315.0,350.0,-240.0,-455.0,-400.0,Khabib Nurmagomedov,Dustin Poirier
5945,49.363095,0.625,0.684524,2.095238,1.167857,0.415476,0.511905,0.375,0.425,0.119048,...,Charles Oliveira,Dustin Poirier,-165.0,-143.0,-130.0,145.0,110.0,120.0,Charles Oliveira,Dustin Poirier
7194,107.465556,0.825556,0.372222,2.053111,1.416,0.108,0.529111,0.402,0.693333,0.188889,...,Islam Makhachev,Dustin Poirier,280.0,300.0,350.0,-400.0,-500.0,-400.0,Islam Makhachev,Dustin Poirier
7775,15.903333,0.072778,0.102222,6.394667,4.268889,0.617,1.508778,0.534,0.403222,0.156667,...,Max Holloway,Dustin Poirier,110.0,-120.0,100.0,-130.0,-127.0,-108.0,Max Holloway,Dustin Poirier
7776,15.903333,0.072778,0.102222,6.394667,4.268889,0.617,1.508778,0.534,0.403222,0.156667,...,Max Holloway,Dustin Poirier,110.0,-110.0,-102.0,-130.0,-122.0,-110.0,Max Holloway,Dustin Poirier
