# Import libraries

In [1]:
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Show every column when a wide dataframe is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)

# Import datasets

In [2]:
# Read the five raw CSV files; the file names are listed in the same order as
# the variables they are unpacked into.
fight_stats, fight_results, fighter_stats, fighter_links, fight_event_details = (
    pd.read_csv(csv_name)
    for csv_name in (
        'ufc_fight_stats.csv',
        'ufc_fight_results.csv',
        'ufc_fighter_tott.csv',
        'ufc.csv',
        'ufc_event_details.csv',
    )
)

In [3]:
# (rows, columns) of the three main raw tables, as a quick check of the load.
fight_stats.shape, fight_results.shape, fighter_stats.shape

((32534, 19), (6984, 11), (3929, 7))

# Functions

In [4]:
def create_data_dictionary(data_full):
    '''
    Build a per-column summary of a dataframe for use as a data dictionary.

    INPUT: dataframe
    OUTPUT: dataframe with one row per input column and the columns
            'column'             - the column name
            'Unique value count' - number of distinct values (a null counts as one value)
            'Unique valuer'      - up to the first six distinct values
            'NULL count'         - number of null entries
    '''
    column_list = data_full.columns.tolist()

    # Distinct values are computed once per column and reused for both the
    # count and the sample of values.
    distinct_values = [data_full[name].unique() for name in column_list]
    uni_val_count = [len(values) for values in distinct_values]
    uni_val = [values[0:6] for values in distinct_values]

    # Count nulls directly instead of materialising (and printing) the
    # filtered rows for every column.
    null_count = [int(data_full[name].isnull().sum()) for name in column_list]

    # All four lists are derived from column_list, so their lengths always
    # agree and the result is always a dataframe.
    # The 'Unique valuer' key is kept as-is so existing consumers keep working.
    return pd.DataFrame({'column': column_list,
                         'Unique value count': uni_val_count,
                         'Unique valuer': uni_val,
                         'NULL count': null_count})

In [5]:
def minute_to_second(time):
    '''
    Convert a "minutes:seconds" value into a total number of seconds.

    INPUT: value whose string form looks like "12:32"
    OUTPUT: int (752), or np.nan when the string form contains no ':'
    '''
    clock_text = str(time)
    if ':' not in clock_text:
        return np.nan

    parts = clock_text.split(':')
    return int(parts[0]) * 60 + int(parts[1])

In [6]:
def height_to_inches(ht):
    '''
    Convert a feet-and-inches height string into inches.

    INPUT: str in the format 7' 0.0" (feet, apostrophe, space, inches, double quote)
    OUTPUT: float (84.0); np.nan when the value is not a string (e.g. a
            missing value) or does not split into a feet part and an inches part
    RAISES: ValueError when the value has the expected shape but a part is not numeric
    '''
    # Missing values arrive as float NaN / None and have no .split().
    if not isinstance(ht, str):
        return np.nan

    ht_ = ht.split("' ")
    # Placeholders such as '0' or '--' do not contain the feet/inches
    # separator; return an explicit NaN rather than an implicit None.
    if len(ht_) != 2:
        return np.nan

    ft_ = float(ht_[0])
    in_ = float(ht_[1].replace("\"", ""))
    return (12 * ft_) + in_

In [7]:
def opponent(x):
    '''
    Return the other name in a row's "A vs. B" bout string.

    INPUT: row-like object with 'BOUT' and 'FIGHTER' entries
    OUTPUT: the second name when the first name equals x['FIGHTER'],
            otherwise the first name
    '''
    names = str(x['BOUT']).split(' vs. ')
    fighter_is_first = names[0] == x['FIGHTER']
    return names[1] if fighter_is_first else names[0]

# fight_stats

In [8]:
# Work on a copy so the raw fight_stats frame stays untouched.
fight_stats_copy = fight_stats.copy()
# Trim the merge keys and collapse runs of whitespace to a single space.
# The pattern is a raw string ("\s" is an invalid escape in a normal string
# literal), and the vectorised .str methods pass missing values through
# instead of raising on them.
for key_column in ['EVENT', 'BOUT']:
    fight_stats_copy[key_column] = (
        fight_stats_copy[key_column].str.strip().str.replace(r"\s+", " ", regex=True)
    )

In [9]:
# Display the fight stats frame after the EVENT/BOUT whitespace cleanup.
fight_stats_copy

Unnamed: 0,EVENT,BOUT,ROUND,FIGHTER,KD,SIG.STR.,SIG.STR. %,TOTAL STR.,TD,TD %,SUB.ATT,REV.,CTRL,HEAD,BODY,LEG,DISTANCE,CLINCH,GROUND
0,UFC Fight Night: Muniz vs. Allen,Andre Muniz vs. Brendan Allen,Round 1,Andre Muniz,0.0,16 of 40,40%,16 of 40,0 of 1,0%,0.0,0.0,0:07,7 of 28,5 of 7,4 of 5,16 of 39,0 of 1,0 of 0
1,UFC Fight Night: Muniz vs. Allen,Andre Muniz vs. Brendan Allen,Round 2,Andre Muniz,0.0,11 of 34,32%,13 of 36,0 of 2,0%,0.0,0.0,0:08,7 of 30,1 of 1,3 of 3,11 of 34,0 of 0,0 of 0
2,UFC Fight Night: Muniz vs. Allen,Andre Muniz vs. Brendan Allen,Round 3,Andre Muniz,0.0,16 of 38,42%,16 of 38,0 of 0,---,0.0,0.0,0:00,11 of 30,2 of 5,3 of 3,16 of 38,0 of 0,0 of 0
3,UFC Fight Night: Muniz vs. Allen,Andre Muniz vs. Brendan Allen,Round 1,Brendan Allen,0.0,19 of 47,40%,19 of 47,0 of 0,---,0.0,0.0,0:00,8 of 32,6 of 9,5 of 6,19 of 47,0 of 0,0 of 0
4,UFC Fight Night: Muniz vs. Allen,Andre Muniz vs. Brendan Allen,Round 2,Brendan Allen,0.0,12 of 28,42%,25 of 43,0 of 0,---,0.0,0.0,2:44,6 of 22,4 of 4,2 of 2,10 of 24,0 of 0,2 of 4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32529,UFC 2: No Way Out,Johnny Rhodes vs. David Levicki,Round 1,David Levicki,0.0,4 of 5,80%,95 of 102,0 of 0,---,0.0,0.0,--,4 of 5,0 of 0,0 of 0,1 of 2,2 of 2,1 of 1
32530,UFC 2: No Way Out,Patrick Smith vs. Ray Wizard,Round 1,Patrick Smith,0.0,1 of 1,100%,1 of 1,0 of 1,0%,1.0,0.0,--,0 of 0,1 of 1,0 of 0,0 of 0,1 of 1,0 of 0
32531,UFC 2: No Way Out,Patrick Smith vs. Ray Wizard,Round 1,Ray Wizard,0.0,1 of 1,100%,2 of 2,0 of 0,---,0.0,0.0,--,0 of 0,0 of 0,1 of 1,1 of 1,0 of 0,0 of 0
32532,UFC 2: No Way Out,Scott Morris vs. Sean Daugherty,Round 1,Scott Morris,0.0,1 of 1,100%,2 of 2,1 of 1,100%,1.0,0.0,--,1 of 1,0 of 0,0 of 0,0 of 0,1 of 1,0 of 0


In [10]:
# Split each "<landed> of <attempted>" cell (e.g. "16 of 40") into two columns.
columnslist = ['SIG.STR.','TOTAL STR.', 'TD', 'HEAD', 'BODY','LEG', 'DISTANCE', 'CLINCH', 'GROUND']
for strike in columnslist:
    # Anchor on the literal " of " separator. The earlier pattern used a greedy
    # ".+" between the two groups, which swallowed all but the last digit of
    # the second number ("16 of 40" came out as "16" and "0").
    strike_counts = fight_stats[strike].str.extract(r"(\d+)\s+of\s+(\d+)")
    # The first number is the landed count and the second the attempted count
    # ("16 of 40" appears next to a 40% accuracy). Column order is kept as
    # ATTEMPTED then LANDED; values remain strings, as before.
    fight_stats_copy['ATTEMPTED ' + strike] = strike_counts[1]
    fight_stats_copy['LANDED ' + strike] = strike_counts[0]

In [11]:
# Remove the original "x of y" columns now that they have been split.
fight_stats_copy = fight_stats_copy.drop(columnslist, axis=1)

In [12]:
# Convert control time from "m:ss" to seconds (values without ':' become NaN).
fight_stats_copy['CTRL'] = fight_stats_copy['CTRL'].map(minute_to_second)

# fight_results

In [13]:
# Work on an independent copy so the raw fight_results frame stays untouched.
fight_results_copy = fight_results.copy(deep=True)

In [14]:
# List the column names of the fight results table.
fight_results_copy.columns

Index(['EVENT', 'BOUT', 'OUTCOME', 'WEIGHTCLASS', 'METHOD', 'ROUND', 'TIME',
       'TIME FORMAT', 'REFEREE', 'DETAILS', 'URL'],
      dtype='object')

In [15]:
# Inspect the EVENT values before cleaning (the recorded output shows a trailing space).
fight_results_copy['EVENT']

0       UFC Fight Night: Muniz vs. Allen 
1       UFC Fight Night: Muniz vs. Allen 
2       UFC Fight Night: Muniz vs. Allen 
3       UFC Fight Night: Muniz vs. Allen 
4       UFC Fight Night: Muniz vs. Allen 
                      ...                
6979                   UFC 2: No Way Out 
6980                   UFC 2: No Way Out 
6981                   UFC 2: No Way Out 
6982                   UFC 2: No Way Out 
6983                   UFC 2: No Way Out 
Name: EVENT, Length: 6984, dtype: object

In [16]:
# Trim the merge keys and collapse runs of whitespace to a single space.
# The pattern is a raw string ("\s" is an invalid escape in a normal string
# literal), and the vectorised .str methods pass missing values through
# instead of raising on them.
for key_column in ['EVENT', 'BOUT', 'URL']:
    fight_results_copy[key_column] = (
        fight_results_copy[key_column].str.strip().str.replace(r"\s+", " ", regex=True)
    )

In [17]:
# Rename ROUND and URL so they do not clash with same-named columns of the
# other tables once the frames are merged.
fight_results_copy = fight_results_copy.rename(
    {'ROUND': 'WIN ROUND', 'URL': 'Fight Result URL'},
    axis='columns',
)

In [18]:
# Convert the finish time from "m:ss" to seconds (values without ':' become NaN).
# Map over the single TIME column instead of applying a function to every
# full row; the result is the same and this matches how CTRL is converted.
fight_results_copy['TIME'] = fight_results_copy['TIME'].map(minute_to_second)

# fighter_stats

In [19]:
# Work on an independent copy so the raw fighter_stats frame stays untouched.
fighter_stats_copy = fighter_stats.copy(deep=True)

In [20]:
# Give the URL column a table-specific name.
fighter_stats_copy = fighter_stats_copy.rename({'URL': 'Fighter Stats URL'}, axis='columns')

In [21]:
# Replace the '--' placeholder with '0' and strip the unit text from
# WEIGHT (' lbs.') and REACH ('"'); values are left as strings here.
for measure_column, unit_text in (('WEIGHT', ' lbs.'), ('REACH', '"')):
    without_placeholder = fighter_stats_copy[measure_column].replace('--', '0')
    fighter_stats_copy[measure_column] = without_placeholder.map(
        lambda raw: str(raw).replace(unit_text, '')
    )

# HEIGHT gets the same placeholder treatment and is then converted from
# feet-and-inches text to inches.
height_without_placeholder = fighter_stats_copy['HEIGHT'].replace('--', '0')
fighter_stats_copy['HEIGHT'] = height_without_placeholder.map(height_to_inches)

# fighter_links

In [22]:
# clean excess string parts

In [23]:
# Work on an independent copy so the raw fighter_links frame stays untouched.
fighter_links_copy = fighter_links.copy(deep=True)

In [24]:
# clean white space in merge columns

In [25]:
# Drop the first 5 characters and the last character of every 'fight' value.
# NOTE(review): presumably fixed wrapper text around the link in ufc.csv — confirm against the raw file.
fighter_links_copy['fight'] = fighter_links_copy['fight'].map(lambda raw: raw[5:-1])

In [26]:
# Drop the first 4 and the last 5 characters of every 'event' value.
# NOTE(review): presumably fixed wrapper text around the event name in ufc.csv — confirm against the raw file.
fighter_links_copy['event'] = fighter_links_copy['event'].map(lambda raw: raw[4:-5])

In [27]:
# Trim the merge keys and collapse runs of whitespace to a single space.
# The pattern is a raw string ("\s" is an invalid escape in a normal string
# literal), and the vectorised .str methods pass missing values through
# instead of raising on them.
for key_column in ['fight', 'event']:
    fighter_links_copy[key_column] = (
        fighter_links_copy[key_column].str.strip().str.replace(r"\s+", " ", regex=True)
    )

In [28]:
# rename columns to merge 

In [29]:
# Match the EVENT column name used by the other tables.
fighter_links_copy = fighter_links_copy.rename({'event': 'EVENT'}, axis='columns')

In [30]:
# Match the 'Fight Result URL' column name used by the fight results table.
fighter_links_copy = fighter_links_copy.rename({'fight': 'Fight Result URL'}, axis='columns')

In [31]:
# Link column for the first listed fighter.
fighter_links_copy = fighter_links_copy.rename({'fighter1': 'Fighter Stats URL1'}, axis='columns')

In [32]:
# Link column for the second listed fighter.
fighter_links_copy = fighter_links_copy.rename({'fighter2': 'Fighter Stats URL2'}, axis='columns')

# fight_event_details

In [33]:
# Work on an independent copy so the raw fight_event_details frame stays untouched.
fight_event_details_copy = fight_event_details.copy(deep=True)

In [34]:
#clean potential whitespace

In [35]:
# Trim the merge keys and collapse runs of whitespace to a single space.
# The pattern is a raw string ("\s" is an invalid escape in a normal string
# literal), and the vectorised .str methods pass missing values through
# instead of raising on them.
for key_column in ['EVENT', 'URL']:
    fight_event_details_copy[key_column] = (
        fight_event_details_copy[key_column].str.strip().str.replace(r"\s+", " ", regex=True)
    )

In [36]:
# Give the URL column a table-specific name.
fight_event_details_copy = fight_event_details_copy.rename({'URL': 'Fight Event URL'}, axis='columns')

# fight_stats + fighter_stats + fighter_details + fight_event_details + fight_details + fight_links

In [37]:
# df2 is a second name for fighter_stats_copy (no copy is made here).
df2 = fighter_stats_copy

In [38]:
# Name the URL column after the first-fighter link so the two frames share that key.
df2 = df2.rename({'Fighter Stats URL': 'Fighter Stats URL1'}, axis='columns')

In [39]:
# df1 is a second name for fighter_links_copy (no copy is made here).
df1 = fighter_links_copy

In [40]:
# Attach the first fighter's stats to each link row (inner join).
# The key is named explicitly: without `on`, merge joins on every column name
# the two frames happen to share, which hides the intended key and would
# change silently if either frame gained a same-named column.
result = pd.merge(df1, df2, on='Fighter Stats URL1')

In [41]:
# Re-key the fighter stats on the second-fighter link for the next merge.
df2 = df2.rename({'Fighter Stats URL1': 'Fighter Stats URL2'}, axis='columns')

In [42]:
# Attach the second fighter's stats; overlapping stat columns get the
# default _x (first fighter) / _y (second fighter) suffixes.
result = result.merge(df2, on='Fighter Stats URL2')

In [43]:
# List the column names of the fighter stats table.
fighter_stats_copy.columns

Index(['FIGHTER', 'HEIGHT', 'WEIGHT', 'REACH', 'STANCE', 'DOB',
       'Fighter Stats URL'],
      dtype='object')

In [44]:
# Add the bout-level result to every per-round stats row (left join keeps all stats rows).
event_merge = fight_stats_copy.merge(fight_results_copy, how='left', on=['EVENT', 'BOUT'])

In [45]:
# Add the event details to every row (left join keeps all rows).
event_merge2 = event_merge.merge(fight_event_details_copy, how='left', on=['EVENT'])

In [46]:
# Attach the per-bout fighter links and stats, then record which of the two
# linked fighters each row belongs to.
testdf = event_merge2.merge(result, how='left', on=['Fight Result URL', 'EVENT'])

is_first_fighter = testdf['FIGHTER'] == testdf['FIGHTER_x']
is_second_fighter = testdf['FIGHTER'] == testdf['FIGHTER_y']
testdf.loc[is_first_fighter, 'Fighter Stats URL'] = testdf['Fighter Stats URL1']
# Assigned second, so it wins if a row matches both names.
testdf.loc[is_second_fighter, 'Fighter Stats URL'] = testdf['Fighter Stats URL2']

# The suffixed per-fighter stat columns were only needed for the matching above.
xycolumns = [
    stat_name + side_suffix
    for side_suffix in ('_x', '_y')
    for stat_name in ('FIGHTER', 'HEIGHT', 'WEIGHT', 'REACH', 'STANCE', 'DOB')
]
testdf = testdf.drop(columns=xycolumns)

In [47]:
# Attach each row's own fighter stats, keyed on the resolved URL and the fighter name.
testdf = testdf.merge(fighter_stats_copy, how='left', on=['Fighter Stats URL', 'FIGHTER'])

In [48]:
# Number of rows per (EVENT, BOUT) pair, largest first.
testdf[['EVENT', 'BOUT']].value_counts()

EVENT                                      BOUT                                 
UFC 263: Adesanya vs. Vettori 2            Israel Adesanya vs. Marvin Vettori       10
UFC 225: Whittaker vs. Romero 2            Robert Whittaker vs. Yoel Romero         10
UFC 223: Khabib vs. Iaquinta               Khabib Nurmagomedov vs. Al Iaquinta      10
                                           Rose Namajunas vs. Joanna Jedrzejczyk    10
UFC 118: Edgar vs Penn 2                   Frankie Edgar vs. BJ Penn                10
                                                                                    ..
UFC Fight Night 56: Shogun vs Saint Preux  Caio Magalhaes vs. Trevor Smith           2
                                           Leandro Silva vs. Charlie Brenneman       2
                                           Mauricio Rua vs. Ovince Saint Preux       2
UFC Fight Night 6                          Anthony Torres vs. Pat Healy              2
UFC: Silva vs Irvin                        Tim Cr

# Add OPPONENT stats

In [49]:
# Derive each row's opponent name from its BOUT string.
testdf['OPPONENT'] = testdf.apply(opponent, axis=1)

In [50]:
# Self-join to put the opponent's numbers on the same row.
# Right side: one row per (EVENT, BOUT, FIGHTER, ROUND), holding the first entry
# of every other column in that group (GroupBy.first skips nulls).
# The left columns EVENT, BOUT, OPPONENT, ROUND are matched against that index,
# so each row picks up the row of its opponent for the same round.
# Right-hand columns whose names already exist on the left get the 'opp' suffix.
testdf = testdf.join(testdf.groupby(['EVENT','BOUT','FIGHTER', 'ROUND']).first(), on=['EVENT','BOUT', 'OPPONENT', 'ROUND'], rsuffix='opp')

In [51]:
# Names of columns that are dropped from testdf in a later cell: mostly
# 'opp'-suffixed columns, plus the two 'Fighter Stats URL1/2' link columns.
columns = [ 'OUTCOMEopp',
       'WEIGHTCLASSopp', 'METHODopp', 'WIN ROUNDopp', 'TIMEopp',
       'TIME FORMATopp', 'REFEREEopp', 'DETAILSopp', 'Fight Result URLopp',
       'OPPONENTopp','Fighter Stats URL1', 'Fighter Stats URL2','Fight Event URLopp', 'DATEopp', 'LOCATIONopp',
       'Fighter Stats URL1opp', 'Fighter Stats URL2opp']

In [52]:
# Treat the '--' and '---' placeholder cells as missing values.
testdf = testdf.replace(to_replace=['--', '---'], value=np.nan)

In [53]:
# Remove the columns listed in `columns`.
testdf = testdf.drop(columns=columns)

In [54]:
#Clean Weight Reach and Height

In [55]:
# A '0' string in a body-measurement column stands for "unknown"; make it missing.
body_measure_columns = ['WEIGHT', 'REACH', 'HEIGHT', 'WEIGHTopp', 'REACHopp', 'HEIGHTopp']
testdf[body_measure_columns] = testdf[body_measure_columns].replace('0', np.nan)

In [56]:
#Clean Percentages

In [57]:
# Strip the percent sign only from the percentage columns (those with '%' in
# their name). A frame-wide regex replace would also remove '%' from every
# other text column, such as names, details and URLs, and it scans columns
# that cannot contain percentages at all.
percent_columns = [name for name in testdf.columns if '%' in str(name)]
testdf[percent_columns] = testdf[percent_columns].replace('%', '', regex=True)

In [58]:
# Store the body measurements (own and opponent) as floats.
float_measure_columns = ['WEIGHT', 'REACH', 'HEIGHT', 'WEIGHTopp', 'REACHopp', 'HEIGHTopp']
testdf[float_measure_columns] = testdf[float_measure_columns].astype('float64')

In [59]:
# Write the cleaned frame to disk without the row index.
testdf.to_csv('data_cleaned2.csv', index = False)