<a href="https://colab.research.google.com/github/kamalzada/Machine-Learning-Projects/blob/master/stance_vs_tko_ratio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [763]:
# For web scraping
import requests
from bs4 import BeautifulSoup

# For data cleaning/feature engineering
import numpy as np
import pandas as pd

# For visualization
import seaborn as sns
import matplotlib.pyplot as plt

#For statistical tests
import pingouin as pg

#Miscellaneous
import string

In [780]:
# Load the per-fighter stance table; a later cell attaches the computed
# 'tko_win_ratio' column to this frame.
# NOTE(review): that later assignment pairs rows with the scraped fighters by
# index label, so the rows here are presumably in the same order as
# `fighter_links` — confirm against how fighter_stance.csv was produced.
df = pd.read_csv('fighter_stance.csv')

In [2]:
# Collect, for every initial letter a-z, the fighter anchors listed on the
# ufcstats "all fighters" index page for that letter.
letters = string.ascii_lowercase
fighter_ids = {}

# A single Session reuses the connection across the 26 requests.
with requests.Session() as session:
    for letter in letters:
        # Without a timeout a stalled connection would block the cell forever.
        response = session.get(
            f'http://ufcstats.com/statistics/fighters?char={letter}&page=all',
            timeout=30,
        )
        # Fail loudly on an HTTP error instead of parsing an error page and
        # silently recording zero fighters for this letter.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        # Keep the raw <a> tags; their 'href' is extracted in a later cell.
        fighter_ids[letter] = soup.find_all('a', attrs = {'class': "b-link b-link_style_black"})

In [3]:
# Flatten the per-letter anchor lists into the plain URLs that will be
# requested, walking the letters in order and keeping each page's own order.
fighter_links = [
    anchor['href']
    for initial in letters
    for anchor in fighter_ids[initial]
]

In [4]:
# Drop repeated URLs while keeping the first occurrence of each, in order.
fighter_links = list(dict.fromkeys(fighter_links))

In [5]:
# Number of fighter profile URLs left after de-duplication.
len(fighter_links)

3741

In [6]:
%%time
fighter_win = {}

for link in fighter_links:  
  pages = requests.get(link)
  soup = BeautifulSoup(pages.text, 'lxml')   
  fighter_win[link] = soup.find_all('p', attrs = {'class': "b-fight-details__table-text"}) 

CPU times: user 1min 30s, sys: 3.48 s, total: 1min 33s
Wall time: 19min 32s


In [781]:
# Build, per fighter (same order as `fighter_links`), the list of fight
# outcomes read from the scraped table cells.
#
# The offsets encode the position of the outcome cell inside the flat list of
# scraped <p> tags: one outcome every CELLS_PER_FIGHT tags, starting at tag 0,
# or at tag UPCOMING_OFFSET when the first tag reads 'next'.
# NOTE(review): presumably 'next' marks a scheduled bout whose row has only 6
# cells — confirm against the page markup.
CELLS_PER_FIGHT = 17
UPCOMING_OFFSET = 6

# One result list per link; sized by the sequence that is actually enumerated.
win_loss = [[] for _ in range(len(fighter_links))]

for key, value in enumerate(fighter_links):
  try:
    cells = fighter_win[value]
    # The second line of the first cell's text tells the two layouts apart.
    first_row = UPCOMING_OFFSET if cells[0].text.split('\n')[1] == 'next' else 0
    for i in range(first_row, len(cells), CELLS_PER_FIGHT):
      win_loss[key].append(cells[i].text.split('\n')[1])
  except (IndexError, KeyError):
    # No cells scraped for this fighter (or a cell without the expected second
    # line): keep whatever was parsed so far and append an empty marker.
    # Only the lookup failures are caught, so KeyboardInterrupt and genuine
    # bugs are no longer swallowed.
    win_loss[key].append('')

In [782]:
# Build, per fighter (same order as `fighter_links`), the list of fight-ending
# methods, position-aligned with the outcomes collected in `win_loss`.
#
# The method cell sits METHOD_CELL tags after the start of each fight row;
# rows repeat every CELLS_PER_FIGHT tags, and the first row starts
# UPCOMING_OFFSET tags later when the first tag reads 'next'
# (13 and 13 + 6 = 19 respectively).
CELLS_PER_FIGHT = 17
METHOD_CELL = 13
UPCOMING_OFFSET = 6

# One result list per link; sized by the sequence that is actually enumerated.
method = [[] for _ in range(len(fighter_links))]

for key, value in enumerate(fighter_links):
  try:
    cells = fighter_win[value]
    has_upcoming = cells[0].text.split('\n')[1] == 'next'
  except (IndexError, KeyError):
    # Nothing usable was scraped for this fighter.
    method[key].append('No info')
    continue

  first_method = METHOD_CELL + (UPCOMING_OFFSET if has_upcoming else 0)
  for i in range(first_method, len(cells), CELLS_PER_FIGHT):
    try:
      # Fifth line of the cell text, minus its first 10 characters.
      # NOTE(review): presumably those 10 characters are indentation — confirm.
      method[key].append(cells[i].text.split('\n')[4][10:])
    except IndexError:
      # Per-row fallback in BOTH layouts, so one malformed row cannot shorten
      # the list and shift later methods against their outcomes.
      method[key].append('No info')

In [783]:
# For every fighter keep only the methods of the fights recorded as 'win',
# pairing outcome and method by their position in the two per-fighter lists.
methods = [
  [method[i][pos] for pos, outcome in enumerate(win_loss[i]) if outcome == 'win']
  for i in range(len(method))
]

In [784]:
# Fighters with fewer than 5 recorded wins are excluded from the study: their
# list of win methods is replaced by the text marker 'Insufficient'.
for idx, won_by in enumerate(methods):
  if len(won_by) < 5:
    methods[idx] = 'Insufficient'

In [785]:
# Per fighter, the subset of win methods that are exactly 'KO/TKO'.
# Entries already replaced by the 'Insufficient' marker are strings; iterating
# one yields single characters, none of which match, so the result is [].
tko = [
  [won_by for won_by in fighter_methods if won_by == 'KO/TKO']
  for fighter_methods in methods
]

In [786]:
# Finally, collect the KO/TKO-per-win ratio of every fighter into one list,
# in the same order as `methods`. Fighters without enough wins get the text
# marker 'Insufficient wins' instead of a number.
tko_ratio = []

for f in range(len(methods)):
  wins = methods[f]
  # An empty win list is handled explicitly (it would divide by zero) rather
  # than through a bare `except`, which also hid unrelated errors such as a
  # stale `tko` list of the wrong length.
  if wins == 'Insufficient' or len(wins) == 0:
    tko_ratio.append('Insufficient wins')
  else:
    tko_ratio.append(round(len(tko[f]) / len(wins), 2))

In [787]:
#Concatenating it to our original data
df['tko_win_ratio'] = pd.Series(tko_ratio).to_frame()

In [790]:
# Unfortunately, most of the data has to be dropped: most fighters are
# preliminary-card fighters, or at least not in the top 20, which means they
# may have numerous fights overall but very few in the UFC itself. The purpose
# of this study is to look at fighters with at least 5 wins in the UFC.
insufficient = df['tko_win_ratio'] == 'Insufficient wins'

# Total number of fighters, then how many of them are about to be dropped.
display(df.shape[0], int(insufficient.sum()))

# Once the text marker is gone the column holds only numbers, so convert it
# from object dtype to a numeric dtype for the statistics computed later.
df = df.loc[~insufficient].assign(
    tko_win_ratio=lambda kept: pd.to_numeric(kept['tko_win_ratio'])
)

3741

3109

In [792]:
# Preview the last rows of the table to check the new 'tko_win_ratio' column.
df.tail()

Unnamed: 0,name,last_name,stance,wins,losses,SLpM,fights,tko_win_ratio
3697,Hidehiko,Yoshida,Orthodox,6,8,1.05,14,0.0
3707,Sodiq,Yusuff,Orthodox,11,2,5.43,13,0.4
3714,Elizeu,Zaleski dos Santos,Orthodox,23,7,4.65,30,0.33
3717,Marius,Zaromskis,Orthodox,20,9,5.02,29,0.67
3726,Zhang,Weili,Switch,21,3,5.47,24,0.2


In [793]:
# Save the filtered table. The CSV is written before anything Colab-specific
# is touched, so the file exists even when the notebook runs elsewhere.
df.to_csv('fighter_stance_tko.csv')

try:
  from google.colab import files
except ImportError:
  # Not running on Google Colab: the file stays in the working directory and
  # there is no browser download to trigger.
  files = None

if files is not None:
  files.download('fighter_stance_tko.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [796]:
# A first look suggests that Switch fighters once again have an advantage.
# This is not conclusive on its own; the statistical tests are conducted in
# the next notebook.

# The ratio column was assembled from a list mixing floats with a text marker,
# which leaves it as object dtype. Coerce it to numeric before aggregating;
# any leftover non-numeric value raises here instead of skewing the summary.
(
    df.assign(tko_win_ratio=pd.to_numeric(df['tko_win_ratio']))
      .groupby('stance')['tko_win_ratio']
      .agg(['mean', 'count', 'median', 'std'])
)

Unnamed: 0_level_0,mean,count,median,std
stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Open Stance,0.52,2,0.52,0.268701
Orthodox,0.341983,479,0.33,0.239193
Southpaw,0.325354,127,0.31,0.254495
Switch,0.385833,24,0.415,0.269507
