# Notebook for scraping UFC-stats pages

In [474]:
def display_img(src, description=None):
    """Render an image in the notebook output, optionally preceded by a caption.

    Args:
        src: location of the image; it is passed to ``Image(url=...)``, so the
            image is referenced by the output rather than embedded in it.
        description: optional caption printed above the image. Nothing is
            printed when it is None.
    """
    # `display` is imported explicitly instead of relying on the name IPython
    # injects into builtins, so the helper also works when imported elsewhere.
    from IPython.display import Image, display

    if description is not None:
        print(description)
    display(Image(url=src))

In [475]:
# Show the screenshots of the start, second and last page URLs, in that order.
img_name_prefix = ["start", "second", "last"]
for prefix in img_name_prefix:
    url_img_caption = f"{prefix.capitalize()} url:"
    display_img(src=f"img/page urls/{prefix}_url.png", description=url_img_caption)

Start url:


Second url:


Last url:


## As we can see above, the pattern of the UFC-stats URLs is clear:
 - The first page is the only one without a page suffix:
      - Page &emsp;1: '.../completed'
 - All remaining pages share the same suffix pattern: 
     - Page &emsp;2: '.../completed**?page=2**'
     - Page &emsp;3: '.../completed**?page=3**'
     - ...
     - Page &ensp;24: '.../completed**?page=24**'

In [476]:
# Show the screenshot of the events listing ("front") page.
front_page_img_name = "front page"
front_page_img_src = f"img/front page/{front_page_img_name}.png"
display_img(src=front_page_img_src, description="Front page:")

Front page:


In [477]:
def get_ufc_front_pages_urls(n_pages=24):
    """Build the URLs of the UFC-stats "completed events" listing pages.

    The first page has no query string; every following page appends
    ``?page=<number>`` to the same base URL.

    Args:
        n_pages: how many listing pages to generate URLs for. Defaults to 24,
            the page count this notebook was written against.

    Returns:
        list[str]: the URLs in page order (page 1 first). Empty when
        ``n_pages`` is smaller than 1.
    """
    base_url = "http://ufcstats.com/statistics/events/completed"

    # Page 1 is the bare base URL; only pages 2..n_pages carry a suffix.
    urls = [base_url] if n_pages >= 1 else []
    urls.extend(f"{base_url}?page={page}" for page in range(2, n_pages + 1))
    return urls

In [478]:
# Build the listing-page URLs once; the trailing ';' on the last line
# suppresses the cell's output.
front_pages_urls = get_ufc_front_pages_urls()
front_pages_urls;

## Let's extract the links to all events

In [479]:
# Show the events-page screenshot followed by the close-up of an event link.
event_img_names = ["events page", "event link"]

for img_name in event_img_names:
    event_img_caption = f"{img_name.capitalize()}:"
    display_img(src=f"img/events/{img_name}.png", description=event_img_caption)

Events page:


Event link:


In [481]:
# gets all events links from particular page
def get_events_links(page_url, timeout=30):
    """Collect the event links found on one listing page.

    Args:
        page_url: URL of the listing page to download.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: the ``href`` of every ``<a>`` tag whose class attribute is
        ``"b-link b-link_style_black"``, in document order.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # `requests` must be imported here: the import inside
    # get_ufc_front_pages_urls is local to that function and not visible in this scope.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(page_url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty link list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    tags_with_events_links = soup.find_all('a', class_="b-link b-link_style_black")
    return [tag['href'] for tag in tags_with_events_links]

In [484]:
# One list of event links per listing page, i.e. a list of lists in page order.
events_links = list(map(get_events_links, front_pages_urls))

73

In [460]:
import numpy as np

# `events_links` holds one list of links per listing page; join them into a
# single 1-D array. (The previous name `link_lists` is not defined by any cell,
# so the cell failed on a fresh kernel.)
flattened_events_links = np.concatenate(events_links, axis=0, dtype="object")
flattened_events_links.shape

(599,)

## We expect the flattened list to contain 24 * 25 - 1 = 599 links
- 24 - number of pages
- 25 - number of links per page
- 1 &nbsp; - the upcoming event on October 10th, which is subtracted from the total

In [356]:
# Sanity check on the flattened array built above (`tbl_links` is not defined
# by any cell). NOTE(review): the expected count is tied to the site's state
# when the notebook was written — update it when new events are published.
expected_links_count = 24 * 25 - 1
assert len(flattened_events_links) == expected_links_count, (
    f"expected {expected_links_count} event links, got {len(flattened_events_links)}"
)

# Every event link points to a table like this:

In [None]:
# TODO(review): `src` is an empty string and the cell has never been executed —
# this looks like a placeholder; pass the path of the event-table screenshot.
display_img('')

## Now, let's get all tables and save them into a separate .csv file for each page

In [125]:
import pandas as pd

In [383]:
def file_exists(file_path):
    """Tell whether `file_path` names an existing regular file.

    Directories and missing paths both yield False.
    """
    from os.path import isfile

    return isfile(file_path)

def merge_page_tables(links):
    """Download the first HTML table behind every link and stack them vertically.

    Args:
        links: iterable of locations accepted by ``pd.read_html`` (here: the
            event-page URLs of one listing page).

    Returns:
        pd.DataFrame: the tables concatenated row-wise in the order of
        ``links``. Each table keeps its own row index, so index labels repeat.
        An empty DataFrame is returned when ``links`` is empty.
    """
    # Collect first, concatenate once: concatenating inside the loop copies the
    # accumulated frame on every iteration.
    tables = [pd.read_html(table_link)[0] for table_link in links]
    if not tables:
        # pd.concat raises on an empty list; keep the "no links -> empty frame" result.
        return pd.DataFrame()
    return pd.concat(tables, axis=0)
    
def table_to_csv(df, file_path):
    """Write `df` to `file_path` as comma-separated text without the row index.

    Args:
        df: the DataFrame to save.
        file_path: destination passed on to ``DataFrame.to_csv``. When it is a
            filesystem path, its parent directory is created if missing, so
            saving into a not-yet-existing folder such as ``tables/`` works.
    """
    import os

    # Only real paths have a parent directory to create; anything else
    # (e.g. an open file object) is handed to pandas untouched.
    if isinstance(file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(file_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(file_path, sep=',', index=False)

In [284]:
# NOTE(review): `tmp_df` is not used by this cell; kept only in case a later
# cell still refers to it — delete it if nothing does.
tmp_df = pd.DataFrame()

# Number of listing pages to scrape and save. 1 reproduces the original
# single-page trial run; set it to None to process every page.
MAX_PAGES_TO_SAVE = 1

# `events_links` is the per-page list of event links built earlier (the name
# `link_lists` used here before is not defined by any cell).
for page_num, links in enumerate(events_links[:MAX_PAGES_TO_SAVE], start=1):
    print(page_num)
    df = merge_page_tables(links)
    table_to_csv(df, f"tables/{page_num}.csv")

1


In [278]:
# Renumber the rows 0..n-1. merge_page_tables stacks the per-event tables
# without resetting the index, so row labels repeat otherwise.
# The former `df.to_csv(index=None, header=True, sep=',')` call was removed:
# without a path it only builds a CSV string, and that string was discarded.
df = df.reset_index(drop=True)
df

Unnamed: 0,W/L,Fighter,Kd,Str,Td,Sub,Weight class,Method,Round,Time
0,win,Cory Sandhagen Song Yadong,0 0,94 54,1 2,0 0,Bantamweight,KO/TKO,4,5:00
1,win,Gregory Rodrigues Chidi Njokuani,1 0,48 44,1 0,0 0,Middleweight,KO/TKO Punches,2,1:27
2,win,Andre Fili Bill Algeo,0 0,59 81,1 0,2 0,Featherweight,S-DEC,3,5:00
3,win,Joe Pyfer Alen Amedovski,1 0,14 11,0 0,0 0,Middleweight,KO/TKO Punch,1,3:55
4,win,Rodrigo Nascimento Tanner Boser,0 0,36 62,3 0,1 0,Heavyweight,S-DEC,3,5:00
5,win,Anthony Hernandez Marc-Andre Barriault,0 0,39 25,8 0,2 0,Middleweight,SUB Arm Triangle,3,1:53
6,win,Damon Jackson Pat Sabatini,0 0,11 1,1 0,0 0,Featherweight,KO/TKO Punches,1,1:09
7,win,Trevin Giles Louis Cosce,0 0,25 10,2 1,0 1,Welterweight,U-DEC,3,5:00
8,win,Loma Lookboonmee Denise Gomes,0 0,48 19,4 0,0 3,Women's Strawweight,U-DEC,3,5:00
9,win,Trey Ogden Daniel Zellhuber,0 0,71 52,1 0,0 0,Lightweight,U-DEC,3,5:00


In [262]:
# NOTE(review): `df_page` is not defined in any cell visible in this notebook —
# confirm which frame is meant (possibly the merged per-page `df`) before re-running.
df_page.iloc[32]

W/L                                          win
Fighter         Abus Magomedov  Dustin Stoltzfus
Kd                                          1  0
Str                                         7  0
Td                                          0  0
Sub                                         0  0
Weight class                        Middleweight
Method                           KO/TKO  Punches
Round                                          1
Time                                        0:19
Name: 6, dtype: object