# Notebook for scraping UFC-stats pages

In [537]:
def display_img(src, description=None):
    """Show an image in the notebook output, optionally preceded by a caption.

    Parameters
    ----------
    src : str
        Image location; it is passed as the ``url`` argument of
        ``IPython.display.Image``.
    description : str, optional
        Caption printed before the image. Only ``None`` suppresses the
        caption; an empty string still prints an empty line.
    """
    # Import ``display`` explicitly rather than relying on the name IPython
    # injects into interactive namespaces. The previously imported ``HTML``
    # class was never used, so it is no longer imported.
    from IPython.display import Image, display

    if description is not None:
        print(description)
    display(Image(url=src))

In [563]:
# Screenshots of the first, second and last listing-page URLs.
img_name_prefix = ["start", "second", "last"]
for prefix in img_name_prefix:
    url_img_path = f"img/front page urls/{prefix}_url.png"
    url_img_caption = f"{prefix.capitalize()} url:"
    display_img(src=url_img_path, description=url_img_caption)

Start url:


Second url:


Last url:


## As we can see above, the pattern of the UFC Stats URLs is clear:
 - The first page has a unique URL without any suffix:
      - Page &emsp;1: '.../completed'
 - The remaining URLs share a clear suffix pattern: 
     - Page &emsp;2: '.../completed**?page=2**'
     - Page &emsp;3: '.../completed**?page=3**'
     - ...
     - Page &ensp;24: '.../completed**?page=24**'

In [541]:
# Screenshot of the front page of the completed-events listing.
front_page_img_name = "front page"
front_page_img_path = f"img/front page/{front_page_img_name}.png"
display_img(src=front_page_img_path, description="Front page:")

Front page:


In [543]:
def get_ufc_front_pages_urls(last_page=24):
    """Build the URLs of the "completed events" listing pages.

    Page 1 has no query string; every later page ``n`` is addressed with the
    suffix ``?page=n``.

    Parameters
    ----------
    last_page : int, optional
        Number of the last listing page to include. Defaults to 24, the
        page count this helper previously had hard-coded.

    Returns
    -------
    list of str
        One URL per page, ordered from page 1 to ``last_page``.

    Raises
    ------
    ValueError
        If ``last_page`` is smaller than 1.
    """
    if last_page < 1:
        raise ValueError(f"last_page must be at least 1, got {last_page}")

    base_url = "http://ufcstats.com/statistics/events/completed"
    # The first page is the bare base URL; pages 2..last_page carry a suffix.
    urls = [base_url]
    urls.extend(
        f"{base_url}?page={page_number}"
        for page_number in range(2, last_page + 1)
    )
    return urls

In [545]:
# Build the list of listing-page URLs once; later cells iterate over it.
front_pages_urls = get_ufc_front_pages_urls()
# The trailing semicolon suppresses the notebook's display of the list.
front_pages_urls;

## Let's extract all links of events

In [547]:
# Screenshots of the events listing and of a single event link.
event_img_names = ["events page", "event link"]

for img_name in event_img_names:
    event_img_path = f"img/events/{img_name}.png"
    event_img_caption = f"{img_name.capitalize()}:"
    display_img(src=event_img_path, description=event_img_caption)

Events page:


Event link:


In [549]:
def get_events_links(page_url, timeout=30):
    """Collect the event links found on one listing page.

    Parameters
    ----------
    page_url : str
        URL of the listing page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        The ``href`` of every ``<a>`` tag whose class attribute is
        ``"b-link b-link_style_black"``, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Both libraries are imported here so the function does not depend on
    # names that happen to be left over in the kernel from other cells.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(page_url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # ``href=True`` keeps only anchors that actually carry a link.
    event_anchors = soup.find_all('a', class_="b-link b-link_style_black", href=True)
    return [anchor['href'] for anchor in event_anchors]

In [551]:
# Download every listing page once; entry i holds the event links scraped
# from front_pages_urls[i].
page_event_links_list = list(map(get_events_links, front_pages_urls))

In [552]:
# Disabled experiment kept as a bare string literal: nothing inside the quotes
# is executed, and the trailing semicolon hides the string from the cell output.
# NOTE(review): the snippet refers to `link_lists`, which no visible cell
# defines — confirm the intended variable before re-enabling it.
'''
import numpy as np

flattened_events_links = np.concatenate(link_lists, axis=0, dtype="object")
flattened_events_links.shape
''';

## We expect to collect 24 * 25 - 1 event links in total
- 24 - number of pages
- 25 - number of links per page
- 1 &nbsp; - subtracted for the upcoming event on October 10th

In [564]:
# Sanity check on the scrape: 24 pages of 25 links each, minus one link for
# the upcoming event.
expected_events_count = 24 * 25 - 1
found_events_count = sum(len(links) for links in page_event_links_list)
# Report both numbers on failure so a changed listing is easy to diagnose.
assert found_events_count == expected_events_count, (
    f"expected {expected_events_count} event links, found {found_events_count}"
)

# Every event link points to a page with a table like this:

In [738]:
# Screenshot of the fights table shown on an event page.
display_img(
    src='img/events/event description.png',
    description='Fights of event:',
)

Fights of event:


# Clicking on any row of that table leads to a table with the fight details:

In [739]:
# Screenshot of the fight-details table.
display_img(
    src='img/fights/fight info.png',
    description='Fight details:',
)

Fight details:


# Let's extract all links to fight tables

In [742]:
# A fight row as shown on the event page, followed by the link it carries.
# The first caption is an empty string, so only an empty line is printed
# before that image.
display_img(src='img/events/fight row.png', description='')
display_img(src='img/events/fight link.png', description='Link to fight table:')




Link to fight table:


In [833]:
def get_links_to_tables(event_url, timeout=30):
    """Collect the fight-detail links of every fight row on an event page.

    Parameters
    ----------
    event_url : str
        URL of the event page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        One link per matching ``<tr>`` row, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Imported locally so the function works on a fresh kernel; previously it
    # relied on `requests` and `BeautifulSoup` already being global names.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(event_url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # `find_all` is the current spelling of the deprecated `findAll` alias.
    fight_rows = soup.find_all(
        'tr',
        class_="b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click",
    )
    # The link is embedded in each row's `onclick` attribute; dropping the
    # first 7 and the last 2 characters leaves the bare URL.
    # NOTE(review): presumably the attribute looks like doNav('<url>') —
    # confirm against the live markup.
    return [row['onclick'][7:-2] for row in fight_rows]

In [869]:
def file_exists(file_path):
    """Tell whether ``file_path`` refers to an existing regular file."""
    from os import path as os_path

    is_regular_file = os_path.isfile(file_path)
    return is_regular_file

In [878]:
# print(get_links_to_tables(page_event_links_list[0][0]))

import os

# Preview cell: store the fight-table links of the first event only.
# The preview is written to its own file on purpose. The per-page loop in a
# later cell treats an existing "tables/urls/page_1" as "page already
# scraped", so a one-event file under that name would make it skip every
# other event of page 1.
os.makedirs("tables/urls", exist_ok=True)  # open() fails if the folder is missing
with open("tables/urls/first_event_preview", 'w') as f:
    # Plain loop instead of a list comprehension used only for side effects.
    for fight_table_url in get_links_to_tables(page_event_links_list[0][0]):
        f.write(f'{fight_table_url}\n')
        print('writing:', f'{fight_table_url}\n')

writing: http://ufcstats.com/fight-details/df370b97ea49e6b4

writing: http://ufcstats.com/fight-details/bd06f1c2d7e6c4a0

writing: http://ufcstats.com/fight-details/adb78db5ba8ccc0f

writing: http://ufcstats.com/fight-details/160c17b8fcb0e2a1

writing: http://ufcstats.com/fight-details/3cbd3565dd631cc4

writing: http://ufcstats.com/fight-details/ce5ebaf8caf8c4a7

writing: http://ufcstats.com/fight-details/f521ecde1e96a1bd

writing: http://ufcstats.com/fight-details/46b9acec12a35d37

writing: http://ufcstats.com/fight-details/ec0740be19b2ef90

writing: http://ufcstats.com/fight-details/35d06627d91b840d

writing: http://ufcstats.com/fight-details/d9f5d05b07bae09e

writing: http://ufcstats.com/fight-details/be73f75b52b06f14

writing: http://ufcstats.com/fight-details/8a044e5400b4318f



In [885]:
import os

# One output file per listing page, each holding one fight-table URL per line.
os.makedirs("tables/urls", exist_ok=True)  # open() fails if the folder is missing

for page_idx, event_urls in enumerate(page_event_links_list):
    # Note: the progress messages use the 0-based index, the file names are 1-based.
    print("Page" + str(page_idx) + '------------------------------------------------------------------------')
    file_path = f"tables/urls/page_{page_idx+1}"
    # If we already have the table urls for this page, we don't need to send
    # the requests again.
    if not file_exists(file_path):
        # Write to a temporary name and rename only after the whole page has
        # been scraped. An interrupted run then leaves no file under the final
        # name, so the page is retried instead of being skipped as "done".
        partial_path = file_path + ".part"
        with open(partial_path, 'w') as f:
            for event_url in event_urls:
                fight_table_urls = get_links_to_tables(event_url)
                for fight_table_url in fight_table_urls:
                    f.write(f'{fight_table_url}\n')
        os.replace(partial_path, file_path)
    print(f'Page_{page_idx} done!')

Page0------------------------------------------------------------------------
Page_0 done!
Page1------------------------------------------------------------------------
Page_1 done!
Page2------------------------------------------------------------------------
Page_2 done!
Page3------------------------------------------------------------------------
Page_3 done!
Page4------------------------------------------------------------------------
Page_4 done!
Page5------------------------------------------------------------------------
Page_5 done!
Page6------------------------------------------------------------------------
Page_6 done!
Page7------------------------------------------------------------------------
Page_7 done!
Page8------------------------------------------------------------------------
Page_8 done!
Page9------------------------------------------------------------------------
Page_9 done!
Page10------------------------------------------------------------------------
Page_10 don

In [841]:
# Spot check: fight-table links of the event at index 23 of the first page.
first_page_event_links = page_event_links_list[0]
print(get_links_to_tables(first_page_event_links[23]))

['http://ufcstats.com/fight-details/06641a8c62e45661', 'http://ufcstats.com/fight-details/ce3d94186567a684', 'http://ufcstats.com/fight-details/001441f70c293931', 'http://ufcstats.com/fight-details/bcd1e1d71a172d53', 'http://ufcstats.com/fight-details/5a878a6bebc973d7', 'http://ufcstats.com/fight-details/a200b5dcbdd2506e', 'http://ufcstats.com/fight-details/6e705b2365bc03fa', 'http://ufcstats.com/fight-details/ae989e21c3839b49', 'http://ufcstats.com/fight-details/766b4f5e77ab0860', 'http://ufcstats.com/fight-details/53d0df57a917c6a0', 'http://ufcstats.com/fight-details/5e378c52f26935d6', 'http://ufcstats.com/fight-details/4c7e55061dc3bf89']


In [834]:
# Define the URL here so the cell also runs on a fresh kernel; no other
# visible cell assigns `first_event_url`. The first event of the first page
# is used because its links match the output recorded for this cell.
first_event_url = page_event_links_list[0][0]
first_event_fight_tables = get_links_to_tables(first_event_url)

In [835]:
# Bare expression: the notebook displays the list of fight-table links.
first_event_fight_tables

['http://ufcstats.com/fight-details/df370b97ea49e6b4',
 'http://ufcstats.com/fight-details/bd06f1c2d7e6c4a0',
 'http://ufcstats.com/fight-details/adb78db5ba8ccc0f',
 'http://ufcstats.com/fight-details/160c17b8fcb0e2a1',
 'http://ufcstats.com/fight-details/3cbd3565dd631cc4',
 'http://ufcstats.com/fight-details/ce5ebaf8caf8c4a7',
 'http://ufcstats.com/fight-details/f521ecde1e96a1bd',
 'http://ufcstats.com/fight-details/46b9acec12a35d37',
 'http://ufcstats.com/fight-details/ec0740be19b2ef90',
 'http://ufcstats.com/fight-details/35d06627d91b840d',
 'http://ufcstats.com/fight-details/d9f5d05b07bae09e',
 'http://ufcstats.com/fight-details/be73f75b52b06f14',
 'http://ufcstats.com/fight-details/8a044e5400b4318f']

## Now, let's get all fight tables of all events for each page:
For each page:
- fix page and dataframe
- go through all of its event links
- for each event link, proceed to every fight detail link
- get table
- add record into dataframe 

In [598]:
import pandas as pd