In [3]:
from bs4 import BeautifulSoup, SoupStrainer
import datetime
import numpy as np
import pandas as pd
import pickle
import re
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import string

In [70]:
def scrape_fightmetric(pages=26):
    """Download the Fightmetric fighter listing pages, one per letter.

    Parameters
    ----------
    pages : int, default 26
        How many letters of the alphabet to request, starting at 'a'.
        Values above 26 are clamped to the 26 available letters and values
        below 1 request nothing.

    Returns
    -------
    list of str
        HTML text of every listing page that answered with status 200, in
        alphabetical order of the letter requested. Pages that returned any
        other status are reported on stdout and skipped.
    """
    letters = string.ascii_lowercase
    content = []
    # Slicing (rather than indexing with range(pages)) cannot run past the
    # end of the alphabet; max() keeps a negative count from slicing from
    # the right-hand side.
    for char in letters[:max(pages, 0)]:
        url = ('http://www.fightmetric.com/statistics/fighters'
               f'?char={char}&page=all')
        r = requests.get(url)
        if r.status_code == 200:
            # Progress indicator: one letter per successfully fetched page.
            print(char, end='')
            content.append(r.text)
        else:
            print(f'Failed to scrape {url}\nstatus: {r.status_code}')
    return content

In [84]:
def extract_urls(pages):
    """Extract urls to fighter pages from a list of listing-page HTML strings.

    Parameters
    ----------
    pages : iterable of str
        HTML text of the fighter listing pages.

    Returns
    -------
    list of str
        The distinct ``href`` values of all ``<a>`` tags carrying the class
        ``b-link b-link_style_black``, sorted so that the result is the same
        on every run.
    """
    # Only build the anchors we care about; everything else is skipped by
    # the parser.
    ss = SoupStrainer('a', {'class': 'b-link b-link_style_black'})
    fighter_urls = set()
    for page in pages:
        soup = BeautifulSoup(page, 'lxml', parse_only=ss)
        for node in soup:
            try:
                fighter_urls.add(node['href'])
            except (KeyError, TypeError):
                # KeyError: an anchor without an href attribute.
                # TypeError: a non-tag node (plain string) that cannot be
                # indexed by attribute name.
                continue
    return sorted(fighter_urls)

In [93]:
def attr_parser(soup, dtype, attr, element='li'):
    """Use in scrape_fighter_page func to retrieve one field from the page.

    Finds the first ``element`` tag whose lower-cased text contains ``attr``
    and converts the tag's last child (the value following the label).

    Parameters
    ----------
    soup : parsed page
        Object exposing ``find`` (a BeautifulSoup document in this notebook).
    dtype : callable or 'date'
        Converter applied to the cleaned text (e.g. ``int``, ``float``,
        ``str``), or the string ``'date'`` to parse a ``'%b %d, %Y'`` date.
    attr : str
        Lower-case label to look for in the tag text.
    element : str, default 'li'
        Name of the tag to search.

    Returns
    -------
    object or None
        ``None`` when no tag matches or the tag text contains ``'--'``.
        A ``datetime.datetime`` for ``'date'``, a float fraction (value / 100)
        when the text contains ``'%'``, otherwise ``dtype(text)``. If the
        conversion raises ``ValueError`` the cleaned text is returned
        unchanged so the caller can post-process it.
    """
    current_tag = soup.find(lambda tag: tag.name == element
                            and attr in tag.text.lower())
    if not current_tag or '--' in current_tag.text:
        return None

    # The value is the last child of the tag; drop surrounding whitespace,
    # a trailing period and the inch mark.
    cleaned_text = list(current_tag)[-1].strip().strip('.').strip('\"')
    try:
        if dtype == 'date':
            return datetime.datetime.strptime(cleaned_text, '%b %d, %Y')
        if '%' in cleaned_text:
            # Percentages are always reported as a fraction; casting the
            # fraction back through int would truncate it to 0.
            return float(cleaned_text.strip('%')) / 100
        return dtype(cleaned_text)
    except ValueError:
        # Same fallback for every conversion, including dates and
        # percentages: hand back the raw text instead of aborting.
        return cleaned_text

In [124]:
def scrape_fighter_page(url, verbose=False):
    """Scrape a single fighter page and return that fighter's stats.

    Parameters
    ----------
    url : str
        Address of one fighter page (one item of the list returned by
        extract_urls()).
    verbose : bool, default False
        When True, print a line once the page has been scraped.

    Returns
    -------
    dict
        Keys, in order: ``name``, ``fname``, ``lname``, ``nick``, ``W``,
        ``L``, ``D``, optionally ``NC`` (only when the record lists a fourth
        number), then one key per entry of ``fields``. ``name``/``fname``/
        ``lname`` are None when the title is missing, ``nick`` is '' when
        there is no nickname, and ``W``/``L``/``D`` are None when the record
        is missing or does not hold three or four numbers. ``height`` is
        converted to inches and ``weight`` to an int when they come back as
        text.
    """
    fields = [(int, 'height'), (int, 'weight'), (int, 'reach'),
              (str, 'stance'), ('date', 'dob'), (float, 'slpm'),
              (float, 'str. acc'), (float, 'sapm'), (float, 'str. def'),
              (float, 'td avg'), (float, 'td acc'), (float, 'td def'),
              (float, 'sub. avg')]

    retry = Retry(total=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    # The session is closed on exit so that a connection pool is not left
    # open for every fighter page scraped.
    with requests.Session() as sess:
        sess.mount('http://', adapter)
        sess.mount('https://', adapter)
        content = sess.get(url).text
    soup = BeautifulSoup(content, 'lxml')
    stats = {}

    # Parse name(s).
    try:
        stats['name'] = soup.select('.b-content__title-highlight')[0]\
                            .get_text().strip()
    except Exception as e:
        print(e)
        # Keep the keys present so later lookups of stats['name'] do not
        # raise KeyError.
        stats['name'] = stats['fname'] = stats['lname'] = None
    else:
        # First token is the first name; whatever follows is the last name
        # (empty string for single-word names).
        name_list = stats['name'].split(' ')
        stats['fname'] = name_list[0]
        stats['lname'] = ' '.join(name_list[1:])

    # Parse nickname.
    try:
        stats['nick'] = soup.select('.b-content__Nickname')[0].get_text().strip()
    except Exception:
        stats['nick'] = ''

    # Parse record.
    try:
        record = re.findall(r'\d+', soup.select('.b-content__title-record')[0]
                            .get_text())
    except Exception:
        record = []
    stats['W'], stats['L'], stats['D'] = None, None, None
    if len(record) in (3, 4):
        # zip stops at the shorter input, so 'NC' is only set when the
        # record carries a fourth number.
        stats.update(zip(('W', 'L', 'D', 'NC'), map(int, record)))

    # Parse li fields.
    for dtype, attr in fields:
        stats[attr] = attr_parser(soup, dtype, attr)

    # Clean up numeric fields. attr_parser hands back the raw text when the
    # int conversion fails, so only text values need further work.
    height = stats['height']
    if height and isinstance(height, str):
        # Text is feet and inches separated by an apostrophe.
        feet_inches = [int(part.strip('\"')) for part in height.split('\'')]
        stats['height'] = feet_inches[0] * 12 + feet_inches[1]
    weight = stats['weight']
    if weight and isinstance(weight, str):
        # Leading number followed by a unit.
        stats['weight'] = int(weight.split(' ')[0])
    if verbose:
        print(f"{stats['name']} - scraped")
    return stats

In [125]:
def scrape_all(f_urls, limit=None, **kwargs):
    """Return list of dicts of fighter stats, one per url.

    Parameters
    ----------
    f_urls : list of str
        Fighter page urls.
    limit : int or None, default None
        Only the first ``limit`` urls are scraped; None scrapes them all.
        Set to a small integer when testing.
    **kwargs
        Forwarded unchanged to scrape_fighter_page().

    Returns
    -------
    list of dict
        The result of scrape_fighter_page() for each url, in input order.
    """
    output = []
    for i, url in enumerate(f_urls[:limit]):
        stats = scrape_fighter_page(url, **kwargs)
        output.append(stats)
        if i % 100 == 0:
            # A page whose name could not be parsed may lack the 'name' key;
            # fall back to the url so a progress message can never abort the
            # run and discard everything collected so far.
            print(f"Round {i}: scraped {stats.get('name', url)}")
    return output

In [13]:
# Paths of the files this notebook reads and writes, relative to the
# working directory.
scrape_file = 'scrape.pkl'  # pickled scrape results, reloaded further down
df_file = 'scrape.csv'      # CSV export of the DataFrame built from them

In [5]:
# pages = scrape_fightmetric()

In [6]:
# f_urls = extract_urls(pages)

In [12]:
# Fighter data collected 4/3/2019.
# data = scrape_all(f_urls, None, verbose=False)

In [8]:
# with open(scrape_file, 'wb') as f:
#     pickle.dump(data, f)

In [9]:
# Reload the previously pickled scrape results instead of scraping again.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load a scrape_file that was produced by this notebook.
with open(scrape_file, 'rb') as f:
    data = pickle.load(f)

In [14]:
# Assumes `data` is the list of per-fighter dicts produced by scrape_all()
# -- verify against the pickle contents.
df = pd.DataFrame(data)
# index=False leaves the row index out of the exported CSV.
df.to_csv(df_file, index=False)