# DataScraping

In this notebook I scrape the data needed for these studies (the per-series episode and player tables) from onequestionshootout.xyz and save each set to a CSV file.

In [1]:
import time
from urllib.error import URLError

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Pause, in seconds, handed to time.sleep() between consecutive page requests.
# Used to be kind to the server owner and keep the requests at a human level.
TIME_DELAY = 20

In [3]:
def scrape_episodes(series_limit = 15):
    """
    Scrape the per-series episode tables and stack them into one dataframe.

    Requests http://onequestionshootout.xyz/episodes/series_<SERIES>.htm for
    SERIES = 1, 2, ..., series_limit, takes the second HTML table parsed from
    each page, adds a "Series" column holding the series number, and
    concatenates the per-series frames.

    Parameters
    ----------
    series_limit : int, default 15
        Highest series number to request (inclusive).

    Returns
    -------
    pandas.DataFrame
        The stacked per-series tables. An empty dataframe is returned when no
        series could be scraped (series_limit < 1, or the first page failed).

    Notes
    -----
    Scraping is best effort: it stops at the first page that cannot be
    fetched, and whatever was scraped before that point is still returned.
    """
    df_list = []

    for series in range(1, series_limit + 1):

        print("Scraping series: {}".format(series))
        # Wait between requests, but not before the very first one.
        if series > 1:
            time.sleep(TIME_DELAY)

        data_url = "http://onequestionshootout.xyz/episodes/series_{}.htm".format(series)
        try:
            tables = pd.read_html(data_url)
        except URLError as err:
            # HTTPError (e.g. a 404 for a series that does not exist) is a
            # subclass of URLError, so it ends the scrape here as well.
            print("Could not fetch series {} ({}); stopping.".format(series, err))
            break

        # tables[1] is the second table parsed from the page.
        df_series = tables[1]
        df_series["Series"] = series

        df_list.append(df_series)

    # pd.concat raises ValueError when given an empty list, so handle the
    # "nothing scraped" case explicitly.
    if not df_list:
        print("\nTotal Episodes Scraped: 0")
        return pd.DataFrame()

    df_episodes = pd.concat(df_list)
    print("\nTotal Episodes Scraped: {}".format(len(df_episodes)))
    return df_episodes

In [4]:
# Scrape the episode tables for series 1 through 15; the function waits
# TIME_DELAY seconds before every request after the first.
df_episodes = scrape_episodes(15)

Scraping series: 1
Scraping series: 2
Scraping series: 3
Scraping series: 4
Scraping series: 5
Scraping series: 6
Scraping series: 7
Scraping series: 8
Scraping series: 9
Scraping series: 10
Scraping series: 11
Scraping series: 12
Scraping series: 13
Scraping series: 14
Scraping series: 15

Total Episodes Scraped: 1883


In [5]:
# Save the scraped episodes to CSV; index=False leaves the dataframe index out of the file.
df_episodes.to_csv("../data/all_episodes.csv",index=False)

In [6]:
def scrape_players(series_limit = 15):
    """
    Scrape the per-series player tables and stack them into one dataframe.

    Requests http://onequestionshootout.xyz/players/series_<SERIES>.htm for
    SERIES = 1, 2, ..., series_limit, takes the second HTML table parsed from
    each page, adds a "Series" column holding the series number, and
    concatenates the per-series frames.

    Parameters
    ----------
    series_limit : int, default 15
        Highest series number to request (inclusive).

    Returns
    -------
    pandas.DataFrame
        The stacked per-series tables. An empty dataframe is returned when no
        series could be scraped (series_limit < 1, or the first page failed).

    Notes
    -----
    Scraping is best effort: it stops at the first page that cannot be
    fetched, and whatever was scraped before that point is still returned.
    """
    df_list = []

    for series in range(1, series_limit + 1):

        print("Scraping series: {}".format(series))
        # Wait between requests, but not before the very first one.
        if series > 1:
            time.sleep(TIME_DELAY)

        data_url = "http://onequestionshootout.xyz/players/series_{}.htm".format(series)
        try:
            tables = pd.read_html(data_url)
        except URLError as err:
            # HTTPError (e.g. a 404 for a series that does not exist) is a
            # subclass of URLError, so it ends the scrape here as well.
            print("Could not fetch series {} ({}); stopping.".format(series, err))
            break

        # tables[1] is the second table parsed from the page.
        df_series = tables[1]
        df_series["Series"] = series

        df_list.append(df_series)

    # pd.concat raises ValueError when given an empty list, so handle the
    # "nothing scraped" case explicitly.
    if not df_list:
        print("\nTotal Players Scraped: 0")
        return pd.DataFrame()

    df_players = pd.concat(df_list)
    print("\nTotal Players Scraped: {}".format(len(df_players)))
    return df_players

In [7]:
# Scrape the player tables for series 1 through 15; the function waits
# TIME_DELAY seconds before every request after the first.
df_players = scrape_players(15)

Scraping series: 1
Scraping series: 2
Scraping series: 3
Scraping series: 4
Scraping series: 5
Scraping series: 6
Scraping series: 7
Scraping series: 8
Scraping series: 9
Scraping series: 10
Scraping series: 11
Scraping series: 12
Scraping series: 13
Scraping series: 14
Scraping series: 15

Total Players Scraped: 7532


In [8]:
# Save the scraped players to CSV; index=False leaves the dataframe index out of the file.
df_players.to_csv("../data/all_players.csv",index=False)