# PandaSoup Scraper Tool

Parse data from websites into a pandas DataFrame for each page.

Start with a URL template, for example: "http://www.google.com?q1={}&q2={}&q3={}"

Each time we perform a scrape, we iterate over the list of params to plug into the template:
[
    ('hi', 'stuff', '123'),
    ('foo', '999', '2018/07/04'),
    ...
]

Each of these tuples creates a new URL, which we scrape with BeautifulSoup. Results are saved in the "raw_data" dictionary as values under its parameter tuple as the key.

Given an extract function (extract_func) to pull certain items out of each page of raw data to a dataframe, store in the list of dataframes. By default, the dict of pages is simply transformed to a dataframe with a 1-1 relationship, keeping the same key tuples.

User can give a "grouping strategy" to consolidate data based on parameters. 

Results can be saved to disk as a CSV file per DataFrame object.

Example: http://www.mywebsite.com?firstname={}&lastname={}&year={}&month={}&day={}, ["year", "month"]
* By default, data entities will be separated for each parameter combo: myfile_firstname_lastname_year_month_day.csv
* Using the strategy, they will be consolidated by year and month: myfile_year_month.csv

## Constructor args:
Pass a single dictionary. The following parameters are recognized.

    params_dict = {
        base : "http://www.mywebsite.com?q={}"
        param_names : [keyword1, keyword2, keyword3...]
        csv_base : "my/data/path/{}_{}_{}.csv"
        request_delay : 123
        extract_func: (lambda x: x)
        grouping_strategy : [keyword1, keyword2... ]
}


In [130]:
import math
import os
import pandas as pd
import re
import requests
import time

from __future__ import print_function
from bs4 import BeautifulSoup
from collections import defaultdict
from copy import deepcopy
# TODO: Add selenium functionality later

class PandaSoup:
    raw_data = defaultdict(list)
    request_delay = 5  # 5 seconds
    data = defaultdict(list)
    extract_func = (lambda x: x)
    grouping_strategy = None
    init_params = {}
    use_timestamped_csv = True
    
    def __init__(self, params_dict):
        self.base = params_dict.get('base', None)
        self.param_names = params_dict.get('param_names', None)
        self.grouping_strategy = params_dict.get('grouping_strategy', None)
        # If user specified a custom csv, don't use timestamp in file names
        if 'csv_base' in params_dict:
            self.use_timestamped_csv = False
            self.csv_base = params_dict['csv_base']
        elif 'grouping_strategy' in params_dict:
            self.csv_base = str.format("test/data/{}.csv", 
                                       "{}"+("_{}"*len(self.grouping_strategy)))
        else:
            self.csv_base = str.format("test/data/{}.csv", 
                                       "{}"+("_{}"*len(self.param_names)))
        self.request_delay = params_dict.get('request_delay', 5)
        self.extract_func = params_dict.get('extract_func', (lambda x: x))
        self.init_params = params_dict
        
    def debug_str(self, level=1):
        print("Raw data:", len(self.raw_data), "items")
        if level == 2:
            print("    keys:", self.raw_data.keys())
        print("Data:", len(self.data), "items")
        print("Default output path:", self.csv_base)
        if level == 2:
            print("    keys:", self.data.keys())
            print("Initialized with params:", self.init_params)
        
    def scrape(self, params=[], verbose=None):
        url = str.format(self.base, *params)
        if verbose:
            print("Reading", url)
        response = requests.get(url)
        return BeautifulSoup(response.text,"lxml")
        
    def scrape_all(self, params=[], reset=False, verbose=None):
        if reset:
            self.raw_data = {}
        for param_tuple in params:
            key = param_tuple
            if self.grouping_strategy:
                key = tuple(
                    [param_tuple[self.param_names.index(p)] for p in self.grouping_strategy]
                )
            if verbose:
                print("parameter values", param_tuple, "--> key", key)
            self.raw_data[key].append(self.scrape(param_tuple, verbose))
            time.sleep(self.request_delay)
        return self.raw_data
    
    def make_dataframes(self, reset=False, verbose=None):
        if reset:
            print("Clear dataframes")
            self.data = {}
        count = 0
        for key,value in self.raw_data.items():
            df = [self.extract_func(v) for v in value]
            self.data[key] = pd.concat(df)
            if verbose:
                print(key, "yielded", len(df), "rows of data")
        if verbose:
            print("Completed extracting data")
        return self.data
    
    def to_csv(self, verbose=None):
        files = []
        for key,value in self.data.items():
            file = self.csv_base 
            if self.use_timestamped_csv:
                file = str.format(self.csv_base, math.floor(time.time()*10), *key)
            else:
                file = str.format(self.csv_base, *key)
            if verbose:
                print("Writing to", file)
            value.to_csv(file)
            files.append(file)
        return files
        
    def reset_all(self,verbose=None):
        if verbose:
            print("Clearing data")
        self.raw_data = {}
        self.data = {}


## Testing

In [2]:
!mkdir test
!mkdir test/data

In [98]:
def f(soup):
    players = {}
    for n, link in enumerate(soup.find_all(href=re.compile('/stats/players'))):
        name = link.text
        if name != "players":
            row = link.findParent().findParent()
            players[n] = [td.text for td in row.find_all('td')]
            players[n][0] = name
    return pd.DataFrame(players).T

test_params = {
    'base' : "http://fftoday.com/stats/playerstats.php?Season={}&GameWeek={}&PosID={}",
    'param_names' : ["season", "week", "position"],
    'request_delay' : 3,
    'extract_func': f,
}

In [135]:
# Default behavior
twp = PandaSoup(test_params)
twp.scrape_all([
    (2015, 5, 10), 
    (2016, 6, 10)
], verbose=True)
twp.make_dataframes(verbose=True)
twp.to_csv(verbose=True)

parameter values (2015, 5, 10) --> key (2015, 5, 10)
Reading http://fftoday.com/stats/playerstats.php?Season=2015&GameWeek=5&PosID=10
parameter values (2016, 6, 10) --> key (2016, 6, 10)
Reading http://fftoday.com/stats/playerstats.php?Season=2016&GameWeek=6&PosID=10
(2015, 5, 10) yielded 2 rows of data
(2016, 6, 10) yielded 2 rows of data
Completed extracting data
Writing to test/data/15315214257_2015_5_10.csv
Writing to test/data/15315214257_2016_6_10.csv


['test/data/15315214257_2015_5_10.csv', 'test/data/15315214257_2016_6_10.csv']

In [121]:
# Grouping strategy
test_params_with_group = deepcopy(test_params)
test_params_with_group['grouping_strategy'] = ['season', 'week']

twp = PandaSoup(test_params_with_group)
twp.scrape_all([
    (2014, 3, 10), 
    (2015, 10, 10), 
], verbose=True)
twp.make_dataframes(verbose=True)
twp.to_csv(verbose=True)

parameter values (2014, 3, 10) --> key (2014, 3)
Reading http://fftoday.com/stats/playerstats.php?Season=2014&GameWeek=3&PosID=10
parameter values (2015, 10, 10) --> key (2015, 10)
Reading http://fftoday.com/stats/playerstats.php?Season=2015&GameWeek=10&PosID=10
(2014, 3) yielded 1 rows of data
(2015, 10) yielded 1 rows of data
Completed extracting data
Writing to test/data/15315209433_2014_3.csv
Writing to test/data/15315209433_2015_10.csv


['test/data/15315209433_2014_3.csv', 'test/data/15315209433_2015_10.csv']

In [133]:
# custom CSV
test_params_with_csv = deepcopy(test_params)
test_params_with_csv['csv_base'] = "test/data/{}_{}_{}.csv"

twp = PandaSoup(test_params_with_csv)
twp.debug_str(2)
twp.scrape_all([
    (2015, 5, 10), 
    (2016, 6, 10)
], verbose=True)
twp.make_dataframes(verbose=True)
twp.to_csv(verbose=True)

Raw data: 0 items
    keys: dict_keys([])
Data: 0 items
Default output path: test/data/{}_{}_{}.csv
    keys: dict_keys([])
Initialized with params: {'base': 'http://fftoday.com/stats/playerstats.php?Season={}&GameWeek={}&PosID={}', 'param_names': ['season', 'week', 'position'], 'request_delay': 3, 'extract_func': <function f at 0x7f532be9bbf8>, 'csv_base': 'test/data/{}_{}_{}.csv'}
parameter values (2015, 5, 10) --> key (2015, 5, 10)
Reading http://fftoday.com/stats/playerstats.php?Season=2015&GameWeek=5&PosID=10
parameter values (2016, 6, 10) --> key (2016, 6, 10)
Reading http://fftoday.com/stats/playerstats.php?Season=2016&GameWeek=6&PosID=10
(2015, 5, 10) yielded 1 rows of data
(2016, 6, 10) yielded 1 rows of data
Completed extracting data
Writing to test/data/2015_5_10.csv
Writing to test/data/2016_6_10.csv


['test/data/2015_5_10.csv', 'test/data/2016_6_10.csv']