In [703]:
import json
import re

import requests
import scrapy

In [704]:
headers = {'User-Agent': 'UNC Journo Class'}

In [705]:
#map to the site we want
base_url = 'http://goheels.com'
url = base_url + '/roster.aspx?path=baseball'

In [706]:
#requesting the page
resp = requests.get(url, headers=headers)

In [707]:
#turn page into html
body_str = resp.content.decode('utf-8')

In [708]:
#select the body of the the html page
sel = scrapy.Selector(text=body_str)

In [709]:
#find and select the table
table = sel.css('table')[0]

In [710]:
#test to make sure we selected the right table
table

<Selector xpath='descendant-or-self::table' data='<table class="sidearm-table sidearm-tabl'>

In [711]:
#select columns
cols = table.css('th').xpath('string()').extract()

In [712]:
#column test
cols

['#',
 'Full Name',
 'Pos.',
 'Ht.',
 'Wt.',
 'Academic Year',
 'Hometown / High School']

In [713]:
rows = table.css('tr')[1:]

In [714]:
rows

[<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>,
 <Selector xpath='descendant-or-self::tr' data='<tr>\r\n    

In [715]:
#creates dictionary for each player in the table and their info and adds each into the list called "players"
players = []
for r in rows:
    data = {}
    for i, d in enumerate(r.css('td')):
        a = d.css('a')
        if a:
            t = a.xpath('text()').extract()[0]
            data['href'] = a.xpath('@href').extract()[0]
        else:
            t = d.xpath('text()').extract()[0]
        data[cols[i]] = t
    players.append(data)

In [716]:
#test to ensure it all worked
players

[{'#': '1',
  'Academic Year': 'Jr.',
  'Full Name': 'Brandon Riley',
  'Hometown / High School': 'Burlington, N.C. / Williams',
  'Ht.': '6-0',
  'Pos.': 'OF',
  'Wt.': '175',
  'href': '/roster.aspx?rp_id=14221'},
 {'#': '2',
  'Academic Year': 'Fr.',
  'Full Name': 'Satchel Jerzembeck',
  'Hometown / High School': 'Charlotte, N.C. / Providence',
  'Ht.': '5-10',
  'Pos.': 'IF',
  'Wt.': '150',
  'href': '/roster.aspx?rp_id=14226'},
 {'#': '3',
  'Academic Year': 'Jr.',
  'Full Name': 'Kyle Datres',
  'Hometown / High School': 'Williamsport, Pa. / Loyalsock Township',
  'Ht.': '6-0',
  'Pos.': '3B',
  'Wt.': '198',
  'href': '/roster.aspx?rp_id=14211'},
 {'#': '4',
  'Academic Year': 'So.',
  'Full Name': 'Brandon Martorano',
  'Hometown / High School': 'Marlboro, N.J. / Christian Brothers Academy',
  'Ht.': '6-2',
  'Pos.': 'C/OF',
  'Wt.': '187',
  'href': '/roster.aspx?rp_id=14219'},
 {'#': '5',
  'Academic Year': 'So.',
  'Full Name': 'Ashton McGee',
  'Hometown / High School': '

In [717]:
#uses url in each player dictionary to dig into player page
#scrapes entire html of page
#then targets and stores each players bio and profile image
def fetch_bio(player):
    #unique player url
    player_url = base_url + player['href']
    #make request to player page
    resp = requests.get(player_url, headers=headers)
    #turn page into html
    player_txt = resp.content.decode('utf-8')
    #select the text on the page
    sel = scrapy.Selector(text=player_txt)
    player['sel'] = sel
    player['bio'] = sel.css('#sidearm-roster-player-bio').xpath('string()').extract()[0]
    player['img'] = sel.css('.sidearm-roster-player-image img').xpath('@src').extract()[0]

In [718]:
#REGEX to activate and select the stats table
js_obj_rx = re.compile(r'.*?responsive-roster-bio\.ashx.*?(?P<obj>{.*?})')

In [719]:
def fetch_stats(player):
    text = player['sel'].xpath('string()').extract()[0]
    parts = text.split('$.getJSON("/services/')[1:]
    captured = js_obj_rx.findall(''.join(parts))
    clean_objs = []
    for obj_str in captured:
        # We only want the stats object...
        if 'stats' not in obj_str:
            continue

        #cleaning up all the ugly data
        obj_str = obj_str.replace('{', '').replace('}', '')
        obj_str = obj_str.replace("'", '').replace('"', '')
        obj_pairs = obj_str.split(',')
        obj_pairs = [x.split(":") for x in obj_pairs]
        clean_pairs = []
        for pair in obj_pairs:
            clean_pairs.append(['"{}"'.format(p.strip()) for p in pair])
        colonized = [":".join(p) for p in clean_pairs]
        commas = ','.join(colonized)
        json_str = "{" + commas + "}"
        clean_objs.append(json.loads(json_str))
    
    player['stats_url'] = stats_url = (
        "http://goheels.com/services/responsive-roster-bio.ashx?"
        "type={type}&rp_id={rp_id}&path={path}&year={year}"
        "&player_id={player_id}"
    ).format(**clean_objs[0])
    
    print('Fetch stats', stats_url)
    resp = requests.get(stats_url, headers=headers)
    json_stats = json.loads(resp.content.decode("utf-8"))
    #create list of raw stats for each player
    player['raw_stats'] = json_stats

In [720]:
players

[{'#': '1',
  'Academic Year': 'Jr.',
  'Full Name': 'Brandon Riley',
  'Hometown / High School': 'Burlington, N.C. / Williams',
  'Ht.': '6-0',
  'Pos.': 'OF',
  'Wt.': '175',
  'href': '/roster.aspx?rp_id=14221'},
 {'#': '2',
  'Academic Year': 'Fr.',
  'Full Name': 'Satchel Jerzembeck',
  'Hometown / High School': 'Charlotte, N.C. / Providence',
  'Ht.': '5-10',
  'Pos.': 'IF',
  'Wt.': '150',
  'href': '/roster.aspx?rp_id=14226'},
 {'#': '3',
  'Academic Year': 'Jr.',
  'Full Name': 'Kyle Datres',
  'Hometown / High School': 'Williamsport, Pa. / Loyalsock Township',
  'Ht.': '6-0',
  'Pos.': '3B',
  'Wt.': '198',
  'href': '/roster.aspx?rp_id=14211'},
 {'#': '4',
  'Academic Year': 'So.',
  'Full Name': 'Brandon Martorano',
  'Hometown / High School': 'Marlboro, N.J. / Christian Brothers Academy',
  'Ht.': '6-2',
  'Pos.': 'C/OF',
  'Wt.': '187',
  'href': '/roster.aspx?rp_id=14219'},
 {'#': '5',
  'Academic Year': 'So.',
  'Full Name': 'Ashton McGee',
  'Hometown / High School': '

In [721]:
for p in players:
    fetch_bio(p)
    fetch_stats(p)

Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14221&path=baseball&year=2018&player_id=3746
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14226&path=baseball&year=2018&player_id=0
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14211&path=baseball&year=2018&player_id=3736
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14219&path=baseball&year=2018&player_id=3760
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14220&path=baseball&year=2018&player_id=3761
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14228&path=baseball&year=2018&player_id=5394
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14229&path=baseball&year=2018&player_id=5395
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14212&path=baseb

In [722]:
players[0]

{'#': '1',
 'Academic Year': 'Jr.',
 'Full Name': 'Brandon Riley',
 'Hometown / High School': 'Burlington, N.C. / Williams',
 'Ht.': '6-0',
 'Pos.': 'OF',
 'Wt.': '175',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            Dynamic outfielder who figures to be among the team leaders as a junior.\r\n\r\nSophomore Season (2017)\r\nHit .317 with seven homers, nine doubles and a team-high 52 RBIs •\xa0Also scored 46 runs, drew 37 walks and led the team with five triples, which is tied for 10th-most in UNC history for a single season • Appeared in all 63 games and started 62, primarily in right field • Stole 10 bases in 14 attempts • Had 21 multi-hit games, including a season-high three hits on eight occasions • Also drove in multiple runs 14 times, including a season-best four at ECU on March 22 • Followed up the ECU game with a big series against Miami, recording six hits and driving in four runs with a homer 

In [723]:
p = [p for p in players if p['Full Name'] == 'Brandon Riley'][0]

In [724]:
txt = p['raw_stats']['career_stats']

In [725]:
sel = scrapy.Selector(text=txt)

In [726]:
sel.css('section')

[<Selector xpath='descendant-or-self::section' data='<section>\r\n                <h5>Hitting S'>]

In [727]:
def parse_stats(player):
    stats = {}
    for raw_key, raw_val in player['raw_stats'].items():
        txt = player['raw_stats'][raw_key]
        if not txt:
            print('Skipping {} for {}'.format(raw_key, player['Full Name']))
            continue
        sel = scrapy.Selector(text=txt)
        # Get all the tables
        for section in sel.css('section'):
            title = section.css('h5').xpath('string()').extract()[0]
            cols = section.css('tr')[0].css('th').xpath('string()').extract()
            print('NEW SECTION', title)
            print('COLS', cols)
            these_stats = []
            print('TRS', section.css('tr'))
            for r in section.css('tr')[1:]:
                print('row', r.xpath('string()').extract()[0].replace('\r', '').replace('\n', '').strip())
                s = {}
                for i, d in enumerate(r.css('td'), 1):
                    s[cols[i].lower()] = d.xpath('string()').extract()[0]
                    yr = r.css('th').xpath('string()')
                if yr:
                    yr = yr.extract()[0]
                    if yr.lower() in ('total', 'season'):
                        print('SKIPPING...')
                        continue
                    print('THE YR IS', yr)
                    s['year'] = yr
                these_stats.append(s)
                print('THE STATS ARE', these_stats)
            existing = stats.get(raw_key, {})
            existing[title] = these_stats
            stats[raw_key] = existing
    player['stats'] = stats

In [728]:
p = [p for p in players if p['Full Name'] == 'Brandon Riley'][0]
parse_stats(p)

NEW SECTION Hitting Statistics
COLS ['Date', 'Opponent', 'W/L', 'GS', 'AB', 'R', 'H', 'RBI', '2B', '3B', 'HR', 'BB', 'IBB', 'SB', 'SBA', 'CS', 'HBP', 'SH', 'SF', 'GDP', 'K', 'AVG']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/16/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/17/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/18/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/20/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/21/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/23/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t

IndexError: list index out of range

In [729]:
for p in players:
    parse_stats(p)

NEW SECTION Hitting Statistics
COLS ['Date', 'Opponent', 'W/L', 'GS', 'AB', 'R', 'H', 'RBI', '2B', '3B', 'HR', 'BB', 'IBB', 'SB', 'SBA', 'CS', 'HBP', 'SH', 'SF', 'GDP', 'K', 'AVG']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/16/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/17/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/18/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/20/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/21/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/23/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t

IndexError: list index out of range

In [738]:
[p for p in players if p['Full Name'] == 'Brandon Riley'][0]

{'#': '1',
 'Academic Year': 'Jr.',
 'Full Name': 'Brandon Riley',
 'Hometown / High School': 'Burlington, N.C. / Williams',
 'Ht.': '6-0',
 'Pos.': 'OF',
 'Wt.': '175',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            Dynamic outfielder who figures to be among the team leaders as a junior.\r\n\r\nSophomore Season (2017)\r\nHit .317 with seven homers, nine doubles and a team-high 52 RBIs •\xa0Also scored 46 runs, drew 37 walks and led the team with five triples, which is tied for 10th-most in UNC history for a single season • Appeared in all 63 games and started 62, primarily in right field • Stole 10 bases in 14 attempts • Had 21 multi-hit games, including a season-high three hits on eight occasions • Also drove in multiple runs 14 times, including a season-best four at ECU on March 22 • Followed up the ECU game with a big series against Miami, recording six hits and driving in four runs with a homer 

In [754]:
players

[{'#': '1',
  'Academic Year': 'Jr.',
  'Full Name': 'Brandon Riley',
  'Hometown / High School': 'Burlington, N.C. / Williams',
  'Ht.': '6-0',
  'Pos.': 'OF',
  'Wt.': '175',
  'bio': '\r\n                        Biography\r\n                                                    \r\n                            Dynamic outfielder who figures to be among the team leaders as a junior.\r\n\r\nSophomore Season (2017)\r\nHit .317 with seven homers, nine doubles and a team-high 52 RBIs •\xa0Also scored 46 runs, drew 37 walks and led the team with five triples, which is tied for 10th-most in UNC history for a single season • Appeared in all 63 games and started 62, primarily in right field • Stole 10 bases in 14 attempts • Had 21 multi-hit games, including a season-high three hits on eight occasions • Also drove in multiple runs 14 times, including a season-best four at ECU on March 22 • Followed up the ECU game with a big series against Miami, recording six hits and driving in four runs with 

In [748]:
to_dump = [p.copy() for p in players]
for p in to_dump:
    p.pop('sel')
    for k in list(p.keys()):
        if 'raw' in k:
            p.pop(k)
with open('baseball_team_SCRAPED.json', 'w') as f:
    json.dump(to_dump, f)

In [749]:
cat baseball_team_SCRAPED.json | cut -c 1-100

[{"#": "1", "href": "/roster.aspx?rp_id=14221", "Full Name": "Brandon Riley", "Pos.": "OF", "Ht.": "


In [750]:
to_dump[0]

{'#': '1',
 'Academic Year': 'Jr.',
 'Full Name': 'Brandon Riley',
 'Hometown / High School': 'Burlington, N.C. / Williams',
 'Ht.': '6-0',
 'Pos.': 'OF',
 'Wt.': '175',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            Dynamic outfielder who figures to be among the team leaders as a junior.\r\n\r\nSophomore Season (2017)\r\nHit .317 with seven homers, nine doubles and a team-high 52 RBIs •\xa0Also scored 46 runs, drew 37 walks and led the team with five triples, which is tied for 10th-most in UNC history for a single season • Appeared in all 63 games and started 62, primarily in right field • Stole 10 bases in 14 attempts • Had 21 multi-hit games, including a season-high three hits on eight occasions • Also drove in multiple runs 14 times, including a season-best four at ECU on March 22 • Followed up the ECU game with a big series against Miami, recording six hits and driving in four runs with a homer 

In [751]:
import numpy as np
import pandas as pd

In [752]:
data = json.load(open('baseball_team_SCRAPED.json'))

In [753]:
data[0]

{'#': '1',
 'Academic Year': 'Jr.',
 'Full Name': 'Brandon Riley',
 'Hometown / High School': 'Burlington, N.C. / Williams',
 'Ht.': '6-0',
 'Pos.': 'OF',
 'Wt.': '175',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            Dynamic outfielder who figures to be among the team leaders as a junior.\r\n\r\nSophomore Season (2017)\r\nHit .317 with seven homers, nine doubles and a team-high 52 RBIs •\xa0Also scored 46 runs, drew 37 walks and led the team with five triples, which is tied for 10th-most in UNC history for a single season • Appeared in all 63 games and started 62, primarily in right field • Stole 10 bases in 14 attempts • Had 21 multi-hit games, including a season-high three hits on eight occasions • Also drove in multiple runs 14 times, including a season-best four at ECU on March 22 • Followed up the ECU game with a big series against Miami, recording six hits and driving in four runs with a homer 

In [None]:
#I can't seem to get the stats into the json file. I've followed your example step by step and am still
#confused about why it isn't working for me.
#I shouldn'y have waited until the last minute on this,
#so I take full responsibility for its incompleteness.
#I'd love to figure this out after the break though.