## Notebook Overview

Take the data from notebook 1, and flatten the nested data structures so it can be put into sql-database-friendly form.



In [1]:
# Use w/ Anaconda Distribution
# For data manipulation
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
PROJ_ROOT = os.pardir

# For Scraping
from bs4 import BeautifulSoup
import requests
import time

# For munging
import re
import json

# Backoff time for large scrapes
THROTTLE_TIME = .05

# ipython magics
%load_ext watermark
%matplotlib inline

In [3]:
%watermark -a "Cameron Yick" -d -t -v -p pandas,seaborn,matplotlib,bs4 -g

Cameron Yick 2016-10-22 20:17:59 

CPython 2.7.12
IPython 4.0.1

pandas 0.17.1
seaborn 0.7.1
matplotlib 1.5.0
bs4 4.4.1
Git hash: 0e2c5be1749cd9efc0b56859efcaf8a352748f5d


In [22]:
RAW_PATH = os.path.join(PROJ_ROOT, "data", "raw", "sports.json")
RAW_PICK = os.path.join(PROJ_ROOT, "data", "raw", "sports.p")

In [25]:
sports = pd.read_pickle(RAW_PICK)

## Can we check how the size of a roster has changed each year, for each sport?

A roster is a collection of players for 1 season.

In [31]:
sports.columns

Index([u'href', u'name', u'gender', u'currentRoster', u'seasons', u'nSeasons',
       u'rosters'],
      dtype='object')

In [26]:
sports.rosters

0     {u'2008-09': [[{'link': '/sports/m-basebl/2008...
1     {u'2008-09': [[{'link': '/sports/m-baskbl/2008...
2     {u'2008-09': [[{'link': '/sports/m-crewhvy/200...
3     {u'2012-13': [[{'link': '/sports/m-crewlt/2012...
4     {u'2012-13': [[{'link': '/sports/m-xc/2012-13/...
5     {u'2008-09': [[{'link': '/sports/m-fenc/2008-0...
6     {u'2008-09': [[{'link': '/sports/m-footbl/2008...
7     {u'2008-09': [[{'link': '/sports/m-golf/2008-0...
8     {u'2012-13': [[{'link': '/sports/m-hockey/2012...
9     {u'2012-13': [[{'link': '/sports/m-lacros/2012...
10    {u'2008-09': [[{'link': '/sports/c-sail/2008-0...
11    {u'2008-09': [[{'link': '/sports/m-soccer/2008...
12    {u'2012-13': [[{'link': '/sports/m-squash/2012...
13    {u'2012-13': [[{'link': '/sports/m-swim/2012-1...
14    {u'2012-13': [[{'link': '/sports/m-tennis/2012...
15    {u'2008-09': [[{'link': '/sports/m-track/2008-...
16    {u'2008-09': [[{'link': '/sports/w-baskbl/2008...
17    {u'2012-13': [[{'link': '/sports/w-crew/20

In [34]:
# The top entry for every table describes what metadata is available.
sports.iloc[0]['rosters']['2016-17'][0]

[{'link': '/sports/m-basebl/2016-17/roster?sort=number', 'name': u'No.'},
 {'link': '/sports/m-basebl/2016-17/roster?sort=last_name', 'name': u'Name'},
 {'link': '/sports/m-basebl/2016-17/roster?sort=position', 'name': u'Pos.'},
 {'link': '/sports/m-basebl/2016-17/roster?sort=year', 'name': u'Cl.'},
 u'B/T',
 u'Ht.',
 u'Wt.',
 u'Hometown/High School']

In [29]:
rosters = []
players = []

for i, row in sports.iterrows(): # for each sport
    sportName = row['name']
    
    for season, roster in row['rosters'].iteritems():
        print len(roster)
        nRoster = list(roster)
        nRoster['sport'] = sportName # this is the foreign key, when combined with the season
        nRoster['season'] = season
        roster.append(nRoster)
        
        for player in roster:
            nPlayer = list()
#         print (type(roster))
#         for player in roster:
            

27
24
29
30
30
28
24
28
30
18
17
16
16
17
16
14
16
15
16
17
18
18
16
82
48
36
38
34
43
37
44
38
33
41
53
40
42
38
47
26
25
26
20
24
23
24
19
19
17
17
17
4
19
21
18
17
117
116
87
119
120
106
123
108
110
126
112
104
96
105
12
15
13
12
10
11
16
10
13
13
11
11
12
12
26
29
28
28
28
27
27
29
38
42
48
39
44
42
42
42
12
20
27
26
20
29
34
28
30
29
29
25
27
24
28
25
28
26
27
30
27
26
26
16
18
15
18
17
15
16
16
30
34
34
30
37
27
33
29
13
13
14
11
13
13
13
13
53
48
51
51
47
47
46
52
43
15
15
14
15
14
14
14
15
16
15
16
14
15
15
40
40
38
40
38
39
40
36
24
28
26
30
23
31
28
31
24
27
33
30
22
26
20
18
17
15
21
15
19
17
18
20
19
22
21
20
20
21
21
9
8
10
9
10
11
10
12
9
11
11
10
12
10
14
16
14
15
13
14
15
19
12
16
15
14
15
12
24
23
25
24
22
25
23
22
25
30
34
32
28
33
25
38
27
23
20
17
20
19
25
20
26
26
25
27
28
21
31
26
30
24
28
31
24
26
23
17
17
21
21
19
19
16
19
17
16
15
19
13
17
14
29
33
36
32
36
25
30
23
9
11
13
10
14
9
9
10
55
45
48
43
50
45
45
18
14
14
17
17
16
16
18
14
18
19
14
16
10


In [7]:
rosters

(33L,)