In [1]:
import math
import pandas as pd
import re
import urllib.request
    

In [2]:
# Scrape the per-season draft pages and build `draft_list`, a dict keyed by
# player id (string) whose values describe where and when that player was drafted.
draft_url_format = 'http://dolphinsim.com/clmanager/epbl/year{0:04d}.txt'
# Raw string: `\.`, `\s`, `\(` and `\)` are regex escapes, not string escapes.
# Groups: 1 = overall pick number, 2 = drafting team, 3 = player id, 4 = player name.
draft_regex = re.compile(r'([0-9]+)\. ([a-z,A-Z,\s]*) draft ([0-9]+) ([A-Z]\.[A-Z][a-z,A-Z]+) \([A-Z]*.*\)<br>')
# Long team names seen on the draft pages, mapped to the short form stored in draft_list.
team_name_aliases = {
    'Anaheim Angels': 'Anaheim',
    'Arizona Diamondbacks': 'Arizona',
    'Detroit Tigers': 'Detroit',
}
# Number of picks that make up one draft round.
picks_per_round = 30
draft_list = {}
for season in range(58, 90):
    print('Season {0:d}'.format(season))
    draft_url = draft_url_format.format(season)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(draft_url) as response:
        page = response.read()
    for line in page.decode("utf-8").split('\n'):
        draft_matches = draft_regex.match(line)
        if draft_matches is None:
            continue
        draft_position, team_name, player_id, player_name = draft_matches.groups()
        team_name = team_name_aliases.get(team_name, team_name)
        # Convert the 1-based overall pick into a 1-based round and a 1-based
        # slot inside that round (pick 30 -> round 1 slot 30, pick 31 -> round 2 slot 1).
        rounds_completed, slot_index = divmod(int(draft_position) - 1, picks_per_round)
        draft_round = rounds_completed + 1
        round_position = slot_index + 1

        # A later match for the same player id replaces the earlier record.
        draft_list[player_id] = {
            'player_name': player_name,
            'team_name': team_name,
            'draft_position': draft_position,  # kept as the matched string
            'draft_round': draft_round,
            'round_position': round_position,
            'draft_season': season,
        }
            

Season 58
Season 59
Season 60
Season 61
Season 62
Season 63
Season 64
Season 65
Season 66
Season 67
Season 68
Season 69
Season 70
Season 71
Season 72
Season 73
Season 74
Season 75
Season 76
Season 77
Season 78
Season 79
Season 80
Season 81
Season 82
Season 83
Season 84
Season 85
Season 86
Season 87
Season 88
Season 89


In [3]:
# Tally the drafted players by draft season, draft round and drafting team,
# then print each tally in ascending key order.
season_counts, round_counts, team_counts = {}, {}, {}

for player_id in draft_list:
    record = draft_list[player_id]
    for tally, field in ((season_counts, 'draft_season'),
                         (round_counts, 'draft_round'),
                         (team_counts, 'team_name')):
        key = record[field]
        tally[key] = tally.get(key, 0) + 1

# Keys are unique, so sorting the (key, count) pairs orders by key alone.
print('Season counts:')
for season, count in sorted(season_counts.items()):
    print('{} from season {}'.format(count, season))

print('\nRound counts:')
for draft_round, count in sorted(round_counts.items()):
    print('{} from round {}'.format(count, draft_round))

print('\nTeam counts:')
for team_name, count in sorted(team_counts.items()):
    print('{} from team {}'.format(count, team_name))

Season counts:
1 from season 59
5 from season 63
5 from season 64
3 from season 66
11 from season 67
5 from season 68
5 from season 69
10 from season 70
27 from season 71
26 from season 72
36 from season 73
50 from season 74
50 from season 75
80 from season 76
77 from season 77
77 from season 78
97 from season 79
149 from season 80
210 from season 81
299 from season 82
385 from season 83
466 from season 84
584 from season 85
661 from season 86
900 from season 87
1376 from season 88
1500 from season 89

Round counts:
409 from round 1
345 from round 2
317 from round 3
267 from round 4
273 from round 5
250 from round 6
252 from round 7
210 from round 8
212 from round 9
214 from round 10
200 from round 11
188 from round 12
179 from round 13
167 from round 14
172 from round 15
177 from round 16
155 from round 17
144 from round 18
152 from round 19
145 from round 20
135 from round 21
127 from round 22
133 from round 23
118 from round 24
122 from round 25
104 from round 26
109 from round 27
9

In [21]:
# URL template for a roster page; the placeholder is a zero-padded 3-digit roster id.
team_url_format = 'http://dolphinsim.com/clmanager/epbl/ros{0:03d}.htm'
# Patterns are raw strings so regex escapes are not read as (invalid) string escapes.
# Splits a roster row into whitespace-separated columns.
words_regex = re.compile(r'\s+')
# Matches a row containing a history link: group 1 = link target before '#',
# group 2 = numeric fragment (used as the player id), group 3 = link text (player name).
hist_url_regex = re.compile(r'.*<a href="(.*)\#([0-9]+)">(.*)</a>.*')
# Two leading digits of the age column, optionally followed by 'r' characters.
age_regex = re.compile('([0-9]{2})r*')
# team_by_level[draft_team][roster_level] -> number of Player objects created so far.
team_by_level = {}

class Player:
    """One roster row combined with the matching entry from ``draft_list``.

    Creating a Player has a module-level side effect: it increments
    ``team_by_level[draft_team][team.level]``.
    """

    def __init__(self, team, line):
        """Parse a roster row and attach the player's draft record.

        team: object exposing a ``level`` attribute; stored as ``self.team``.
        line: one text row taken from a roster block.

        Raises KeyError when the parsed player id is not in ``draft_list`` and
        IndexError when the row has fewer columns than the layout requires.
        """
        row = line.strip()
        columns = words_regex.split(row)
        link = hist_url_regex.match(row)
        if link is None:
            # Plain row: id and name are the first two columns.
            self.history_url = None
            self.player_id = columns[0].strip()
            self.name = columns[1].strip()
            shift = 0
        else:
            # Linked row: url, id and name come from the anchor; the position
            # and age columns are read one column further right.
            self.history_url, self.player_id, self.name = link.groups()
            shift = 1

        self.team = team
        # Stays the int 0 unless the age column parses; a parsed age is a string.
        self.age = 0

        record = draft_list[self.player_id]
        self.draft_season = record['draft_season']
        self.draft_round = record['draft_round']
        self.draft_position = record['draft_position']
        self.draft_round_position = record['round_position']
        self.draft_team = record['team_name']

        # Count this player under (drafting team, level of the roster read).
        per_level = team_by_level.setdefault(self.draft_team, {})
        per_level[team.level] = per_level.get(team.level, 0) + 1

        self.position = columns[2 + shift].strip()
        age_found = age_regex.match(columns[3 + shift].strip())
        if age_found is not None:
            self.age = age_found.group(1)

    def __str__(self):
        """Return a one-line summary; an HTML anchor is used when a history url is known."""
        draft_text = ' drafted by {0} in {1}, pick {2} of round {3}'.format(
            self.draft_team, self.draft_season, self.draft_round_position, self.draft_round)
        if self.history_url is None:
            return '{0}: {1} {2} {3} {4}'.format(
                self.player_id, self.name, self.position, self.age, draft_text)
        return '<a href="{0}#{1}">{1}: {2}</a> {3} {4} {5}'.format(
            self.history_url, self.player_id, self.name, self.position, self.age, draft_text)
    

class Team:
    """A roster page and, for a 'majors' team, its six affiliate rosters.

    Constructing a Team downloads and parses its roster page immediately.
    A 'majors' team then constructs one Team per affiliate level, stored on
    the attributes ``aaa``, ``aa``, ``hia``, ``a``, ``hir`` and ``r``.
    """

    # (attribute name, level label, offset added to the parent's team_id)
    _AFFILIATES = (
        ('aaa', 'AAA', 30),
        ('aa', 'AA', 60),
        ('hia', 'hi-A', 90),
        ('a', 'A', 120),
        ('hir', 'hi-R', 150),
        ('r', 'R', 180),
    )

    def __init__(self, team_id, level='majors', parent='None'):
        """Download the roster for ``team_id`` and, for 'majors', all affiliates.

        team_id: integer roster id substituted into ``team_url_format``.
        level: level label; only 'majors' triggers loading of affiliates.
        parent: the owning Team for an affiliate.
            NOTE(review): the default is the *string* 'None', not the None
            singleton; kept unchanged so existing callers see the same value.
        """
        self.team_id = team_id
        self.url = team_url_format.format(team_id)
        self.level = level
        print('{0}:{1} {2}'.format(team_id, level, self.url))
        self.parent = parent
        self.players = {}
        self.read_roster()
        if level == 'majors':
            for attr, affiliate_level, offset in self._AFFILIATES:
                setattr(self, attr, Team(team_id + offset, affiliate_level, self))

    def __str__(self):
        """Return 'name (level): url', followed by one indented line per affiliate for 'majors'."""
        ret_str = '{0} ({1}): {2}'.format(self.name, self.level, self.url)
        if self.level == 'majors':
            for attr, _level, _offset in self._AFFILIATES:
                ret_str += '\n\t{0}'.format(getattr(self, attr))
        return ret_str

    @staticmethod
    def _page_text(page):
        """Return ``page`` as text, decoding bytes as UTF-8 (bad bytes are replaced)."""
        if isinstance(page, bytes):
            # Decode rather than str(page): str() of bytes is the b'...' repr,
            # which escapes newlines, quotes and non-ASCII bytes.
            return page.decode('utf-8', errors='replace')
        return page

    def read_roster(self):
        """Fetch ``self.url``, set ``self.name`` and load players from one <pre> block."""
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(self.url) as response:
            page = response.read()
        text = self._page_text(page)
        self.set_team_name(text)
        blocks = text.split('<pre>')
        # Index of the <pre> section to parse: 4 for 'majors', 3 otherwise.
        block = 4
        if self.level != 'majors':
            block -= 1
        # NOTE(review): level[:-1] drops the last character, so this is never
        # true for any level label built in this file ('R' -> '', 'hi-R' -> 'hi-').
        # It may have been meant as self.level[-1] == 'R'; left as-is because
        # the page layout for those levels cannot be confirmed from here.
        if self.level[:-1] == 'r':
            block -= 1
        attr_block = blocks[block].split('</pre>')[0]
        print(self.name)
        self.process_block(attr_block)

    def process_block(self, block):
        """Create a Player for every non-blank row after the first line of ``block``.

        Players are stored in ``self.players`` keyed by player id.
        """
        lines = block.split('\n')
        # lines[0] is skipped (treated as the header row).
        for line in lines[1:]:
            if not line.strip():
                continue
            player = Player(self, line)
            self.players[player.player_id] = player

    def set_team_name(self, page):
        """Set ``self.name`` from the page's <title> element.

        page: the roster page as bytes or str.
        Raises IndexError when the page has no '<title>' tag.
        """
        text = self._page_text(page)
        title = text.split('</title>')[0]
        # Strip the newline that follows <title> on these pages.
        self.name = title.split('<title>')[1].strip()

            
# Load the rosters for team ids 0 through 29. Each Team() call with the default
# level also loads its affiliates. The resulting Team objects are indexed by name.
teams = {}
for team_id in range(30):
    team = Team(team_id)
    teams[team.name] = team
    



0:majors http://dolphinsim.com/clmanager/epbl/ros000.htm
Arizona
30:AAA http://dolphinsim.com/clmanager/epbl/ros030.htm
Arizona
60:AA http://dolphinsim.com/clmanager/epbl/ros060.htm
Arizona
90:hi-A http://dolphinsim.com/clmanager/epbl/ros090.htm
Arizona
120:A http://dolphinsim.com/clmanager/epbl/ros120.htm
Arizona
150:hi-R http://dolphinsim.com/clmanager/epbl/ros150.htm
Arizona
180:R http://dolphinsim.com/clmanager/epbl/ros180.htm
Arizona
1:majors http://dolphinsim.com/clmanager/epbl/ros001.htm
Los Angeles
31:AAA http://dolphinsim.com/clmanager/epbl/ros031.htm
Las Vegas
61:AA http://dolphinsim.com/clmanager/epbl/ros061.htm
Jacksonville
91:hi-A http://dolphinsim.com/clmanager/epbl/ros091.htm
Inland Empire
121:A http://dolphinsim.com/clmanager/epbl/ros121.htm
Great Lakes
151:hi-R http://dolphinsim.com/clmanager/epbl/ros151.htm
Ogden
181:R http://dolphinsim.com/clmanager/epbl/ros181.htm
Los Angeles-R
2:majors http://dolphinsim.com/clmanager/epbl/ros002.htm
San Francisco
32:AAA http://dolp

AttributeError: 'str' object has no attribute 'name'

In [24]:
# For each roster level, print per team how many players found on rosters of
# that level were drafted by that team (the tally kept in team_by_level).
team_names = sorted(teams)  # sorted once instead of once per level
for level in ['majors', 'AAA', 'AA', 'hi-A', 'A', 'hi-R', 'R']:
    for team in team_names:
        # A team with no drafted player at this level (or none at all) has no
        # entry in the tally; report 0 instead of raising KeyError.
        drafted_count = team_by_level.get(team, {}).get(level, 0)
        print('{0} - {1}: {2:d}'.format(team, level, drafted_count))

Anaheim - majors: 31
Arizona - majors: 25
Atlanta - majors: 35
Baltimore - majors: 26
Boston - majors: 20
Chicago AL - majors: 40
Chicago NL - majors: 20
Cincinnati - majors: 27
Cleveland - majors: 27
Colorado - majors: 29
Detroit - majors: 36
Florida - majors: 31
Houston - majors: 24
Kansas City - majors: 26
Los Angeles - majors: 24
Milwaukee - majors: 21
Minnesota - majors: 15
New York AL - majors: 23
New York NL - majors: 27
Oakland - majors: 23
Philadelphia - majors: 37
Pittsburgh - majors: 17
San Diego - majors: 30
San Francisco - majors: 26
Seattle - majors: 29
St Louis - majors: 34
Tampa Bay - majors: 22
Texas - majors: 15
Toronto - majors: 18
Washington - majors: 21
Anaheim - AAA: 21
Arizona - AAA: 34
Atlanta - AAA: 25
Baltimore - AAA: 16
Boston - AAA: 26
Chicago AL - AAA: 31
Chicago NL - AAA: 14
Cincinnati - AAA: 32
Cleveland - AAA: 15
Colorado - AAA: 29
Detroit - AAA: 31
Florida - AAA: 16
Houston - AAA: 15
Kansas City - AAA: 29
Los Angeles - AAA: 23
Milwaukee - AAA: 18
Minnes