In [45]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import html5lib
import lxml

In [72]:
class Player:
    
    def __init__(self, name):
        self.name = name
        self.url = self.gen_url()
        self.stats = self.career_stats()
        self.soup = self.gen_soup()
       
    def gen_soup(self):
        page = requests.get(self.url)
        soup = BeautifulSoup(page.text, 'html.parser')
        return soup
        
    def gen_url(self):
        name = self.name.lower()
        string = name.split()
        firstInitial = string[1][0]
        firstName = string[0][:2]
        lastName = string[1][:5]
        playerID = lastName + firstName + '01'
        url = 'https://www.baseball-reference.com/players/{}/{}.shtml'.format(firstInitial, playerID)
        return url
    
    def career_stats(self):
        page = requests.get(self.url)
        soup = BeautifulSoup(page.text, 'html.parser')
        attrs = {'class':re.compile('poptip')}
        cols = [col.get_text() for col in soup.find_all('th', {'class':re.compile('poptip')})]
        stats = []
        for season in soup.find_all('tr', {'class': 'full'}):
            temp = [season.find('th').get_text()]
            temp += [stat.get_text() for stat in season.find_all('td')]
            stats += [temp]
        totals = soup.find('tfoot')
        for row in totals.find_all('tr'):
            temp = []
            for stat in row:
                temp += [stat.get_text()]
                temp1 = [temp[0]] + ['','',''] + temp[1:]
            stats += [temp1]
        return stats

    def stats_by_year(self, year):
        for season in self.stats:
            if season[0] == year:
                return season
        return 'No stats found for that year'
    
    def gen_stats_table(self):
        cols = [col.get_text() for col in self.soup.find_all('th', {'class':re.compile('poptip')})]
        df = pd.DataFrame(columns=cols)
        for season in self.stats:
            temp_df = pd.DataFrame(season).transpose()
            temp_df.columns = cols
            df = pd.concat([df, temp_df], ignore_index=True)
        pd.set_option('display.max_columns', None)
        return df

In [73]:
judge = Player('Aaron Judge')
judge.gen_stats_table()

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,2016,24.0,NYY,AL,27,95,84,10,15,2,0,4,10,0,1,9,42,0.179,0.263,0.345,0.608,61,29,2,1,0,1,0,9,
1,2017,25.0,NYY,AL,155,678,542,128,154,24,3,52,114,9,4,127,208,0.284,0.422,0.627,1.049,171,340,15,5,0,4,11,*9D,"AS,MVP-2,RoY-1,SS"
2,2018,26.0,NYY,AL,112,498,413,77,115,22,0,27,67,6,3,76,152,0.278,0.392,0.528,0.919,151,218,10,4,0,5,3,9D/8,"AS,MVP-12"
3,2019,27.0,NYY,AL,50,222,183,33,52,7,0,11,28,2,1,36,66,0.284,0.403,0.503,0.905,141,92,8,1,0,1,3,9/D,
4,4 Yrs,,,,344,1493,1222,248,336,55,3,94,219,17,9,248,468,0.275,0.399,0.556,0.954,153,679,35,11,0,11,17,,
5,162 Game Avg.,,,,162,703,575,117,158,26,1,44,103,8,4,117,220,0.275,0.399,0.556,0.954,153,320,16,5,0,5,8,,


In [32]:
colon = Player('Bartolo Colon')
colon.gen_stats_table()

Unnamed: 0,Year,Age,Tm,Lg,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W,Awards
0,1997,24,CLE,AL,4,7,0.364,5.65,19,17,0,1,0,0,94.0,107,66,59,12,45,1,66,3,0,5,427,83,4.9,1.617,10.2,1.1,4.3,6.3,1.47,
1,1998,25,CLE,AL,14,9,0.609,3.71,31,31,0,6,2,0,204.0,205,91,84,15,79,5,158,3,0,4,883,128,3.75,1.392,9.0,0.7,3.5,7.0,2.0,AS
2,1999,26,CLE,AL,18,5,0.783,3.95,32,32,0,1,1,0,205.0,185,97,90,24,76,5,161,7,0,4,858,126,4.3,1.273,8.1,1.1,3.3,7.1,2.12,CYA-4
3,2000,27,CLE,AL,15,8,0.652,3.88,30,30,0,2,1,0,188.0,163,86,81,21,98,4,212,4,0,4,807,127,3.96,1.388,7.8,1.0,4.7,10.1,2.16,
4,2001,28,CLE,AL,14,12,0.538,4.09,34,34,0,1,0,0,222.1,220,106,101,26,90,2,201,2,1,4,947,110,4.0,1.394,8.9,1.1,3.6,8.1,2.23,
5,2002,29,TOT,MLB,20,8,0.714,2.93,33,33,0,8,3,0,233.1,219,85,76,20,70,5,149,2,0,4,966,147,3.72,1.239,8.4,0.8,2.7,5.7,2.13,CYA-6
6,2003,30,CHW,AL,15,13,0.536,3.87,34,34,0,9,0,0,242.0,223,107,104,30,67,3,173,5,3,8,984,120,4.11,1.198,8.3,1.1,2.5,6.4,2.58,
7,2004,31,ANA,AL,18,12,0.6,5.01,34,34,0,0,0,0,208.1,215,122,116,38,71,1,158,3,0,1,897,89,4.97,1.373,9.3,1.6,3.1,6.8,2.23,
8,2005,32,LAA,AL,21,8,0.724,3.48,33,33,0,2,0,0,222.2,215,93,86,26,43,0,157,3,1,2,906,122,3.75,1.159,8.7,1.1,1.7,6.3,3.65,"AS,CYA-1,MVP-23"
9,2006,33,LAA,AL,1,5,0.167,5.11,10,10,0,1,1,0,56.1,71,39,32,11,11,0,31,3,0,1,251,89,5.33,1.456,11.3,1.8,1.8,5.0,2.82,


In [33]:
trout = Player('Mike Trout')
trout.gen_stats_table()

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,2011,19,LAA,AL,40,135,123,20,27,6,0,5,16,4,0,9,30,0.22,0.281,0.39,0.672,89,48,2,2,0,1,0,897/D,
1,2012,20,LAA,AL,139,639,559,129,182,27,8,30,83,49,5,67,139,0.326,0.399,0.564,0.963,168,315,7,6,0,7,4,*87/9,"AS,MVP-2,RoY-1,SS"
2,2013,21,LAA,AL,157,716,589,109,190,39,9,27,97,33,7,110,136,0.323,0.432,0.557,0.988,179,328,8,9,0,8,10,*87/D,"AS,MVP-2,SS"
3,2014,22,LAA,AL,157,705,602,115,173,39,9,36,111,16,2,83,184,0.287,0.377,0.561,0.939,168,338,6,10,0,10,6,*8/D,"AS,MVP-1,SS"
4,2015,23,LAA,AL,159,682,575,104,172,32,6,41,90,11,7,92,158,0.299,0.402,0.59,0.991,176,339,11,10,0,5,14,*8/D,"AS,MVP-2,SS"
5,2016,24,LAA,AL,159,681,549,123,173,32,5,29,100,30,7,116,137,0.315,0.441,0.55,0.991,172,302,5,11,0,5,12,*8D,"AS,MVP-1,SS"
6,2017,25,LAA,AL,114,507,402,92,123,25,3,33,72,22,4,94,90,0.306,0.442,0.629,1.071,186,253,8,7,0,4,15,8/D,"AS,MVP-4"
7,2018,26,LAA,AL,140,608,471,101,147,24,4,39,79,24,2,122,124,0.312,0.46,0.628,1.088,198,296,5,10,0,4,25,*8D,"AS,MVP-2,SS"
8,2019,27,LAA,AL,101,454,356,82,106,24,1,34,85,8,2,82,82,0.298,0.441,0.657,1.098,187,234,5,12,0,4,11,*8/D,AS


In [34]:
yelich = Player("Christian Yelich")
yelich.gen_stats_table()

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,2013,21,MIA,NL,62,273,240,34,69,12,1,4,16,10,0,31,66,0.288,0.37,0.396,0.766,112,95,4,1,0,1,1,7/8,
1,2014,22,MIA,NL,144,660,582,94,165,30,6,9,54,21,7,70,137,0.284,0.362,0.402,0.764,115,234,9,3,3,2,3,*78,GG
2,2015,23,MIA,NL,126,525,476,63,143,30,2,7,44,16,5,47,101,0.3,0.366,0.416,0.782,118,198,13,2,0,0,2,*78,
3,2016,24,MIA,NL,155,659,578,78,172,38,3,21,98,9,4,72,138,0.298,0.376,0.483,0.859,135,279,20,4,0,5,4,*78/D,"MVP-19,SS"
4,2017,25,MIA,NL,156,695,602,100,170,36,2,18,81,16,2,80,137,0.282,0.369,0.439,0.807,120,264,13,6,0,6,4,*8,
5,2018,26,MIL,NL,147,651,574,118,187,34,7,36,110,22,4,68,135,0.326,0.402,0.598,1.0,164,343,14,7,0,2,2,*798,"AS,MVP-1,SS"
6,2019,27,MIL,NL,97,433,365,78,123,21,3,36,80,23,2,59,82,0.337,0.434,0.707,1.141,188,258,7,6,0,3,15,*9/7D8,AS


In [55]:
mcneil = Player('Jeff McNeil')
mcneil.gen_stats_table()

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,2018,26,NYM,NL,63,248,225,35,74,11,6,3,19,7,1,14,24,0.329,0.381,0.471,0.852,138,106,2,5,4,0,1,4/5,RoY-6
1,2019,27,NYM,NL,91,380,345,52,116,27,1,10,44,4,5,20,50,0.336,0.397,0.507,0.905,142,175,3,15,0,0,2,7945,AS


In [53]:
t[['Year','Age']] = 32,3
t

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,32,3,NYM,NL,63,248,225,35,74,11,6,3,19,7,1,14,24,0.329,0.381,0.471,0.852,138,106,2,5,4,0,1,4/5,RoY-6
1,32,3,NYM,NL,91,380,345,52,116,27,1,10,44,4,5,20,50,0.336,0.397,0.507,0.905,142,175,3,15,0,0,2,7945,AS


In [56]:
stats = colon.stats
totals = colon.soup.find('tfoot')
stats = []
for row in totals.find_all('tr'):
    temp = [row.find('th').get_text()]
    temp += [stat.get_text() for stat in row.find_all('td')]
    stats += [temp]
stats

[['21 Yrs',
  '247',
  '188',
  '.568',
  '4.12',
  '565',
  '552',
  '2',
  '38',
  '13',
  '0',
  '3461.2',
  '3593',
  '1719',
  '1584',
  '439',
  '948',
  '47',
  '2535',
  '64',
  '6',
  '45',
  '14655',
  '106',
  '4.15',
  '1.312',
  '9.3',
  '1.1',
  '2.5',
  '6.6',
  '2.67',
  ''],
 ['162 Game Avg.',
  '15',
  '11',
  '.568',
  '4.12',
  '34',
  '34',
  '0',
  '2',
  '1',
  '0',
  '211 ',
  '219',
  '105',
  '96',
  '27',
  '58',
  '3',
  '154',
  '4',
  '0',
  '3',
  '892',
  '106',
  '4.15',
  '1.312',
  '9.3',
  '1.1',
  '2.5',
  '6.6',
  '2.67',
  ''],
 [''],
 ['CLE (6 yrs)',
  '75',
  '45',
  '.625',
  '3.92',
  '162',
  '160',
  '0',
  '15',
  '6',
  '0',
  '1029.2',
  '984',
  '483',
  '448',
  '109',
  '419',
  '18',
  '873',
  '21',
  '1',
  '24',
  '4389',
  '121',
  '4.06',
  '1.363',
  '8.6',
  '1.0',
  '3.7',
  '7.6',
  '2.08',
  ''],
 ['LAA (4 yrs)',
  '46',
  '33',
  '.582',
  '4.66',
  '96',
  '95',
  '0',
  '3',
  '1',
  '0',
  '586.2',
  '633',
  '328',
  '3

In [61]:
cols = [col.get_text() for col in colon.soup.find_all('th', {'class':re.compile('poptip')})]
df = pd.DataFrame(columns=cols)
for season in stats:
    temp_df = pd.DataFrame(columns=cols)
    temp_df[['Year','Age','Tm','Lg']] = season[0],'','',''
    temp_df = pd.DataFrame(season[1:]).transpose()
    temp_df.columns = cols
    df = pd.concat([df, temp_df], ignore_index=True)
    return temp_df
pd.set_option('display.max_columns', None)

SyntaxError: 'return' outside function (<ipython-input-61-f8ee25264add>, line 9)

In [64]:
season
cols

['Year',
 'Age',
 'Tm',
 'Lg',
 'W',
 'L',
 'W-L%',
 'ERA',
 'G',
 'GS',
 'GF',
 'CG',
 'SHO',
 'SV',
 'IP',
 'H',
 'R',
 'ER',
 'HR',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'BK',
 'WP',
 'BF',
 'ERA+',
 'FIP',
 'WHIP',
 'H9',
 'HR9',
 'BB9',
 'SO9',
 'SO/W',
 'Awards']

In [71]:
temp_df = pd.DataFrame(columns=cols)
temp_df[['Year','Age','Tm','Lg']] = season[0],'','',''
season = [season[0]] + ['','',''] + season[1:]
temp_df = pd.DataFrame(season).transpose()
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
0,21 Yrs,,,,,,,,,,247,188,0.568,4.12,565,552,2,38,13,0,3461.2,3593,1719,1584,439,948,47,2535,64,6,45,14655,106,4.15,1.312,9.3,1.1,2.5,6.6,2.67,


In [None]:
for stat in stat:
    