# The Truth Behind The Sophomore Slump in Baseball

The aim of the this project is to scientifically test whether the phenomenon known as "The Sophomore Slump" exists in Major League Baseball. In order to answer this question I have used the following methodology:

1. Compile a list of all historic rookie of the year winners in MLB from Baseball Reference
    - This provides us with a dataset of rookies that we know had a good rookie season (You can't have a sophomore slump if you were never good to begin with) 
2. Using the rookie of the year list, parse each player's Baseball Reference page indivdually to extract only the rookie and sophomore season statistics for each player.
3. Create a pandas dataframe that represents players and their season statistics.
4. Use descriptive statistics and visualization techniques to determine whether there is evidence in favor or against the existance of a Sophomore Slump.

In [1]:
# relevant import statements
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import os
import time

# Data Exploration

In [2]:
# baseball reference rookie of year data
url = "https://www.baseball-reference.com/awards/roy.shtml"
rookies_page = requests.get(url)

In [3]:
# parsing data to determine structure
page_text = rookies_page.text
bs_obj = BeautifulSoup(page_text)
table = bs_obj.find('table')
player_name = table.find_all('td')[1].get("csk").split(',')
player_id = table.find_all('td')[1].get('data-append-csv')


In [4]:
player_id

'rodriju01'

In [5]:
player_name

['Rodríguez', 'Julio']

In [6]:
dir(BeautifulSoup)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'DEFAULT_INTERESTING_STRING_TYPES',
 'ROOT_TAG_NAME',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_decode_markup',
 '_feed',
 '_find_all',
 '_find_one',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_linkage_fixer',
 '_markup_is_url',
 '_markup_resembles_filename',
 '_popToTag',
 '_should_pretty_print',
 'append',
 'childGenerator',
 'children',
 'clear',
 'decode',
 'decode_contents',
 'decompose',
 'decomposed',

In [7]:
help(BeautifulSoup.extract)

Help on function extract in module bs4.element:

extract(self, _self_index=None)
    Destructively rips this element out of the tree.
    
    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.
    
    :return: `self`, no longer part of the tree.



In [8]:
# creting a dictionary of player names mapped to their baseball reference id
# these id's are used in the individual player webpage url's
players_dict = {}
for data in table.find_all('td'):
    if data.get('csk') == None:
        continue
    else:
        players_dict[data.get('csk')] = data.get('data-append-csv')
        
players_dict

{'Rodríguez,Julio': 'rodriju01',
 'Harris,Michael': 'harrimi04',
 'Arozarena,Randy': 'arozara01',
 'India,Jonathan': 'indiajo01',
 'Lewis,Kyle': 'lewisky01',
 'Williams,Devin': 'willide03',
 'Alvarez,Yordan': 'alvaryo01',
 'Alonso,Pete': 'alonspe01',
 'Ohtani,Shohei': 'ohtansh01',
 'Acuña,Ronald': 'acunaro01',
 'Judge,Aaron': 'judgeaa01',
 'Bellinger,Cody': 'bellico01',
 'Fulmer,Michael': 'fulmemi01',
 'Seager,Corey': 'seageco01',
 'Correa,Carlos': 'correca01',
 'Bryant,Kris': 'bryankr01',
 'Abreu,José': 'abreujo02',
 'deGrom,Jacob': 'degroja01',
 'Myers,Wil': 'myerswi01',
 'Fernández,José': 'fernajo02',
 'Trout,Mike': 'troutmi01',
 'Harper,Bryce': 'harpebr03',
 'Hellickson,Jeremy': 'hellije01',
 'Kimbrel,Craig': 'kimbrcr01',
 'Feliz,Neftalí': 'felizne01',
 'Posey,Buster': 'poseybu01',
 'Bailey,Andrew': 'bailean01',
 'Coghlan,Chris': 'coghlch01',
 'Longoria,Evan': 'longoev01',
 'Soto,Geovany': 'sotoge01',
 'Pedroia,Dustin': 'pedrodu01',
 'Braun,Ryan': 'braunry02',
 'Verlander,Justin': 

In [9]:
len(players_dict)

152

In [10]:
152/10

15.2

In [11]:
trout_page = requests.get('https://www.baseball-reference.com/players/' + players_dict['Trout,Mike'][0] + '/' + players_dict['Trout,Mike'] + '.shtml')
trout_page.text
trout_bs = BeautifulSoup(trout_page.text)
trout_bs.find_all('table')[1].find('caption').text
for table in trout_bs.find_all('table'):
    if table.find('caption').text == 'Standard Batting':
        batting_table = table

batting_table

<table class="row_summable sortable stats_table" data-cols-to-freeze="1,3" id="batting_standard">
<caption>Standard Batting</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr>
<th aria-label="Year" class="poptip sort_default_asc show_partial_when_sorting left" data-stat="year_ID" data-tip="A Star indicates an all-star that season.&lt;br&gt;A Ring indicates the player appeared in WS for winning team." scope="col">Year</th>
<th aria-label="Age" class="poptip sort_default_asc show_partial_when_sorting center" data-stat="age" data-tip="Player’s age at midnight of June 30th of that year" scope="col">Age</th>
<th aria-label="Tm" class="poptip sort_default_asc show_partial_when_sorting center" data-stat="team_ID" scope="col">Tm</th>
<th aria-label="Lg" class="poptip sort_default_asc center" data-stat="lg_ID" data-tip="&lt;

In [12]:
hellickson_page = requests.get('https://www.baseball-reference.com/players/' + players_dict['Hellickson,Jeremy'][0] + '/' + players_dict['Hellickson,Jeremy'] + '.shtml')
hellickson_page.text
hellickson_bs = BeautifulSoup(hellickson_page.text)

In [13]:
boolean = False
for item in hellickson_bs.find_all('p'):

    if "pitcher" in item.text.lower():
        boolean = True
if boolean == True:
    print("This Player is a pitcher")
else:
    print("This player is a hitter")


This Player is a pitcher


In [14]:
table_header = []
table_header.append("name")
for th in batting_table.find_all('th'):
    if th.text == "Awards":
        table_header.append(th.text)
        break
    table_header.append(th.text)
    

table_header

['name',
 'Year',
 'Age',
 'Tm',
 'Lg',
 'G',
 'PA',
 'AB',
 'R',
 'H',
 '2B',
 '3B',
 'HR',
 'RBI',
 'SB',
 'CS',
 'BB',
 'SO',
 'BA',
 'OBP',
 'SLG',
 'OPS',
 'OPS+',
 'TB',
 'GDP',
 'HBP',
 'SH',
 'SF',
 'IBB',
 'Pos',
 'Awards']

In [15]:
batting_table.find_all('tr')[6]

<tr class="full" id="batting_standard.2012"><th class="left" csk="2012" data-stat="year_ID" scope="row">2012<span class="sr_star"></span></th><td class="right" data-stat="age">20</td><td class="left" data-stat="team_ID"><a href="/teams/LAA/2012.shtml" title="Los Angeles Angels of Anaheim">LAA</a></td><td class="left" data-stat="lg_ID"><a href="/leagues/AL/2012.shtml">AL</a></td><td class="right" data-stat="G">139</td><td class="right" data-stat="PA">639</td><td class="right" data-stat="AB">559</td><td class="right" data-stat="R"><strong><em>129</em></strong></td><td class="right" data-stat="H">182</td><td class="right" data-stat="2B">27</td><td class="right" data-stat="3B">8</td><td class="right" data-stat="HR">30</td><td class="right" data-stat="RBI">83</td><td class="right" data-stat="SB"><strong><em>49</em></strong></td><td class="right" data-stat="CS">5</td><td class="right" data-stat="BB">67</td><td class="right" data-stat="SO">139</td><td class="right" data-stat="batting_avg">.32

In [16]:
i = 0
for row in batting_table.find_all('tr'):
    if "RoY-1" in row.text:
        rookie_year = row
        break
    i +=1
rookie_year
sophomore_year = batting_table.find_all('tr')[i+1]
sophomore_year

<tr class="full" id="batting_standard.2013"><th class="left" csk="2013" data-stat="year_ID" scope="row">2013<span class="sr_star"></span></th><td class="right" data-stat="age">21</td><td class="left" data-stat="team_ID"><a href="/teams/LAA/2013.shtml" title="Los Angeles Angels of Anaheim">LAA</a></td><td class="left" data-stat="lg_ID"><a href="/leagues/AL/2013.shtml">AL</a></td><td class="right" data-stat="G">157</td><td class="right" data-stat="PA">716</td><td class="right" data-stat="AB">589</td><td class="right" data-stat="R"><strong>109</strong></td><td class="right" data-stat="H">190</td><td class="right" data-stat="2B">39</td><td class="right" data-stat="3B">9</td><td class="right" data-stat="HR">27</td><td class="right" data-stat="RBI">97</td><td class="right" data-stat="SB">33</td><td class="right" data-stat="CS">7</td><td class="right" data-stat="BB"><strong>110</strong></td><td class="right" data-stat="SO">136</td><td class="right" data-stat="batting_avg">.323</td><td class="

In [17]:
rookie_year.find('th').get('csk')

'2012'

In [18]:
rookie_stats = []
rookie_stats.append("Trout, Mike")
rookie_stats.append(rookie_year.find('th').get('csk'))
for td in rookie_year.find_all('td'):
    rookie_stats.append(td.text)
rookie_stats

['Trout, Mike',
 '2012',
 '20',
 'LAA',
 'AL',
 '139',
 '639',
 '559',
 '129',
 '182',
 '27',
 '8',
 '30',
 '83',
 '49',
 '5',
 '67',
 '139',
 '.326',
 '.399',
 '.564',
 '.963',
 '168',
 '315',
 '7',
 '6',
 '0',
 '7',
 '4',
 '*87/9H',
 'AS,MVP-2,RoY-1,SS']

In [19]:
sophomore_stats = []
sophomore_stats.append("Trout, Mike")
sophomore_stats.append(sophomore_year.find('th').get('csk'))
for td in sophomore_year.find_all('td'):
    sophomore_stats.append(td.text)
sophomore_stats

['Trout, Mike',
 '2013',
 '21',
 'LAA',
 'AL',
 '157',
 '716',
 '589',
 '109',
 '190',
 '39',
 '9',
 '27',
 '97',
 '33',
 '7',
 '110',
 '136',
 '.323',
 '.432',
 '.557',
 '.988',
 '179',
 '328',
 '8',
 '9',
 '0',
 '8',
 '10',
 '*87/D',
 'AS,MVP-2,SS']

In [20]:
trout_df = pd.DataFrame(data = [rookie_stats, sophomore_stats], columns = table_header)
trout_df

Unnamed: 0,name,Year,Age,Tm,Lg,G,PA,AB,R,H,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
0,"Trout, Mike",2012,20,LAA,AL,139,639,559,129,182,...,0.963,168,315,7,6,0,7,4,*87/9H,"AS,MVP-2,RoY-1,SS"
1,"Trout, Mike",2013,21,LAA,AL,157,716,589,109,190,...,0.988,179,328,8,9,0,8,10,*87/D,"AS,MVP-2,SS"


# Building the Rookie and Sophomore Data Frame

In [21]:
# rookie_df = pd.DataFrame(columns = table_header)
# sophomore_df = pd.DataFrame(columns = table_header)
# sophomore_df

# sophomore_df.loc[0]=sophomore_stats
# sophomore_df

In [22]:
rookie_df = pd.DataFrame(columns = table_header)
sophomore_df = pd.DataFrame(columns = table_header)

count = 0

for player in players_dict:
    player_page = requests.get('https://www.baseball-reference.com/players/' + players_dict[player][0] + '/' + players_dict[player] + '.shtml')
    player_bs = BeautifulSoup(player_page.text)

    for item in player_bs.find_all('p'):
        if "pitcher" in item.text.lower():
            print(player)
    
#     try:
#         player_bs.find_all('table')[1].find('caption').text
#     except IndexError:
#         continue
#         #player is pitcher and data is not found
#     print(players_dict[player], "was found")
#     for table in player_bs.find_all('table'):
#         if table.find('caption').text == 'Standard Batting':
#             player_batting_table = table
#     player_batting_table

#     i = 0
#     for row in player_batting_table.find_all('tr'):
#         if "RoY-1" in row.text:
#             rookie_year = row
#             break
#         i +=1
#     rookie_year
#     sophomore_year = batting_table.find_all('tr')[i+1]
    
    
#     rookie_stats = []
#     rookie_stats.append(player)
#     rookie_stats.append(rookie_year.find('th').get('csk'))
#     for td in rookie_year.find_all('td'):
#         rookie_stats.append(td.text)
        
#     sophomore_stats = []
#     sophomore_stats.append(player)
#     sophomore_stats.append(sophomore_year.find('th').get('csk'))
#     for td in sophomore_year.find_all('td'):
#         sophomore_stats.append(td.text)
        
#     rookie_df.loc[count] = rookie_stats
#     sophomore_df.loc[count] = sophomore_stats
    
#     count += 1
    
    


Rodríguez,Julio
Harris,Michael
Arozarena,Randy
India,Jonathan
Lewis,Kyle
Williams,Devin
Williams,Devin
Williams,Devin
Alvarez,Yordan
Alonso,Pete
Ohtani,Shohei
Ohtani,Shohei
Ohtani,Shohei
Ohtani,Shohei


KeyboardInterrupt: 

In [23]:
html_dict = {}
for player in players_dict:
    player_page = requests.get('https://www.baseball-reference.com/players/' + players_dict[player][0] + '/' + players_dict[player] + '.shtml')

    
    html_dict[player] = player_page.text
    time.sleep(10) #must include a break in loop to avoid baseball reference rate limits


In [35]:
html_dict['Trout,Mike']



In [32]:
dumb_dict = {
    'NWA':'Easy E',
    'The Beatles':'John Lennon',
    'Nirvana':'Kurt Cobain'
}
with open('music.json', "w") as file:
    json.dump(dumb_dict, file)

In [36]:
with open("player_html.json", "w") as outfile:
    json.dump(html_dict, outfile)