In [2]:
import requests, os, datetime, json
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Goals leaderboard page for the 2020 season, used by the exploratory cells below.
url = (
    "https://www.mlssoccer.com/stats/season"
    "?year=2020&group=g"
)

In [3]:
# Download the page and parse it.
# timeout: without one, requests waits forever on a stalled connection and the
# kernel hangs; 30 s applies to connecting and to each read.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

In [4]:
# Peek at the second <tr> on the page (index 1).
first_player_row = soup.find_all('tr')[1]
first_player_row.contents[3].text  # evaluated, but only the last expression is displayed
first_player_row.contents[0]

<td data-title="Player"><a href="/players/diego-rossi">Diego Rossi</a></td>

# Main Loop

In [4]:
# Scrape four stat tables (goals, assists, fouls, goalkeeping) for every season
# in YEARS, merge the three outfield tables per season, and save two CSVs:
# one for outfield players and one for goalkeepers.
#
# To switch between post season and regular season, change only SEASON_QUERY,
# SEASON_LABEL and FILE_PREFIX below.

YEARS = range(1996, 2020)          # seasons 1996..2019 inclusive
SEASON_QUERY = "&season_type=PS"   # use "" if regular season is wanted
SEASON_LABEL = 'post'              # value written to the 'Season' column (e.g. 'reg' for regular season)
FILE_PREFIX = 'post_'              # use '' for the regular season file names
REQUEST_TIMEOUT = 30               # seconds; avoids hanging forever on a stalled connection
OUTPUT_DIR = os.path.join('data', 'player_stats')


def _direct_cells(row):
    """Return the tags that are direct children of a <tr>, skipping bare strings.

    A row's .contents can also hold whitespace strings, which have no cell data.
    """
    return row.find_all(True, recursive=False)


def _scrape_stat_table(first_url):
    """Download one stat table, following <link rel="next"> across all its pages.

    Returns (header, titles, rows):
      header - text of each cell in the first <tr> of the first page
      titles - 'title' attribute of each of those cells (None when absent)
      rows   - one list of cell texts per remaining <tr>, over all pages
    Raises requests.HTTPError on a bad response and ValueError when the first
    page contains no <tr> at all.
    """
    header, titles, rows = None, None, []
    page_url = first_url
    while page_url:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        trs = soup.find_all('tr')

        if header is None:
            if not trs:
                raise ValueError('No table rows found at ' + page_url)
            header_cells = _direct_cells(trs[0])
            header = [cell.text for cell in header_cells]
            titles = [cell.get('title') for cell in header_cells]

        # The first <tr> of every page is the header row; the rest are players.
        rows.extend([cell.text for cell in _direct_cells(tr)] for tr in trs[1:])

        # The last page has no rel="next" link, which ends the loop.
        next_link = soup.find('link', attrs={'rel': 'next'})
        page_url = next_link.get('href') if next_link is not None else None
    return header, titles, rows


# Per-season frames are collected here and concatenated once after the loop,
# so every season appears exactly once in the totals.
outfield_frames = []
gk_frames = []

# For each year
for year in YEARS:
    url_insert = str(year) + SEASON_QUERY
    starting_urls = ["https://www.mlssoccer.com/stats/season?year={}&group=g".format(url_insert),
                    "https://www.mlssoccer.com/stats/season?year={}&group=assists".format(url_insert),
                    "https://www.mlssoccer.com/stats/season?year={}&group=fouls&sort=desc&order=YC".format(url_insert),
                    "https://www.mlssoccer.com/stats/season?year={}&group=goalkeeping".format(url_insert)]
    # Both lists are reset every season, so after the loop they describe only
    # the last season processed (later cells rely on that).
    dfs = []
    descriptions = []
    print(year)

    # For each of four stats
    for url in starting_urls:
        print('Examining:  ' + url)
        header, titles, players = _scrape_stat_table(url)
        descriptions.extend(zip(header, titles))
        dfs.append(pd.DataFrame(players, columns=header))
    # end for - gathering stats for year

    # merge first 3 ('Club' is deliberately not part of the join keys)
    goals_assists_df = pd.merge(dfs[0], dfs[1], how='left',
                                on=['Player', 'POS', 'GP', 'GS', 'A'])
    outfield_df = pd.merge(goals_assists_df, dfs[2], how='left',
                           on=['Player', 'POS', 'GP', 'GS', 'A', 'SHTS', 'SOG', 'MINS', 'G'])

    # shots on goal percentage column (shots on goal divided by shots);
    # scraped values are strings, hence the casts
    outfield_df['SOG%'] = (100 * outfield_df['SOG'].astype('int') / outfield_df['SHTS'].astype('int')).round(2)

    # gk_df is the same object as dfs[3], so the columns added below also
    # appear on dfs[3]
    gk_df = dfs[3]

    outfield_df['Year'] = year
    gk_df['Year'] = year
    outfield_df['Season'] = SEASON_LABEL
    gk_df['Season'] = SEASON_LABEL

    outfield_frames.append(outfield_df)
    gk_frames.append(gk_df)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
total_of_df = pd.concat(outfield_frames, ignore_index=True)
total_gk_df = pd.concat(gk_frames, ignore_index=True)

os.makedirs(OUTPUT_DIR, exist_ok=True)  # do not lose the whole scrape to a missing folder
file_stem = '{}{}_{}'.format(FILE_PREFIX, YEARS[0], YEARS[-1])  # 'post_1996_2019'
total_of_df.to_csv(os.path.join(OUTPUT_DIR, file_stem + '_all_players.csv'), index=False)
total_gk_df.to_csv(os.path.join(OUTPUT_DIR, file_stem + '_goalkeeper.csv'), index=False)

1996
Examining:  https://www.mlssoccer.com/stats/season?year=1996&season_type=PS&group=g
Examining:  https://www.mlssoccer.com/stats/season?year=1996&season_type=PS&group=assists
Examining:  https://www.mlssoccer.com/stats/season?year=1996&season_type=PS&group=fouls&sort=desc&order=YC
Examining:  https://www.mlssoccer.com/stats/season?year=1996&season_type=PS&group=goalkeeping
1997
Examining:  https://www.mlssoccer.com/stats/season?year=1997&season_type=PS&group=g
Examining:  https://www.mlssoccer.com/stats/season?year=1997&season_type=PS&group=assists
Examining:  https://www.mlssoccer.com/stats/season?year=1997&season_type=PS&group=fouls&sort=desc&order=YC
Examining:  https://www.mlssoccer.com/stats/season?year=1997&season_type=PS&group=goalkeeping
1998
Examining:  https://www.mlssoccer.com/stats/season?year=1998&season_type=PS&group=g
Examining:  https://www.mlssoccer.com/stats/season?year=1998&season_type=PS&group=assists
Examining:  https://www.mlssoccer.com/stats/season?year=1998&

Examining:  https://www.mlssoccer.com/stats/season?year=2017&season_type=PS&group=goalkeeping
2018
Examining:  https://www.mlssoccer.com/stats/season?year=2018&season_type=PS&group=g
Examining:  https://www.mlssoccer.com/stats/season?year=2018&season_type=PS&group=assists
Examining:  https://www.mlssoccer.com/stats/season?year=2018&season_type=PS&group=fouls&sort=desc&order=YC
Examining:  https://www.mlssoccer.com/stats/season?year=2018&season_type=PS&group=goalkeeping
2019
Examining:  https://www.mlssoccer.com/stats/season?year=2019&season_type=PS&group=g
Examining:  https://www.mlssoccer.com/stats/season?year=2019&season_type=PS&group=assists
Examining:  https://www.mlssoccer.com/stats/season?year=2019&season_type=PS&group=fouls&sort=desc&order=YC
Examining:  https://www.mlssoccer.com/stats/season?year=2019&season_type=PS&group=goalkeeping


In [12]:
# Combine regular season and post season csvs
post_all_df = pd.read_csv(os.path.join('data', 'player_stats', 'post_1996_2019_all_players.csv'))
post_gk_df = pd.read_csv(os.path.join('data', 'player_stats', 'post_1996_2019_goalkeeper.csv'))
all_df = pd.read_csv(os.path.join('data', 'player_stats', '1996_2019_all_players.csv'))
gk_df = pd.read_csv(os.path.join('data', 'player_stats', '1996_2019_goalkeeper.csv'))

# Every row read from the regular-season files is labelled 'reg' before the
# post-season rows are added.
gk_df['Season'] = 'reg'
all_df['Season'] = 'reg'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
gk_df = pd.concat([gk_df, post_gk_df], ignore_index=True)
all_df = pd.concat([all_df, post_all_df], ignore_index=True)

# NOTE: the saves below overwrite the regular-season input files. Once they
# have been run, re-running this cell would relabel every row as 'reg' and add
# the post-season rows a second time, so they stay disabled.
#all_df.to_csv(os.path.join('data', 'player_stats', '1996_2019_all_players.csv'), index=False)
#gk_df.to_csv(os.path.join('data', 'player_stats', '1996_2019_goalkeeper.csv'), index=False)

# Testing
These cells were written to test a single year. After the main loop runs, `dfs` holds only the last year scraped, so they now inspect that year.

In [6]:
# Row count of each scraped stat table, one per line.
for stat_table in dfs:
    print(stat_table.shape[0])

649
649
649
87


In [7]:
# First scraped table: row count, then a preview of the first five rows.
goals_table = dfs[0]
print(goals_table.shape[0])
goals_table.head(5)

649


Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,GWG,PKG/A,HmG,RdG,G/90min,SC%
0,Diego Rossi,LAFC,F,4,4,360,6,2,19,13,0,1/1,6,0,1.5,31.6
1,Ayo Akinola,TOR,F,2,2,176,5,0,9,8,1,0/0,2,3,2.56,55.6
2,Gyasi Zardes,CLB,F,4,4,342,4,0,7,5,1,0/0,1,3,1.05,57.1
3,Chris Mueller,ORL,F,5,4,351,4,0,12,8,1,0/0,3,1,1.03,33.3
4,Lucas Zelarayan,CLB,M,4,4,295,3,2,8,4,2,0/0,2,1,0.92,37.5


In [8]:
# Second scraped table: row count, then a preview of the first five rows.
assists_table = dfs[1]
print(assists_table.shape[0])
assists_table.head(5)

649


Unnamed: 0,Player,Club,POS,GP,GS,A,GWA,HmA,RdA,A/90min
0,Alejandro Pozuelo,TOR,M,4,4,5,0,2,3,1.35
1,Francisco Ginella,LAFC,M,4,1,3,0,3,0,1.73
2,Latif Blessing,LAFC,F,4,4,3,0,3,0,0.75
3,Darwin Quintero,HOU,F,3,2,2,0,0,2,0.97
4,Djordje Mihailovic,CHI,M,4,2,2,0,0,2,0.9


In [9]:
# Third scraped table: row count, then a preview of the first five rows.
fouls_table = dfs[2]
print(fouls_table.shape[0])
fouls_table.head(5)

649


Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,FC,FS,OFF,YC,RC
0,Junior Moreno,DC,M,3,3,224,0,0,1,1,6,1,0,3,1
1,Jose Andres Martinez,PHI,M,3,3,234,0,0,3,1,7,4,0,3,0
2,Fabian Herbers,CHI,F,4,3,282,0,0,5,1,5,2,1,3,0
3,Jakob Glesnes,PHI,D,5,3,294,1,0,2,1,4,0,0,3,0
4,Gadi Kinda,SKC,M,4,4,322,2,2,9,3,8,8,0,3,0


In [10]:
# Fourth scraped table: row count, then a preview of the first five rows.
goalkeeping_table = dfs[3]
print(goalkeeping_table.shape[0])
goalkeeping_table.head(5)

87


Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,PKG/A,W,L,T,ShO,W%,Sv%
0,Daniel Vega,SJ,GK,5,5,450,25,16,10,2.0,1/2,2,1,2,2,40.0,64.0
1,Stefan Frei,SEA,GK,5,5,450,21,17,4,0.8,0/0,2,1,2,2,40.0,81.0
2,Andre Blake,PHI,GK,5,5,450,34,27,7,1.4,0/0,2,1,2,1,40.0,79.4
3,Pedro Gallese,ORL,GK,5,5,450,18,13,5,1.0,0/0,2,1,2,1,40.0,72.2
4,Maxime Crepeau,VAN,GK,4,4,328,22,12,10,2.5,1/1,1,3,0,1,25.0,54.5


# Merging and Saving for One Year

In [11]:
# Join the first three scraped tables into a single outfield-player frame.
# The third table shares more columns with the first, so its join needs more keys.
assist_keys = ['Player', 'Club', 'POS', 'GP', 'GS', 'A']
foul_keys = assist_keys + ['SHTS', 'SOG', 'MINS', 'G']
x = dfs[0].merge(dfs[1], how='left', on=assist_keys, copy=True)
outfield_df = x.merge(dfs[2], how='left', on=foul_keys, copy=True)

# Shots-on-goal percentage: shots on goal divided by shots, as a percentage with 2 decimals.
shots_on_goal = outfield_df['SOG'].astype('int')
shots = outfield_df['SHTS'].astype('int')
outfield_df['SOG%'] = (100 * shots_on_goal / shots).round(2)
outfield_df.head()

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,GWA,HmA,RdA,A/90min,FC,FS,OFF,YC,RC,SOG%
0,Diego Rossi,LAFC,F,4,4,360,6,2,19,13,...,1,2,0,0.5,3,7,4,0,0,68.42
1,Ayo Akinola,TOR,F,2,2,176,5,0,9,8,...,0,0,0,0.0,3,1,2,0,0,88.89
2,Gyasi Zardes,CLB,F,4,4,342,4,0,7,5,...,0,0,0,0.0,0,5,5,0,0,71.43
3,Chris Mueller,ORL,F,5,4,351,4,0,12,8,...,0,0,0,0.0,1,4,0,0,0,66.67
4,Lucas Zelarayan,CLB,M,4,4,295,3,2,8,4,...,0,0,2,0.61,2,8,0,0,0,50.0


In [12]:
# The fourth scraped table is kept as the goalkeeper frame (same object as
# dfs[3], not a copy); preview its first five rows.
gk_df = dfs[3]
gk_df.head(5)

Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,PKG/A,W,L,T,ShO,W%,Sv%
0,Daniel Vega,SJ,GK,5,5,450,25,16,10,2.0,1/2,2,1,2,2,40.0,64.0
1,Stefan Frei,SEA,GK,5,5,450,21,17,4,0.8,0/0,2,1,2,2,40.0,81.0
2,Andre Blake,PHI,GK,5,5,450,34,27,7,1.4,0/0,2,1,2,1,40.0,79.4
3,Pedro Gallese,ORL,GK,5,5,450,18,13,5,1.0,0/0,2,1,2,1,40.0,72.2
4,Maxime Crepeau,VAN,GK,4,4,328,22,12,10,2.5,1/1,1,3,0,1,25.0,54.5


In [26]:
# Create today's output folder under data/player_stats, named
# <month>_<day>_<year> without zero padding.
today = datetime.datetime.now()
new_folder = "{}_{}_{}".format(today.month, today.day, today.year)
path = os.path.join('data', 'player_stats', new_folder)
# exist_ok=True lets this cell be re-run on the same day; without it the
# second run raises FileExistsError.
os.makedirs(path, exist_ok=True)

In [27]:
# Save the outfield and goalkeeper frames into the dated folder, without the index column.
all_players_csv = os.path.join(path, 'all_players.csv')
goalkeeper_csv = os.path.join(path, 'goalkeeper.csv')
outfield_df.to_csv(all_players_csv, index=False)
gk_df.to_csv(goalkeeper_csv, index=False)

In [30]:
# Read the saved outfield CSV back in to check what was written.
all_players_csv = os.path.join(path, 'all_players.csv')
pd.read_csv(all_players_csv)

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,GWA,HmA,RdA,A/90min,FC,FS,OFF,YC,RC,SOG%
0,Diego Rossi,LAFC,F,4,4,360,6,2,19,13,...,1,2,0,0.50,3,7,4,0,0,68.42
1,Ayo Akinola,TOR,F,2,2,176,5,0,9,8,...,0,0,0,0.00,3,1,2,0,0,88.89
2,Gyasi Zardes,CLB,F,4,4,342,4,0,7,5,...,0,0,0,0.00,0,5,5,0,0,71.43
3,Chris Mueller,ORL,F,5,4,351,4,0,12,8,...,0,0,0,0.00,1,4,0,0,0,66.67
4,Lucas Zelarayan,CLB,M,4,4,295,3,2,8,4,...,0,0,2,0.61,2,8,0,0,0,50.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,Danny Hoesen,SJ,F,4,2,146,0,0,8,1,...,0,0,0,0.00,2,0,1,0,0,12.50
645,Jozy Altidore,TOR,F,2,2,169,0,0,8,4,...,0,0,0,0.00,3,6,3,1,0,50.00
646,Aleksandar Katai,LA,M,2,2,117,0,0,8,0,...,0,0,0,0.00,5,2,0,2,0,0.00
647,Andre Shinyashiki,COL,F,3,3,216,0,0,9,4,...,0,0,0,0.00,2,3,2,1,0,44.44


In [None]:
# postseason - just two csvs

In [64]:
# Build the set of unique entries in `descriptions`; the result is discarded
# and the trailing ';' suppresses the cell output.
set(descriptions);

In [67]:
# Write the unique entries of `descriptions` to column_descriptions.json.
# Sorting keeps the file content stable between runs (set order is arbitrary);
# a missing second element (None) sorts as ''.
unique_descriptions = sorted(set(descriptions), key=lambda pair: (pair[0], pair[1] or ''))
# The with-block closes the file even if json.dump raises.
with open('column_descriptions.json', 'w') as f:
    json.dump(unique_descriptions, f)