In [None]:
import os
import pandas as pd
import time
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [None]:
def stats_scrape(year):
    """Scrape one per-game player stats page from basketball-reference.com.

    Parameters
    ----------
    year : int
        Year inserted into the page URL (presumably the season's ending
        year, e.g. 2011 for 2010-11 -- confirm against the site).

    Returns
    -------
    pandas.DataFrame
        One row per table row that has a 'Player' value. The first column,
        'Year', holds ``year - 1``; the remaining columns are named after
        the header cells of the first table row, minus the first header.
    """
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
    # Use a context manager so the HTTP response is closed once the markup
    # has been parsed, instead of leaking the connection.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, features="html")

    table_rows = soup.findAll('tr')
    # The first <tr> carries the column headers. Its first <th> is dropped
    # because body rows below only collect <td> cells, not the leading <th>.
    headers = [th.getText() for th in table_rows[0].findAll('th')][1:]
    player_stats = [[td.getText() for td in row.findAll('td')] for row in table_rows[1:]]

    player_stats_df = pd.DataFrame(player_stats, columns=headers)
    # Rows that yielded no <td> cells have no 'Player' value; discard them.
    player_stats_df = player_stats_df.dropna(subset=['Player'])
    player_stats_df.insert(0, "Year", year - 1)
    return player_stats_df

# Years passed to stats_scrape (2011 through 2020 inclusive).
year_list = list(range(2011,2021))

# Collect one frame per year and concatenate once at the end, rather than
# re-copying an ever-growing frame on every iteration.
season_frames = []
for year in year_list:
    reg_season_df = stats_scrape(year)
    season_frames.append(reg_season_df)
    # Pause between requests to avoid hammering the site.
    # NOTE(review): confirm 2 s stays within the site's rate limit.
    time.sleep(2)
all_player_stats_df = pd.concat(season_frames, ignore_index=True)

# Raw string: the previous non-raw literal contained the invalid escape "\p",
# which yields the same path today but triggers a SyntaxWarning on newer Pythons.
all_player_stats_df.to_csv(r'C:\prject\player_stats_2010_2020.csv')

In [None]:
# Bare expression as the cell's last line: renders the combined player stats frame.
all_player_stats_df

In [None]:
# Years scraped for every team that is NOT listed in teams_relocate_rename_dict.
# The upper bound is 2021 (exclusive) so that 2020 is included, matching the
# last year listed for the renamed/relocated franchises below; the previous
# range(2010, 2020) silently skipped 2020 for all other teams.
season_list = range(2010, 2021)

# Team abbreviations used in the schedule page URLs, including both the old
# and new codes of franchises listed in teams_relocate_rename_dict.
team_short_name=['ATL','BOS','BRK','CHA','CHI','CHO','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM',
       'MIA','MIL','MIN','NJN','NOH','NOP','NYK','OKC','ORL','PHI','PHO','POR','SAC','SAS','TOR','UTA','WAS']

# For abbreviations that are only scraped for part of the period: the years
# (as strings) to request for each code. Each old/new pair splits 2010-2020.
teams_relocate_rename_dict = {
    'NJN': ['2010', '2011', '2012'],
    'BRK': ['2013', '2014', '2015', '2016', '2017', '2018', '2019','2020'],
    'CHA': ['2010', '2011', '2012', '2013','2014'],
    'CHO': ['2015', '2016', '2017', '2018', '2019','2020'],
    'NOH': ['2010','2011','2012','2013'],
    'NOP': ['2014', '2015', '2016', '2017', '2018', '2019','2020'],
}

def sched_scrape_cleaning(team_abrv,year):
    """Scrape and tidy one team's game list page from basketball-reference.com.

    Parameters
    ----------
    team_abrv : str
        Team abbreviation inserted into the page URL.
    year : int or str
        Year inserted into the page URL; converted with ``int`` when the
        'Year' column is derived.

    Returns
    -------
    pandas.DataFrame
        Columns 'Team', 'Year', 'Game_num', 'Date', 'Away_flag', 'Opponent',
        'OT_flag'. 'Game_num' counts the kept rows from 1, 'Year' is the
        string ``str(int(year) - 1)`` and 'Team' is the text of the third
        ``<span itemprop="name">`` element on the page.
    """
    url = "https://www.basketball-reference.com/teams/{}/{}_games.html".format(team_abrv, year)
    # Use a context manager so the HTTP response is closed once the markup
    # has been parsed, instead of leaking the connection.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, features="html")

    rows = soup.findAll('tr')
    sched_data = [[td.getText() for td in row.findAll('td')] for row in rows]
    df = pd.DataFrame(sched_data)

    # Keep only the <td> positions 0, 4, 5 and 7. Selecting the kept positions
    # (rather than dropping ten others by label) gives the same result on a
    # 14-column table and does not break if trailing columns are added.
    df = df[[0, 4, 5, 7]]
    df.columns = ['Date','Away_flag','Opponent','OT_flag']

    # Rows that yielded no <td> cells have no 'Date'; discard them, then
    # renumber so the index can be used as a running game counter.
    df = df.dropna(subset=['Date']).reset_index()
    df['Game_num'] = df.index + 1

    # NOTE(review): relies on the third itemprop="name" span being the team
    # name -- confirm against the current page layout.
    team_name = soup.findAll("span", itemprop="name")[2]
    df['Team'] = team_name.text
    df['Year'] = str(int(year) - 1)
    return df[['Team','Year', 'Game_num','Date','Away_flag','Opponent','OT_flag']]

# Scrape every (team, year) schedule page and stack the results.
# Frames are collected in a list and concatenated once, instead of
# re-copying a growing frame after every request.
schedule_frames = []
for team in team_short_name:
    # Abbreviations listed in the relocation/rename dict are only requested
    # for their own years; every other team uses the common season_list.
    years_for_team = teams_relocate_rename_dict.get(team, season_list)
    for year in years_for_team:
        single_season_df = sched_scrape_cleaning(team, year)
        schedule_frames.append(single_season_df)
        # Pause between requests to avoid hammering the site.
        # NOTE(review): confirm 2 s stays within the site's rate limit.
        time.sleep(2)
all_teams_sched_df = pd.concat(schedule_frames, ignore_index=True)
all_teams_sched_df.to_csv(r'C:\prject\all_teams_schedule_2010_2020.csv')

In [None]:
# Bare expression as the cell's last line: renders the combined schedule frame.
all_teams_sched_df

In [None]:
def _clean_transactions_page(raw_table):
    """Tidy one raw search-results table returned by ``pd.read_html``.

    Drops the row labelled 0, removes the first two characters of column 3,
    names the five columns and discards 'Acquired'.

    Returns a new DataFrame with columns 'Date', 'Team', 'Player', 'Notes'.
    """
    cleaned = raw_table.drop(index=0)
    # Presumably strips a bullet-and-space prefix from the name -- TODO confirm.
    cleaned[3] = cleaned[3].str[2:]
    cleaned.columns = ['Date','Team','Acquired','Player','Notes']
    return cleaned.drop(columns=['Acquired'])


url = "https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate=2010-08-01&EndDate=2020-08-27&InjuriesChkBx=yes&PersonalChkBx=yes&Submit=Search"
response = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page for links.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

df_first_page = _clean_transactions_page(pd.read_html(url)[0])

# Look up the anchors once; they do not change while the loop runs.
page_links = soup.findAll('a')

page_frames = [df_first_page]
# The first four and last four anchors are skipped.
# NOTE(review): presumably those are non-pagination links -- confirm.
for one_a_tag in page_links[4:-4]:
    link = one_a_tag['href']
    download_url = 'https://www.prosportstransactions.com/basketball/Search/'+ link
    page_frames.append(_clean_transactions_page(pd.read_html(download_url)[0]))
    # Pause between requests to avoid hammering the site.
    time.sleep(3)

# Single concatenation instead of re-copying a growing frame per page.
appended_data = pd.concat(page_frames, ignore_index=True)
appended_data.to_csv(r'C:\prject\prosportstransactions_scrape_missedgames_2010_2020.csv')