In [11]:
import urllib.request as urllib
from bs4 import BeautifulSoup
import pandas as pd
from random import randint
from time import sleep

# DATE VARIABLES MUST BE YYYY-MM-DD
STARTDATE = '2021-11-08'
ENDDATE = '2021-11-10' # up to, not including
CSV_TITLE = 'test_data_2020.csv'

# Put points into buckets of: 
BUCKET_BY = 5

#Date Stuff
DAYS_PER_MONTH = {'01': 31, '02': 28, '03': 31, '04': 30, '05': 31, '06': 30, '07': 31, '08': 31, '09': 30, 
                  '10': 31, '11': 30, '12': 31}
DAYS_PER_MONTH_leap = {'01': 31, '02': 29, '03': 31, '04': 30, '05': 31, '06': 30, '07': 31, '08': 31, '09': 30,
                       '10': 31,'11': 30, '12': 31}
LEAP_YEAR = ['2020']

In [12]:
teams = ['Atlanta', 'Boston', 'Brooklyn', 'Charlotte', 'Chicago', 'Cleveland', 'Dallas', 'Denver', 'Detroit', 
         'Golden State', 'Houston', 'Indiana', 'LA Clippers', 'LA Lakers', 'Memphis', 'Miami', 'Milwaukee', 
         'Minnesota', 'New Orleans', 'New York', 'Oklahoma City', 'Orlando', 'Philadelphia', 'Phoenix', 'Portland', 
         'Sacramento', 'San Antonio', 'Toronto', 'Utah', 'Washington']
ids = [x for x in range(1,31)]
TEAM_IDS = dict(zip(teams, ids))

In [13]:
def get_date_array():
    temp_date = STARTDATE
    dates = []
    while temp_date != ENDDATE:
        dates.append(temp_date)
        year, month, day = temp_date.split('-')
        # account for leap_years
        if year in LEAP_YEAR:
            days = DAYS_PER_MONTH_leap
        else:
            days = DAYS_PER_MONTH
        # move to next date
        if days[month] > int(day):
            temp_date = '%s-%s-%02d' % (year, month, int(day)+1)
        else:
            if int(month) >= 12: #==
                temp_date = '%s-01-01' % str(int(year)+1)
            else:
                temp_date = '%s-%02d-01' % (year, int(month)+1)
    return dates

In [14]:
def reformat_dates(date_array):
    ret = []
    for date in date_array:
        parts = date.split('-')
        ret.append('month=%s&day=%s&year=%s' % (parts[1], parts[2], parts[0]))
    return ret     

In [62]:
def get_finals():
    columns = ['Team_x', 'Points_x', 'Team_y', 'Points_y', 'Date', 'Home', 'Winner']
    dates = get_date_array()
    dates_for_url = reformat_dates(dates)
    ret = []
    url_finals = 'https://www.basketball-reference.com/boxscores/?'
    for i in range(len(dates_for_url)):
        print('Scraping for: %s' % dates[i])
        html = urllib.urlopen(url_finals+dates_for_url[i])
        soup = BeautifulSoup(html.read())
        html.close()
        scores = soup.find_all('table', class_='teams')
        for score in scores:
            table = score.get_text()
            team1 = table.split('\n')[3:5]
            team2 = table.split('\n')[10:12]
            home_team = team2[0]
            if int(team1[1]) > int(team2[1]):
                winner = team1[0]
            else:
                winner = team2[0]
            
            temp = team1+team2
            temp.append(dates[i])
            temp.append(home_team)
            temp.append(winner)
            ret.append(temp)
        sleep(randint(1,3))
    #print(ret)
    final = pd.DataFrame(ret, columns=columns)
    return final

In [63]:
finals = get_finals()
finals

Scraping for: 2021-11-08
Scraping for: 2021-11-09


Unnamed: 0,Team_x,Points_x,Team_y,Points_y,Date,Home,Winner
0,Brooklyn,95,Chicago,118,2021-11-08,Chicago,Chicago
1,New Orleans,92,Dallas,108,2021-11-08,Dallas,Dallas
2,Miami,96,Denver,113,2021-11-08,Denver,Denver
3,Atlanta,113,Golden State,127,2021-11-08,Golden State,Golden State
4,Charlotte,123,LA Lakers,126,2021-11-08,LA Lakers,LA Lakers
5,Minnesota,118,Memphis,125,2021-11-08,Memphis,Memphis
6,New York,103,Philadelphia,96,2021-11-08,Philadelphia,New York
7,Phoenix,109,Sacramento,104,2021-11-08,Sacramento,Phoenix
8,Portland,109,LA Clippers,117,2021-11-09,LA Clippers,LA Clippers
9,Milwaukee,118,Philadelphia,109,2021-11-09,Philadelphia,Milwaukee


In [64]:
def fix_okc(name):
    if name == 'Okla City':
        return 'Oklahoma City'
    return name

In [65]:
def scrape_data_averages_for():
   
    url = 'https://www.teamrankings.com/nba/stat/points-per-game?date='
    columns = ['Team', 'Season Ave', '3 Game Ave', 'Last Game Ave', 'Home Ave', 'Away Ave', 'Last Season Ave', 'Date']
    float_vals = ['Season Ave', '3 Game Ave', 'Last Game Ave', 'Home Ave', 'Away Ave', 'Last Season Ave']
    temp = []
    for date_str in get_date_array():
        if date_str in finals.Date.tolist():
            print('Scraping for: %s' % date_str)
            html = urllib.urlopen(url + date_str)
            soup = BeautifulSoup(html.read())
            html.close()
            rows = soup.find_all('tr')
            for row in rows[1:]:
                line=[]
                row = row.get_text()
                line = row.split('\n')
                for c in range(len(line)):
                    if line[c] == '--':
                        line[c] = '-99'
                line = line[2:len(line)-1]
                line.append(date_str)
                temp.append(line)
            sleep(randint(1,3))
    ret = pd.DataFrame(temp, columns = columns)
    ret[float_vals] = ret[float_vals].astype(float)
    ret['Team'] = ret['Team'].map(fix_okc)
    ret= ret.replace({-99.0: 'NaN'})
    return ret.drop("Last Season Ave", axis=1)

In [66]:
def scrape_data_averages_against():
   
    url = 'https://www.teamrankings.com/nba/stat/opponent-points-per-game?date='
    columns = ['Team', 'Def Season Ave', 'Def 3 Game Ave', 'Def Last Game Ave', 'Def Home Ave', 'Def Away Ave', 'Def Last Season Ave', 'Date']
    float_vals = ['Def Season Ave', 'Def 3 Game Ave', 'Def Last Game Ave', 'Def Home Ave', 'Def Away Ave', 'Def Last Season Ave']
    temp = []
    for date_str in get_date_array():
        if date_str in finals.Date.tolist():
            print('Scraping for: %s' % date_str)
            html = urllib.urlopen(url + date_str)
            soup = BeautifulSoup(html.read())
            html.close()
            rows = soup.find_all('tr')
            for row in rows[1:]:
                line=[]
                row = row.get_text()
                line = row.split('\n')
                for c in range(len(line)):
                    if line[c] == '--':
                        line[c] = '-99'
                line = line[2:len(line)-1]
                line.append(date_str)
                temp.append(line)
            sleep(randint(1,3))
    ret = pd.DataFrame(temp, columns = columns)
    ret[float_vals] = ret[float_vals].astype(float)
    ret['Team'] = ret['Team'].map(fix_okc)
    ret= ret.replace({-99.0: 'NaN'})
    return ret.drop("Def Last Season Ave", axis=1)

In [67]:
averages_for = scrape_data_averages_for()
averages_against = scrape_data_averages_against()

Scraping for: 2021-11-08
Scraping for: 2021-11-09
Scraping for: 2021-11-08
Scraping for: 2021-11-09


In [68]:
averages_for.head()

Unnamed: 0,Team,Season Ave,3 Game Ave,Last Game Ave,Home Ave,Away Ave,Date
0,Golden State,113.9,120.0,120.0,113.2,115.3,2021-11-08
1,Charlotte,113.5,102.7,106.0,121.8,108.7,2021-11-08
2,New York,112.6,106.7,109.0,113.4,111.8,2021-11-08
3,Utah,111.7,110.3,100.0,116.0,109.9,2021-11-08
4,Miami,111.7,107.0,118.0,110.8,112.8,2021-11-08


In [69]:
averages_against.head()

Unnamed: 0,Team,Def Season Ave,Def 3 Game Ave,Def Last Game Ave,Def Home Ave,Def Away Ave,Date
0,Denver,98.8,102.7,94.0,91.0,105.0,2021-11-08
1,Golden State,100.2,94.7,107.0,97.2,106.3,2021-11-08
2,Miami,100.2,106.7,115.0,98.8,102.0,2021-11-08
3,Toronto,101.8,106.0,116.0,105.6,95.2,2021-11-08
4,LA Clippers,102.6,101.7,106.0,99.6,106.2,2021-11-08


In [70]:
def add_one(num):
    return int(num)+1

In [87]:
def get_all_data():
    test = pd.merge(finals, averages_for, left_on=['Team_x', 'Date'], right_on=['Team', 'Date'], how='left').dropna()
    test = test.drop('Team', axis=1)
    test_2 = pd.merge(test, averages_for, left_on=['Team_y', 'Date'], right_on=['Team', 'Date'], how='left').dropna()
    test_2 = test_2.drop('Team', axis=1)
    test_3 = pd.merge(test_2, averages_against, left_on=['Team_x', 'Date'], right_on=['Team', 'Date'], how='left').dropna()
    test_3 = test_3.drop('Team', axis=1)
    test_4 = pd.merge(test_3, averages_against, left_on=['Team_y', 'Date'], right_on=['Team', 'Date'], how='left').dropna()
    return test_4.drop('Team', axis=1)

In [89]:
df = get_all_data()
df

Unnamed: 0,Team_x,Points_x,Team_y,Points_y,Date,Home,Winner,Season Ave_x,3 Game Ave_x,Last Game Ave_x,...,Def Season Ave_x,Def 3 Game Ave_x,Def Last Game Ave_x,Def Home Ave_x,Def Away Ave_x,Def Season Ave_y,Def 3 Game Ave_y,Def Last Game Ave_y,Def Home Ave_y,Def Away Ave_y
0,Brooklyn,95,Chicago,118,2021-11-08,Chicago,Chicago,106.1,109.7,116.0,...,103.3,100.3,103.0,100.7,107.2,102.7,110.3,114.0,102.2,103.2
1,New Orleans,92,Dallas,108,2021-11-08,Dallas,Dallas,101.4,94.7,85.0,...,112.7,116.7,126.0,113.8,112.0,106.1,112.3,104.0,106.6,105.5
2,Miami,96,Denver,113,2021-11-08,Denver,Denver,111.7,107.0,118.0,...,100.2,106.7,115.0,98.8,102.0,98.8,102.7,94.0,91.0,105.0
3,Atlanta,113,Golden State,127,2021-11-08,Golden State,Golden State,107.8,107.7,117.0,...,110.0,118.0,121.0,104.5,113.7,100.2,94.7,107.0,97.2,106.3
4,Charlotte,123,LA Lakers,126,2021-11-08,LA Lakers,LA Lakers,113.5,102.7,106.0,...,117.6,124.7,120.0,122.0,115.1,111.3,109.7,105.0,109.1,116.3
5,Minnesota,118,Memphis,125,2021-11-08,Memphis,Memphis,102.2,98.7,84.0,...,106.0,115.0,104.0,105.7,108.0,113.3,106.0,115.0,113.2,113.4
6,New York,103,Philadelphia,96,2021-11-08,Philadelphia,New York,112.6,106.7,109.0,...,110.7,111.7,126.0,116.4,105.0,102.6,100.3,105.0,102.2,103.0
7,Phoenix,109,Sacramento,104,2021-11-08,Sacramento,Phoenix,110.2,118.7,121.0,...,109.9,109.3,117.0,106.7,119.5,109.3,101.0,94.0,106.4,112.2
8,Portland,109,LA Clippers,117,2021-11-09,LA Clippers,LA Clippers,110.3,106.3,105.0,...,107.4,101.0,90.0,102.2,115.2,102.6,101.7,106.0,99.6,106.2
9,Milwaukee,118,Philadelphia,109,2021-11-09,Philadelphia,Milwaukee,106.7,103.0,94.0,...,108.6,101.0,101.0,107.8,109.4,102.6,102.0,103.0,102.3,103.0


In [103]:
def messy_wrangling():
    intermediate=pd.DataFrame()
    intermediate['Home?'] = df['Team_x'] + ', ' + df['Home']
    intermediate['Offensive Adv Season'] = df['Season Ave_x'] - df['Def Season Ave_y']
    intermediate['Offensive Adv 3G'] = df['3 Game Ave_x'] - df['Def 3 Game Ave_y']
    intermediate['Offensive Adv 1G'] = df['Last Game Ave_x'] - df['Def Last Game Ave_y']
    intermediate['Offensive Home Adv'] = df['Home Ave_x'] - df['Def Away Ave_x']
    intermediate['Offensive Away Adv'] = df['Away Ave_x'] - df['Def Home Ave_x']
    
    intermediate['Defensive Adv Season'] = df['Def Season Ave_x'] - df['Season Ave_y']
    intermediate['Defensive Adv 3G'] = df['Def 3 Game Ave_x'] - df['3 Game Ave_y']
    intermediate['Defensive Adv 1G'] = df['Def Last Game Ave_x'] - df['Last Game Ave_y']
    intermediate['Defensive Home Adv'] = df['Def Home Ave_x'] - df['Away Ave_x']
    intermediate['Defensive Away Adv'] = df['Def Away Ave_x'] - df['Home Ave_x']
    intermediate['Win?'] = df['Team_x'] + ', ' + df['Winner']
    
    intermediate['Home?'] = intermediate['Home?'].map(is_same)
    intermediate['Win?'] = intermediate['Win?'].map(is_same)
    return intermediate

In [104]:
messy_wrangling()

Unnamed: 0,ISHOME,Offensive Adv Season,Offensive Adv 3G,Offensive Adv 1G,Offensive Home Adv,Offensive Away Adv,Defensive Adv Season,Defensive Adv 3G,Defensive Adv 1G,Defensive Home Adv,Defensive Away Adv,WINNER?
0,"Brooklyn, Chicago",3.4,-0.6,2.0,-2.0,6.8,-4.6,-10.0,-2.0,-6.8,2.0,"Brooklyn, Chicago"
1,"New Orleans, Dallas",-4.7,-17.6,-19.0,-6.5,-15.1,10.9,8.0,19.0,15.1,6.5,"New Orleans, Dallas"
2,"Miami, Denver",12.9,4.3,24.0,8.8,14.0,-0.5,7.4,20.0,-14.0,-8.8,"Miami, Denver"
3,"Atlanta, Golden State",7.6,13.0,10.0,-0.9,0.0,-3.9,-2.0,1.0,0.0,0.9,"Atlanta, Golden State"
4,"Charlotte, LA Lakers",2.2,-7.0,1.0,6.7,-13.3,7.5,20.4,30.0,13.3,-6.7,"Charlotte, LA Lakers"
5,"Minnesota, Memphis",-11.1,-7.3,-31.0,-7.3,7.3,-2.2,14.7,17.0,-7.3,7.3,"Minnesota, Memphis"
6,"New York, Philadelphia",10.0,6.4,4.0,8.4,-4.6,-0.4,3.0,12.0,4.6,-8.4,"New York, New York"
7,"Phoenix, Sacramento",0.9,17.7,27.0,-9.2,3.3,-1.1,-5.0,26.0,-3.3,9.2,"Phoenix, Phoenix"
8,"Portland, LA Clippers",7.7,4.6,-1.0,1.0,-0.7,0.4,-15.7,-30.0,0.7,-1.0,"Portland, LA Clippers"
9,"Milwaukee, Philadelphia",4.1,1.0,-9.0,-5.2,1.4,-1.1,-5.3,5.0,-1.4,5.2,"Milwaukee, Milwaukee"


In [None]:
Index(['Team_x', 'Points_x', 'Team_y', 'Points_y', 'Date', 'Home', 'Winner',
       'Season Ave_x', '3 Game Ave_x', 'Last Game Ave_x', 'Home Ave_x',
       'Away Ave_x', 'Season Ave_y', '3 Game Ave_y', 'Last Game Ave_y',
       'Home Ave_y', 'Away Ave_y', 'Def Season Ave_x', 'Def 3 Game Ave_x',
       'Def Last Game Ave_x', 'Def Home Ave_x', 'Def Away Ave_x',
       'Def Season Ave_y', 'Def 3 Game Ave_y', 'Def Last Game Ave_y',
       'Def Home Ave_y', 'Def Away Ave_y'],
      dtype='object')