In [55]:
import urllib.request as urllib
from datetime import datetime, timedelta
from random import randint
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup

# --- Scrape configuration -------------------------------------------------
# DATE VARIABLES MUST BE YYYY-MM-DD
STARTDATE = '2021-11-08'
ENDDATE = '2021-11-10'  # up to, not including
# NOTE(review): the file name mentions 2020 while the date range above is in
# 2021 -- confirm which one is intended before writing the CSV.
CSV_TITLE = 'test_data_2020.csv'

# Put points into buckets of:
BUCKET_BY = 5

# --- Calendar tables ------------------------------------------------------
# Number of days in each month of a common (non-leap) year, keyed by the
# zero-padded month string.
DAYS_PER_MONTH = {
    '01': 31, '02': 28, '03': 31, '04': 30, '05': 31, '06': 30,
    '07': 31, '08': 31, '09': 30, '10': 31, '11': 30, '12': 31,
}
# Same table for a leap year: only February differs.
DAYS_PER_MONTH_leap = {**DAYS_PER_MONTH, '02': 29}
# Leap years as strings (the format produced by splitting a 'YYYY-MM-DD'
# date). Computed with the Gregorian rule rather than listed by hand, so
# that years other than 2020 are recognised as well. The 1900-2100 window
# is an arbitrary bound chosen to cover any plausible scrape range.
LEAP_YEAR = [
    str(year)
    for year in range(1900, 2101)
    if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
]

In [56]:
# Team display names; the position in this list (1-based) is the team's id.
teams = [
    'Atlanta', 'Boston', 'Brooklyn', 'Charlotte', 'Chicago', 'Cleveland',
    'Dallas', 'Denver', 'Detroit', 'Golden State', 'Houston', 'Indiana',
    'LA Clippers', 'LA Lakers', 'Memphis', 'Miami', 'Milwaukee', 'Minnesota',
    'New Orleans', 'New York', 'Oklahoma City', 'Orlando', 'Philadelphia',
    'Phoenix', 'Portland', 'Sacramento', 'San Antonio', 'Toronto', 'Utah',
    'Washington',
]
# Numeric ids 1..30, paired with `teams` in order.
ids = list(range(1, 31))
# Lookup table: team name -> numeric id.
TEAM_IDS = {team_name: team_id for team_name, team_id in zip(teams, ids)}

In [57]:
def get_date_array(start=None, end=None):
    """Return every calendar date from `start` up to, not including, `end`.

    Parameters
    ----------
    start : str, optional
        First date of the range, formatted 'YYYY-MM-DD'. Defaults to the
        notebook-level STARTDATE.
    end : str, optional
        Exclusive end of the range, formatted 'YYYY-MM-DD'. Defaults to the
        notebook-level ENDDATE.

    Returns
    -------
    list of str
        Zero-padded 'YYYY-MM-DD' strings in chronological order. Empty when
        `start` equals `end`.

    Raises
    ------
    ValueError
        If either date is not a valid 'YYYY-MM-DD' date, or if `end` falls
        before `start`.
    """
    start = STARTDATE if start is None else start
    end = ENDDATE if end is None else end

    # Parsing into real date objects gives correct month lengths and leap
    # years for any year, and rejects impossible dates up front.
    first_day = datetime.strptime(start, '%Y-%m-%d').date()
    stop_day = datetime.strptime(end, '%Y-%m-%d').date()

    # Walking forward from `start` could never reach an earlier `end`, so
    # fail loudly instead of looping forever.
    if stop_day < first_day:
        raise ValueError('end date %s is before start date %s' % (end, start))

    span = (stop_day - first_day).days
    return [(first_day + timedelta(days=offset)).isoformat() for offset in range(span)]

In [58]:
def reformat_dates(date_array):
    """Turn 'YYYY-MM-DD' strings into URL query fragments.

    Each input date is split on '-' and its first three pieces are
    rearranged into 'month=MM&day=DD&year=YYYY'. The result is a new list
    with one fragment per input date, in the same order.
    """
    query_template = 'month=%s&day=%s&year=%s'
    split_dates = (iso_date.split('-') for iso_date in date_array)
    return [query_template % (pieces[1], pieces[2], pieces[0]) for pieces in split_dates]

In [59]:
def get_finals():
    """Scrape final scores for every date returned by get_date_array().

    For each date the basketball-reference box-score page is fetched and
    every <table class="teams"> element is turned into one row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Team', 'Points_x', 'Opp', 'Points_y', 'Date'; one row per
        score table found. Values are the raw strings taken from the page.

    Notes
    -----
    Performs network requests and sleeps 1-3 seconds after each date.
    """
    columns = ['Team', 'Points_x', 'Opp', 'Points_y', 'Date']
    url_finals = 'https://www.basketball-reference.com/boxscores/?'
    dates = get_date_array()
    dates_for_url = reformat_dates(dates)

    ret = []
    for game_date, query in zip(dates, dates_for_url):
        print('Scraping for: %s' % game_date)
        # The context manager closes the response even if reading or
        # parsing raises.
        # NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks
        # whichever one is installed; consider pinning one once the
        # positional slices below are confirmed to hold under it.
        with urllib.urlopen(url_finals + query) as html:
            soup = BeautifulSoup(html.read())
        for score in soup.find_all('table', class_='teams'):
            cells = score.get_text().split('\n')
            # Positions 3-4 and 10-11 of the newline-split text are taken as
            # the (name, points) pair of each side -- presumably tied to the
            # page's current markup; verify if the site layout changes.
            ret.append(cells[3:5] + cells[10:12] + [game_date])
        # Random pause between requests.
        sleep(randint(1,3))
    return pd.DataFrame(ret, columns=columns)

In [60]:
# Scrape the final scores for the configured date range (network access,
# with a 1-3 second pause per date) and display the resulting table.
finals = get_finals()
finals

Scraping for: 2021-11-08
Scraping for: 2021-11-09


Unnamed: 0,Team,Points_x,Opp,Points_y,Date
0,Brooklyn,95,Chicago,118,2021-11-08
1,New Orleans,92,Dallas,108,2021-11-08
2,Miami,96,Denver,113,2021-11-08
3,Atlanta,113,Golden State,127,2021-11-08
4,Charlotte,123,LA Lakers,126,2021-11-08
5,Minnesota,118,Memphis,125,2021-11-08
6,New York,103,Philadelphia,96,2021-11-08
7,Phoenix,109,Sacramento,104,2021-11-08
8,Portland,109,LA Clippers,117,2021-11-09
9,Milwaukee,118,Philadelphia,109,2021-11-09


In [43]:
def fix_okc(name):
    """Expand the abbreviation 'Okla City' to 'Oklahoma City'.

    Any other value is returned unchanged.
    """
    return 'Oklahoma City' if name == 'Okla City' else name

In [44]:
def scrape_data_averages_for():
    """Scrape points-per-game averages for every date that has a game.

    Iterates over get_date_array() and, for each date that also appears in
    the notebook-level `finals.Date` column, fetches the teamrankings
    points-per-game page for that date and collects one row per <tr>
    after the first.

    Returns
    -------
    pandas.DataFrame
        Columns 'Team', 'Season', '3 Game', 'Last Game', 'Home', 'Away',
        'Last Season', 'Date'. Team names are passed through fix_okc().

    Notes
    -----
    Reads the global `finals`; performs network requests and sleeps 1-3
    seconds after each scraped date.
    """
    url = 'https://www.teamrankings.com/nba/stat/points-per-game?date='
    columns = ['Team', 'Season', '3 Game', 'Last Game', 'Home', 'Away', 'Last Season', 'Date']
    float_vals = ['Season', '3 Game', 'Last Game', 'Home', 'Away', 'Last Season']

    # Built once: the set of game dates does not change while scraping.
    game_dates = set(finals.Date.tolist())

    temp = []
    for date_str in get_date_array():
        if date_str not in game_dates:
            continue
        print('Scraping for: %s' % date_str)
        # The context manager closes the response even if reading or
        # parsing raises.
        with urllib.urlopen(url + date_str) as html:
            soup = BeautifulSoup(html.read())
        # The first <tr> is skipped -- presumably the header row.
        for row in soup.find_all('tr')[1:]:
            # '--' marks a missing value on the page; use a numeric
            # placeholder so the float conversion below succeeds.
            cells = ['-99' if cell == '--' else cell for cell in row.get_text().split('\n')]
            # Drop the first two and the last newline-split pieces, keeping
            # the team name and its six statistics.
            temp.append(cells[2:-1] + [date_str])
        sleep(randint(1,3))

    ret = pd.DataFrame(temp, columns = columns)
    ret[float_vals] = ret[float_vals].astype(float)
    ret['Team'] = ret['Team'].map(fix_okc)
    # NOTE(review): the placeholder becomes the *string* 'NaN', not a real
    # missing value, so a later dropna() will not remove these rows. Kept
    # as-is because downstream merging relies on that.
    return ret.replace({-99.0: 'NaN'})

In [45]:
def scrape_data_averages_against():
    """Scrape opponent points-per-game averages for every date with a game.

    Iterates over get_date_array() and, for each date that also appears in
    the notebook-level `finals.Date` column, fetches the teamrankings
    opponent-points-per-game page for that date and collects one row per
    <tr> after the first.

    Returns
    -------
    pandas.DataFrame
        Columns 'Team', 'Opp Season', 'Opp 3 Game', 'Opp Last Game',
        'Opp Home', 'Opp Away', 'Opp Last Season', 'Date'. Team names are
        passed through fix_okc().

    Notes
    -----
    Reads the global `finals`; performs network requests and sleeps 1-3
    seconds after each scraped date.
    """
    url = 'https://www.teamrankings.com/nba/stat/opponent-points-per-game?date='
    columns = ['Team', 'Opp Season', 'Opp 3 Game', 'Opp Last Game', 'Opp Home', 'Opp Away', 'Opp Last Season', 'Date']
    float_vals = ['Opp Season', 'Opp 3 Game', 'Opp Last Game', 'Opp Home', 'Opp Away', 'Opp Last Season']

    # Built once: the set of game dates does not change while scraping.
    game_dates = set(finals.Date.tolist())

    temp = []
    for date_str in get_date_array():
        if date_str not in game_dates:
            continue
        print('Scraping for: %s' % date_str)
        # The context manager closes the response even if reading or
        # parsing raises.
        with urllib.urlopen(url + date_str) as html:
            soup = BeautifulSoup(html.read())
        # The first <tr> is skipped -- presumably the header row.
        for row in soup.find_all('tr')[1:]:
            # '--' marks a missing value on the page; use a numeric
            # placeholder so the float conversion below succeeds.
            cells = ['-99' if cell == '--' else cell for cell in row.get_text().split('\n')]
            # Drop the first two and the last newline-split pieces, keeping
            # the team name and its six statistics.
            temp.append(cells[2:-1] + [date_str])
        sleep(randint(1,3))

    ret = pd.DataFrame(temp, columns = columns)
    ret[float_vals] = ret[float_vals].astype(float)
    ret['Team'] = ret['Team'].map(fix_okc)
    # NOTE(review): the placeholder becomes the *string* 'NaN', not a real
    # missing value, so a later dropna() will not remove these rows. Kept
    # as-is because downstream merging relies on that.
    return ret.replace({-99.0: 'NaN'})

In [46]:
# Scrape both average tables. Each scraper reads the global `finals`, so the
# scores cell must have been run first.
# NOTE(review): the output recorded under this cell lists 2020 dates while
# STARTDATE/ENDDATE are in 2021 -- it looks stale; re-run from a fresh kernel.
averages_for = scrape_data_averages_for()
averages_against = scrape_data_averages_against()

Scraping for: 2020-01-01
Scraping for: 2020-01-02
Scraping for: 2020-01-03
Scraping for: 2020-01-04
Scraping for: 2020-01-05
Scraping for: 2020-01-06
Scraping for: 2020-01-07
Scraping for: 2020-01-08
Scraping for: 2020-01-09
Scraping for: 2020-01-10
Scraping for: 2020-01-11
Scraping for: 2020-01-12
Scraping for: 2020-01-13
Scraping for: 2020-01-14
Scraping for: 2020-01-15
Scraping for: 2020-01-16
Scraping for: 2020-01-17
Scraping for: 2020-01-18
Scraping for: 2020-01-19
Scraping for: 2020-01-20
Scraping for: 2020-01-21
Scraping for: 2020-01-22
Scraping for: 2020-01-23
Scraping for: 2020-01-24
Scraping for: 2020-01-25
Scraping for: 2020-01-26
Scraping for: 2020-01-27
Scraping for: 2020-01-28
Scraping for: 2020-01-29
Scraping for: 2020-01-30
Scraping for: 2020-01-31
Scraping for: 2020-02-01
Scraping for: 2020-02-02
Scraping for: 2020-02-03
Scraping for: 2020-02-04
Scraping for: 2020-02-05
Scraping for: 2020-02-06
Scraping for: 2020-02-07
Scraping for: 2020-02-08
Scraping for: 2020-02-09


In [47]:
def add_one(num):
    """Convert `num` with int() and return the result plus one."""
    as_integer = int(num)
    return as_integer + 1

# Combining Scraped Data
Everything up to this point works as intended; the remaining step is to add the opponent (Opp) data to the test data.

In [48]:
def get_test_data(games=None, scoring=None, allowed=None):
    """Build one row per (game, team) with the team's and opponent's averages.

    Every game contributes up to two rows, one from each side's point of
    view. Both rows share the same Game_ID.

    Parameters
    ----------
    games : pandas.DataFrame, optional
        Final scores with columns 'Team', 'Points_x', 'Opp', 'Points_y',
        'Date'. Defaults to the notebook-level `finals`.
    scoring : pandas.DataFrame, optional
        Per-team scoring averages keyed by 'Team' and 'Date'. Defaults to
        the notebook-level `averages_for`.
    allowed : pandas.DataFrame, optional
        Per-team opponent averages keyed by 'Team' and 'Date'. Defaults to
        the notebook-level `averages_against`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Game_ID', 'Team', 'Opp', 'Date', 'Season', '3 Game',
        'Last Game', 'Opp Season', 'Opp 3 Game', 'Opp Last Game', 'Points',
        sorted by Game_ID with a fresh 0..n-1 index.
    """
    games = finals if games is None else games
    scoring = averages_for if scoring is None else scoring
    allowed = averages_against if allowed is None else allowed

    # Give each game a 1-based id from its position in the scores table
    # *before* any rows are dropped, so both perspectives of a game keep the
    # same id even when only one of them survives the dropna() below.
    games = games.reset_index(drop=True)
    games = games.assign(Game_ID=range(1, len(games) + 1))

    # Second perspective: the same games with the two sides swapped.
    swapped = games.rename(columns={'Team': 'Opp', 'Opp': 'Team',
                                    'Points_x': 'Points_y', 'Points_y': 'Points_x'})
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    both_sides = pd.concat([games, swapped[games.columns]], ignore_index=True)

    # Attach the team's own averages; rows with any missing value (e.g. no
    # averages found for that team and date) are discarded.
    with_scoring = pd.merge(both_sides, scoring, on=['Team', 'Date'], how='left').dropna()
    # Attach the opponent's averages-against. 'Team' exists on both sides of
    # this merge, so the left one comes out as 'Team_x'.
    test_data = pd.merge(with_scoring, allowed, left_on=['Opp', 'Date'],
                         right_on=['Team', 'Date'], how='left')

    final = test_data[['Game_ID', 'Team_x', 'Opp', 'Date', 'Season', '3 Game', 'Last Game',
                       'Opp Season', 'Opp 3 Game', 'Opp Last Game', 'Points_x']]
    final = final.rename({'Team_x': 'Team', 'Points_x': 'Points'}, axis=1)
    # A stable sort keeps the original-perspective row ahead of the swapped
    # one within each game, making the output order deterministic.
    final = final.sort_values('Game_ID', kind='mergesort')
    final = final.reset_index(drop=True)
    return final


In [49]:
# Merge the scraped scores and averages into one row per (game, team) and
# display the result.
test_df = get_test_data()
test_df

Unnamed: 0,Game_ID,Team,Opp,Date,Season,3 Game,Last Game,Opp Season,Opp 3 Game,Opp Last Game,Points
0,1,Phoenix,LA Lakers,2020-01-01,114.3,110.0,122.0,105.3,108.7,95.0,107
1,1,LA Lakers,Phoenix,2020-01-01,112.4,114.0,108.0,115.1,110.3,116.0,117
2,2,Milwaukee,Minnesota,2020-01-01,119.8,115.3,123.0,115.8,104.3,115.0,106
3,2,Minnesota,Milwaukee,2020-01-01,112.2,105.0,122.0,106.6,96.0,102.0,104
4,3,Portland,New York,2020-01-01,112.6,117.0,116.0,111.4,101.0,100.0,93
...,...,...,...,...,...,...,...,...,...,...,...
1409,705,New York,Toronto,2020-12-31,105.2,104.7,95.0,110.7,110.7,100.0,83
1410,706,Utah,Phoenix,2020-12-31,113.7,113.7,110.0,98.5,97.3,86.0,95
1411,706,Phoenix,Utah,2020-12-31,109.0,110.0,111.0,108.3,108.3,109.0,106
1412,707,Chicago,Washington,2020-12-31,113.2,116.3,115.0,119.5,121.7,115.0,133


In [50]:
def get_id(name):
    """Look up the numeric id stored for `name` in TEAM_IDS.

    Raises KeyError when `name` is not a key of TEAM_IDS.
    """
    team_id = TEAM_IDS[name]
    return team_id

In [51]:
def bucket_points(points):
    """Snap `points` to a multiple of BUCKET_BY.

    The bucket index is chosen with the built-in round(), which rounds
    exact halves to the nearest even integer.
    """
    bucket_index = round(points / BUCKET_BY)
    return BUCKET_BY * bucket_index

In [52]:
#### NOT FINISHED
def clean_up(df, team_ids=None):
    """Return a typed copy of `df` with numeric team-id columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Team', 'Opp', 'Points' and the six average columns
        'Season', '3 Game', 'Last Game', 'Opp Season', 'Opp 3 Game',
        'Opp Last Game'. It is not modified.
    team_ids : mapping, optional
        Team name -> numeric id. Defaults to the notebook-level TEAM_IDS.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with 'Team_ID' and 'Opp_ID' appended, the average
        columns cast to float and 'Points' cast to int.

    Raises
    ------
    KeyError
        If a 'Team' or 'Opp' value is missing from the id mapping.
    """
    lookup = TEAM_IDS if team_ids is None else team_ids
    float_vals = ['Season', '3 Game', 'Last Game', 'Opp Season', 'Opp 3 Game', 'Opp Last Game']

    # Work on a copy so the caller's frame (already displayed earlier in the
    # notebook) is left untouched and this step can be re-run safely.
    cleaned = df.copy()

    # Indexing the mapping directly (instead of Series.map(dict)) makes an
    # unknown team name fail loudly rather than turn into NaN.
    cleaned['Team_ID'] = cleaned['Team'].map(lambda name: lookup[name])
    cleaned['Opp_ID'] = cleaned['Opp'].map(lambda name: lookup[name])

    cleaned[float_vals] = cleaned[float_vals].astype(float)
    cleaned['Points'] = cleaned['Points'].astype(int)
    # TODO: bucketing of 'Points' via bucket_points is not enabled yet.

    return cleaned
    

In [53]:
# Add Team_ID / Opp_ID and cast the numeric columns, then display the result.
# Note: clean_up assigns columns on the frame it is given and returns that
# same frame, so `df` and `test_df` refer to the same object after this cell.
df = clean_up(test_df)
df

Unnamed: 0,Game_ID,Team,Opp,Date,Season,3 Game,Last Game,Opp Season,Opp 3 Game,Opp Last Game,Points,Team_ID,Opp_ID
0,1,Phoenix,LA Lakers,2020-01-01,114.3,110.0,122.0,105.3,108.7,95.0,107,24,14
1,1,LA Lakers,Phoenix,2020-01-01,112.4,114.0,108.0,115.1,110.3,116.0,117,14,24
2,2,Milwaukee,Minnesota,2020-01-01,119.8,115.3,123.0,115.8,104.3,115.0,106,17,18
3,2,Minnesota,Milwaukee,2020-01-01,112.2,105.0,122.0,106.6,96.0,102.0,104,18,17
4,3,Portland,New York,2020-01-01,112.6,117.0,116.0,111.4,101.0,100.0,93,25,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1409,705,New York,Toronto,2020-12-31,105.2,104.7,95.0,110.7,110.7,100.0,83,20,28
1410,706,Utah,Phoenix,2020-12-31,113.7,113.7,110.0,98.5,97.3,86.0,95,29,24
1411,706,Phoenix,Utah,2020-12-31,109.0,110.0,111.0,108.3,108.3,109.0,106,24,29
1412,707,Chicago,Washington,2020-12-31,113.2,116.3,115.0,119.5,121.7,115.0,133,5,30


In [54]:
# Write the cleaned table to the file named by CSV_TITLE. The DataFrame
# index is written too (to_csv default), as an unnamed first column.
df.to_csv(CSV_TITLE)