In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timezone, timedelta
import boto3
import pyarrow
import awswrangler as wr


In [2]:
# NBA season we will be analyzing
year = 2021
# URL page we will scraping (see image above)
url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
# this is the HTML from the given URL
html = urlopen(url)
soup = BeautifulSoup(html)

In [3]:
url
# url is literlly just the url link

'https://www.basketball-reference.com/leagues/NBA_2021_per_game.html'

In [4]:
html
# this is the httpresponse code we get after opening the url

<http.client.HTTPResponse at 0x1d125744c10>

In [6]:
# soup
# soup is the LITERAL HTML.  ITS HUNDREDS OF LINES LONG, THOUSANDS

# we need to grab just the elements we want.

In [5]:
# use findALL() to get the column headers
# soup.findAll('tr', limit=2)
# use getText()to extract the text we need into a list

# the tr group has th elements which are the headers we want data for.
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

# exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
headers = headers[1:]
headers

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [6]:
# avoid the first header row
# the td elements have the actual data points we want.
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [7]:
# combining the column headers and the data points together
stats = pd.DataFrame(player_stats, columns = headers)
stats['PTS'] = pd.to_numeric(stats['PTS'])

In [8]:
stats.sort_values('PTS', ascending = False).head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
156,Stephen Curry,PG,32,GSW,63,63,34.2,10.4,21.7,0.482,...,0.916,0.5,5.0,5.5,5.8,1.2,0.1,3.4,1.9,32.0
46,Bradley Beal,SG,27,WAS,60,60,35.8,11.2,23.0,0.485,...,0.889,1.2,3.5,4.7,4.4,1.2,0.4,3.1,2.3,31.3
410,Damian Lillard,PG,30,POR,67,67,35.8,9.0,19.9,0.451,...,0.928,0.5,3.7,4.2,7.5,0.9,0.3,3.0,1.5,28.8
196,Joel Embiid,C,26,PHI,51,51,31.1,9.0,17.6,0.513,...,0.859,2.2,8.4,10.6,2.8,1.0,1.4,3.1,2.4,28.5
17,Giannis Antetokounmpo,PF,26,MIL,61,61,33.0,10.3,18.0,0.569,...,0.685,1.6,9.4,11.0,5.9,1.2,1.2,3.4,2.8,28.1
177,Luka Dončić,PG,21,DAL,66,66,34.3,9.8,20.5,0.479,...,0.73,0.8,7.2,8.0,8.6,1.0,0.5,4.3,2.3,27.7
395,Zach LaVine,SG,25,CHI,58,58,35.1,9.8,19.4,0.507,...,0.849,0.6,4.4,5.0,4.9,0.8,0.5,3.5,2.4,27.4
714,Zion Williamson,PF,20,NOP,61,61,33.2,10.4,17.0,0.611,...,0.698,2.7,4.5,7.2,3.7,0.9,0.6,2.7,2.2,27.0
189,Kevin Durant,PF,32,BRK,35,32,33.1,9.3,17.2,0.537,...,0.882,0.4,6.7,7.1,5.6,0.7,1.3,3.4,2.0,26.9
328,Kyrie Irving,PG,28,BRK,54,54,34.9,10.2,20.1,0.506,...,0.922,1.0,3.8,4.8,6.0,1.4,0.7,2.4,2.6,26.9


In [9]:
# datetimte stuff
today = datetime.now().date()
yesterday = today - timedelta(1)
day = (datetime.now() - timedelta(1)).day
month = (datetime.now() - timedelta(1)).month
year = (datetime.now() - timedelta(1)).year

In [10]:
#### BOX SCORE WEB SCRAPING
# https://www.basketball-reference.com/friv/dailyleaders.fcgi?month=07&day=17&year=2021&type=all

url = "https://www.basketball-reference.com/friv/dailyleaders.fcgi?month={}&day={}&year={}&type=all".format(month, day, year)
html = urlopen(url)
soup = BeautifulSoup(html)

In [11]:
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
headers = headers[1:]
headers[2] = "Location"
headers[4] = "Outcome"
# headers

IndexError: list index out of range

In [41]:
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [42]:
df2 = pd.DataFrame(player_stats, columns = headers)

In [52]:
df2[['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc']] = df2[['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc']].apply(pd.to_numeric)

In [55]:
df2.sort_values('PTS', ascending = False)

Unnamed: 0,Player,Tm,Location,Opp,Outcome,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,GmSc
2,Devin Booker,PHO,,MIL,L,41:32,17,33,0.515,2,...,3,4,3,2,1,2,5,40,12,25.7
1,Giannis Antetokounmpo,MIL,@,PHO,W,40:34,14,23,0.609,0,...,4,9,6,0,0,0,2,32,-7,26.8
5,Khris Middleton,MIL,@,PHO,W,44:23,12,23,0.522,3,...,7,7,5,0,0,4,3,29,4,17.7
0,Jrue Holiday,MIL,@,PHO,W,41:57,12,20,0.6,3,...,3,4,13,3,1,2,4,27,14,28.6
3,Chris Paul,PHO,,MIL,L,35:21,9,15,0.6,3,...,2,2,11,0,1,1,5,21,-6,20.1
4,Deandre Ayton,PHO,,MIL,L,44:49,7,12,0.583,0,...,7,10,1,1,2,2,4,20,6,18.1
6,Pat Connaughton,MIL,@,PHO,W,33:23,4,6,0.667,4,...,5,6,0,0,0,0,0,14,10,13.6
7,Mikal Bridges,PHO,,MIL,L,32:44,5,6,0.833,3,...,4,4,1,2,1,2,1,13,5,13.0
8,Jae Crowder,PHO,,MIL,L,40:09,4,7,0.571,2,...,4,5,3,2,0,0,3,10,-7,11.5
9,Bobby Portis,MIL,@,PHO,W,19:20,3,6,0.5,2,...,0,3,0,2,0,0,0,9,7,10.1


In [15]:
#### injury report
url = "https://www.basketball-reference.com/friv/injuries.fcgi"
html = urlopen(url)
soup = BeautifulSoup(html)

In [16]:
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

trs = soup.findAll('tr')[1:]
rows = []
for tr in trs:
    player_name = tr.find('a').text
    data = [player_name] + [x.text for x in tr.find_all('td')]
    rows.append(data)

injury_data = pd.DataFrame(rows, columns = headers)

In [5]:
injury_data

Unnamed: 0,Player,Team,Update,Description
0,Onyeka Okongwu,Atlanta Hawks,"Wed, Jul 21, 2021",Out (Shoulder) - The Hawks announced that Okon...
1,Jaylen Brown,Boston Celtics,"Thu, May 13, 2021",Out (Wrist) - The Celtics announced that Brown...
2,Coby White,Chicago Bulls,"Thu, Jun 10, 2021",Out (Shoulder) - The Bulls announced that Whit...
3,Jamal Murray,Denver Nuggets,"Thu, Jul 22, 2021",Out (Knee) - Murray is recovering from a torn ...
4,Klay Thompson,Golden State Warriors,"Thu, Jul 22, 2021",Out (Right Achilles) - Thompson is on track to...
5,James Wiseman,Golden State Warriors,"Thu, Jul 22, 2021",Out (Knee) - Wiseman is on track to be ready b...
6,T.J. Warren,Indiana Pacers,"Thu, Mar 25, 2021",Out (Foot) - Warren underwent foot surgery and...
7,Donte DiVincenzo,Milwaukee Bucks,"Fri, May 28, 2021",Out (Foot) - DiVincenzo suffered a tendon inju...
8,Jarrett Culver,Minnesota Timberwolves,"Thu, Apr 29, 2021",Out (Ankle) - The Timberwolves announced Culve...
9,Taurean Prince,Minnesota Timberwolves,"Thu, Apr 22, 2021",Out (Ankle) - The Cavaliers announced F Taurea...


In [8]:
# ALTERNATIVE
injury_data2 = pd.read_html(url)[0]

In [9]:
injury_data2

Unnamed: 0,Player,Team,Update,Description
0,Onyeka Okongwu,Atlanta Hawks,"Wed, Jul 21, 2021",Out (Shoulder) - The Hawks announced that Okon...
1,Jaylen Brown,Boston Celtics,"Thu, May 13, 2021",Out (Wrist) - The Celtics announced that Brown...
2,Coby White,Chicago Bulls,"Thu, Jun 10, 2021",Out (Shoulder) - The Bulls announced that Whit...
3,Jamal Murray,Denver Nuggets,"Thu, Jul 22, 2021",Out (Knee) - Murray is recovering from a torn ...
4,Klay Thompson,Golden State Warriors,"Thu, Jul 22, 2021",Out (Right Achilles) - Thompson is on track to...
5,James Wiseman,Golden State Warriors,"Thu, Jul 22, 2021",Out (Knee) - Wiseman is on track to be ready b...
6,T.J. Warren,Indiana Pacers,"Thu, Mar 25, 2021",Out (Foot) - Warren underwent foot surgery and...
7,Donte DiVincenzo,Milwaukee Bucks,"Fri, May 28, 2021",Out (Foot) - DiVincenzo suffered a tendon inju...
8,Jarrett Culver,Minnesota Timberwolves,"Thu, Apr 29, 2021",Out (Ankle) - The Timberwolves announced Culve...
9,Taurean Prince,Minnesota Timberwolves,"Thu, Apr 22, 2021",Out (Ankle) - The Cavaliers announced F Taurea...


In [18]:
##### transactions
url = "https://www.basketball-reference.com/leagues/NBA_2021_transactions.html"
html = urlopen(url)
soup = BeautifulSoup(html)

In [None]:
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

trs = soup.findAll('tr')[1:]
rows = []
for tr in trs:
    player_name = tr.find('a').text
    data = [player_name] + [x.text for x in tr.find_all('td')]
    rows.append(data)

injury_data = pd.DataFrame(rows, columns = headers)

In [72]:
#for tr in trs:
    #data = tr.findAll('p')
    # for p in data:
        # print(p.text)
    # print(tr.find('span').text)
    # print(x.text for x in tr.find('p'))

In [74]:
trs = soup.findAll('li')[71:]
rows = []
for tr in trs:
    date = tr.find('span').text
    data = tr.find('p')
    data2 = [date] + [data]
    rows.append(data2)

AttributeError: 'NoneType' object has no attribute 'text'

In [154]:
trs = soup.findAll('li')[71:]
rows = []
mylist = []
for tr in trs:
    date = tr.find('span').text
    data = tr.findAll('p')
    for p in data:
        if p is not None:
            mylist.append(p.text)
    data3 = [date] + [mylist]
    rows.append(data3)
    mylist = []

AttributeError: 'NoneType' object has no attribute 'text'

In [160]:
trs = soup.findAll('li')[71:]
rows = []
mylist = []
for tr in trs:
    date = tr.find('span')
    if date is not None: # needed bc span can be null (multi <p> elements per span)
        date = date.text
    data = tr.findAll('p')
    for p in data:
        mylist.append(p.text)
    data3 = [date] + [mylist]
    rows.append(data3)
    mylist = []

In [159]:
transactions = pd.DataFrame(rows)
transactions.columns = ['Date', 'Transaction']
transactions = transactions.explode('Transaction')
transactions['Date'] = pd.to_datetime(transactions['Date'])
transactions = transactions.query('Date != "NaN"')
transactions

Unnamed: 0,Date,Transaction
0,2021-06-28,The Dallas Mavericks hired Nico Harrison as GM.
0,2021-06-28,The Dallas Mavericks hired Jason Kidd as Head ...
1,2021-06-18,"The Boston Celtics traded Kemba Walker, a 2021..."
2,2021-06-17,Rick Carlisle resigns as Head Coach for Dallas...
3,2021-06-16,The Dallas Mavericks fired Donn Nelson as GM.
...,...,...
135,2020-11-18,The Orlando Magic traded a 2020 2nd round draf...
136,2020-11-17,The Utah Jazz traded Ante Tomic and a 2020 1st...
137,2020-11-16,The Orlando Magic signed Chuma Okeke to a mult...
137,2020-11-16,The Oklahoma City Thunder traded Abdel Nader a...


In [30]:
# random web scrape i found - might be useful
import csv 
import requests
from bs4 import BeautifulSoup
import csv
import re
url_list = ['https://basketball.realgm.com/player/player/Summary/2',
            'https://basketball.realgm.com/player/player/Summary/1']

for url in url_list:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    player = soup.find_all('div', class_='wrapper clearfix container')[0]

    playerprofile = re.sub(
        r'\n\s*\n', r'\n', player.get_text().strip(), flags=re.M)

    output = playerprofile + "\n"


In [66]:
### SCHEDULE
raw_df = pd.DataFrame()
month_list = ['december', 'january', 'february', 'march', 'april', 'may', 'june', 'july']
url = "https://www.basketball-reference.com/leagues/NBA_2021_games-december.html"
html = urlopen(url)
soup = BeautifulSoup(html)

In [59]:
headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]
# headers = headers[1:]
headers[6] = 'boxScoreLink'
headers[7] = 'isOT'
headers = headers[1:]

In [60]:
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [61]:
rows = soup.findAll('tr')[1:]
date_info = [[th.getText() for th in rows[i].findAll('th')]
            for i in range(len(rows))]

game_info = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]
date_info = [i[0] for i in date_info] # removes brackets from each element.
# date_info

In [62]:
schedule = pd.DataFrame(game_info, columns = headers)
schedule['Date'] = date_info

In [92]:
# variables in functions are local by default
# variables outside of functions are global by default
# to modify a global variable in local function, we have to explicity label it as a global var.
schedule_df = pd.DataFrame()
def schedule_scraper(month):
    global schedule_df
    url = "https://www.basketball-reference.com/leagues/NBA_2021_games-{}.html".format(month)
    html = urlopen(url)
    soup = BeautifulSoup(html)

    headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]

    headers[6] = 'boxScoreLink'
    headers[7] = 'isOT'
    headers = headers[1:]

    rows = soup.findAll('tr')[1:]
    date_info = [[th.getText() for th in rows[i].findAll('th')]
            for i in range(len(rows))]

    game_info = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]
    date_info = [i[0] for i in date_info]

    schedule = pd.DataFrame(game_info, columns = headers)
    schedule['Date'] = date_info
    
    # join_df = join_df.append(schedule)
    schedule_df = schedule_df.append(schedule)
    # return(join_df)

In [93]:
schedule_df = pd.DataFrame()
schedule_scraper('february')

In [86]:
schedule_df = pd.DataFrame()
for month in month_list:
    schedule_scraper(month)

In [91]:
raw_df.tail(5)

Unnamed: 0,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,boxScoreLink,isOT,Attend.,Notes,Date
3,9:00p,Milwaukee Bucks,108,Phoenix Suns,118,Box Score,,16583,,"Thu, Jul 8, 2021"
4,8:00p,Phoenix Suns,100,Milwaukee Bucks,120,Box Score,,16637,,"Sun, Jul 11, 2021"
5,9:00p,Phoenix Suns,103,Milwaukee Bucks,109,Box Score,,16911,,"Wed, Jul 14, 2021"
6,9:00p,Milwaukee Bucks,123,Phoenix Suns,119,Box Score,,16562,,"Sat, Jul 17, 2021"
7,9:00p,Phoenix Suns,98,Milwaukee Bucks,105,Box Score,,17397,,"Tue, Jul 20, 2021"


In [102]:
print(f"hi it is {month} haa")
aab = f'hi it is {month} haa'
print(aab)
# f('hii {var1} haa' functionality works for both print statements and variable dec

hi it is july haa
hi it is july haa


In [104]:
print('hi it is {} month') #.format(month) will give an error
aac = 'hi it is {} haa'.format(month)
print(aab)
# {} .format only works for variable dec

hi it is {} month
hi it is july haa


In [5]:
#### Team advanced stats
url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
html = urlopen(url)
soup = BeautifulSoup(html)

In [49]:
# NOT NEEDED
divTag = soup.find("div", {"id": "div_advanced-team"})
th_all = divTag.find_all('th')
result = []
for th in th_all:
    result.extend(th.find_all(text='A'))

bby = divTag.select("th td")

In [6]:
df_list = pd.read_html(url)


In [7]:
advanced_stats = pd.DataFrame(df_list[10])
advanced_stats.drop(columns=advanced_stats.columns[0], 
        axis=1, 
        inplace=True)

advanced_stats.columns = ['Team', 'Age', 'W', 'L', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'ORTG', 'DRTG', 'NRTG', 'Pace', 'FTr', '3PAr', 'TS%', 'bby1', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'bby2', 'eFG%_opp', 'TOV%_opp', 'DRB%_opp', 'FT/FGA_opp', 'bby3', 'Arena', 'Attendance', 'Att/Game']
advanced_stats.drop(['bby1', 'bby2', 'bby3'], axis = 1, inplace = True)
# advanced_stats.head(5)

In [125]:
# SQL STUFF
import sqlalchemy
from mysql.connector import Error
import os

def create_connection():
    connection = None
    try:
        connection = sqlalchemy.create_engine('mysql+mysqlconnector://' + os.environ.get('USER') + ':' + os.environ.get('PW') + '@' + os.environ.get('IP') + ':' + os.environ.get('PORT') + '/' + os.environ.get('DB'),
                     echo = False)
        print(f"Connection to Jacob's RDS MySQL {os.environ.get('DB')} DB successful")
    except Error as e:
        print(f"The error '{e}' occurred")

    return connection

# connection = create_connection("localhost", "root", "")

In [126]:
connection = create_connection()

Connection to Jacob's RDS MySQL aws_database DB successful


In [111]:
# unnecessary for sqlalchemy, only used for mysql.connector + cursor connections
def execute_query(connection, query):
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        connection.commit()
        print("Query executed successfully")
    except Error as e:
        print(f"The error '{e}' occurred")

In [127]:
all_tables = pd.read_sql_query('SHOW TABLES FROM aws_database;', connection)
all_tables

Unnamed: 0,Tables_in_aws_database
0,aws_adv_player_stats
1,aws_cleaned_pbp
2,aws_gamelogs
3,aws_injuries
4,aws_my_cron_date
5,aws_new_odds
6,aws_odds_df
7,aws_opponent_shooting
8,aws_raw_pbp
9,aws_raw_pbp_yesterday


In [128]:
odds = pd.read_sql_query('SELECT * FROM aws_new_odds;', connection)
odds

Unnamed: 0,row_names,team,team_acc,opp,moneyline,spread,overunder,team_pts,game_id,date
0,1,Atlanta Hawks,ATL,PHI,225,6.5,over,108.0,1,2021-06-08
1,2,Philadelphia 76ers,PHI,ATL,-278,-6.5,under,113.0,1,2021-06-08
2,3,Los Angeles Clippers,LAC,UTA,128,3.0,over,112.0,2,2021-06-08
3,4,Utah Jazz,UTA,LAC,-155,-3.0,under,109.0,2,2021-06-08
4,1,Denver Nuggets,DEN,PHX,170,5.0,over,113.0,1,2021-06-09
...,...,...,...,...,...,...,...,...,...,...
95,2,Milwaukee Bucks,MIL,PHX,-177,-4.0,under,109.0,1,2021-07-09
96,1,Phoenix Suns,PHX,MIL,145,4.0,over,112.0,1,2021-07-12
97,2,Milwaukee Bucks,MIL,PHX,-177,-4.0,under,109.0,1,2021-07-12
98,1,Phoenix Suns,,Milwaukee Bucks,175,5.0,over,110.0,1,2021-07-18


In [129]:
odds.to_sql(con = connection, name = "my_python_table_2", index = False, if_exists = "replace")

In [130]:
odds2 = pd.read_sql_query('SELECT * FROM my_python_table_2;', con = connection)

In [143]:
time_now = pd.Series(datetime.now())
time_df = pd.DataFrame()
time_df = time_df.append(time_now, ignore_index = True)
time_df.to_sql(con = connection, name = "my_python_time_table", index = False, if_exists = "append")

In [141]:
time_df.to_sql(con = connection, name = "my_python_time_table", index = False, if_exists = "replace")

In [144]:
odds_sql = pd.read_sql_query('SELECT * FROM my_python_time_table;', connection)
odds_sql

In [None]:
# append rows of your local dataframe to an EXISTING sql table
time_df.to_sql(con = connection, name = "my_python_time_table",
               index = False, if_exists = "append")

# overwrite your EXISTING sql table with the new dataframe.
time_df.to_sql(con = connection, name = "my_python_time_table",
               index = False, if_exists = "replace")

In [3]:
# S3 STUFF
s3 = boto3.resource(
    service_name ='s3',
    region_name = os.getenv('AWS_REGION'),
    aws_access_key_id = os.getenv('AWS_KEY'),
    aws_secret_access_key = os.getenv('AWS_SECRET')
)

In [4]:
for bucket in s3.buckets.all():
    print(bucket.name)

mygamelogsbucket
thebucketofjacob2020


In [None]:
advanced_stats.to_parquet('advanced_stats.parquet')

# uploading a file to the bucket mygamelogsbucket
s3.Bucket('mygamelogsbucket').upload_file(Filename = 'foo.parquet', Key = 'you_dont_need_extension')

In [9]:
# works but spits out error.
wr.s3.to_parquet(
    df = advanced_stats,
    path = "s3://mygamelogsbucket/my-advanced-stats4.parquet"
    #path2 = "s3://mygamelogsbucket/key/my-advanced-stats3" will make a folder called key
    # and put the my-advanced-stats3 file in the key folder.
)

{'paths': ['s3://mygamelogsbucket/my-advanced-stats4.parquet'],
 'partitions_values': {}}

In [10]:
gamelogs_s3 = s3.Bucket('mygamelogsbucket').Object('bby.csv').get()
gamelogs_s3 = pd.read_csv(gamelogs_s3['Body'], index_col = 0)


In [99]:
gamelogs_s3.tail(5)

Unnamed: 0_level_0,Season,Date,GameID,TeamGamesPlayed,Team,isB2B,isB2BFirst,isB2BSecond,Location,DaysRest,...,DREB,TRB,AST,STL,BLK,TOV,PF,PTS,PlusMinus,Type
row_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22,2021,2021-05-20,52000201,2,WAS,,,,H,1,...,3,3,0,2,0,1,2,14,17,Play-In Tournament
23,2021,2021-05-20,52000201,2,WAS,,,,H,1,...,7,7,2,1,1,1,1,4,11,Play-In Tournament
24,2021,2021-05-20,52000201,2,IND,,,,A,1,...,2,3,3,1,1,1,2,6,-14,Play-In Tournament
25,2021,2021-05-20,52000201,2,WAS,,,,H,1,...,5,5,4,0,1,4,0,25,18,Play-In Tournament
26,2021,2021-05-20,52000201,2,WAS,,,,H,1,...,1,3,0,0,1,0,1,9,7,Play-In Tournament


In [11]:
# works but spits out error.
wr.s3.to_parquet(
    df = gamelogs_s3,
    path = "s3://mygamelogsbucket/bby.parquet"
    #path2 = "s3://mygamelogsbucket/key/my-advanced-stats3" will make a folder called key
    # and put the my-advanced-stats3 file in the key folder.
)

{'paths': ['s3://mygamelogsbucket/bby.parquet'], 'partitions_values': {}}

In [19]:
s3_df_parquet = wr.s3.read_parquet("s3://mygamelogsbucket/bby.parquet")
# s3_df_parquet2 = pd.read_parquet("s3://mygamelogsbucket/bby.parquet")
# need different permissions to access this.

ImportError: Install s3fs to access S3