In [57]:
# Import Libraries.

import datetime
import io
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
%matplotlib inline

"""Maybe make it into a job that extracts and sends a completion email every year."""
# import smtplib 



'Maybe make it into a job that extracts and sends a completion email every year.'

## Extracting Data on the NBA Draft from 1989 to the Current Year.

In [58]:
# Creating a list of draft years.

current_year = datetime.date.today().year
draft_years = [year for year in range(1989, (current_year)+1)]
columns = ['Round', 'Pick', 'Player', 'Position', 'Nationality', 'Team', 'School/club team']
columns

['Round',
 'Pick',
 'Player',
 'Position',
 'Nationality',
 'Team',
 'School/club team']

Function to create new columns:

In [59]:
def create_new_columns(df):
    
    def create_traded_to_a_different_team(x):
        if re.findall("from", str(x)) or re.findall("traded", str(x)) or re.findall("via", str(x)):
            y = "Yes"
        else:
            y = "No"  
        return y

    def create_status(x):
        if re.findall("Fr.", str(x)):
            y = "College Freshman"
        elif re.findall("So.", str(x)):
            y = "College Sophmore"
        elif re.findall("Jr.", str(x)):
            y = "College Junior"
        elif re.findall("Sr.", str(x)):
            y = "College Senior"
        elif re.findall("G League", str(x)):
            y ="NBA G League Player"
        else:
            y = "Playing Internationally" 
        return y


    df['Traded to a different team'] = df[columns[5]].apply(create_traded_to_a_different_team)
    df['Status'] = df[columns[6]].apply(create_status)
    
    return df

Function to clean the data:

In [60]:
# Function to clean the data.

def clean_data(df):
    
    def clean_player(x):
        x = x.rstrip('*~#+').split('[')[0]
        return x

    def clean_nationality(x):
        x = x.split('[')[0].split('\xa0')[0]
        return x

    def clean_team(x):
        x = x.split('[')[0].split('(')[0].rstrip(' ')
        return x

    def clean_school_club_team(x):
        x = x.split('[')[0]
        if len(re.findall("Fr.", x))!=0 or len(re.findall("So.", x))!=0 or len(re.findall("Jr.", x))!=0 or len(re.findall("Sr.", x))!=0 or len(re.findall("G League", x))!=0:
            x = x.split('(')[0].rstrip(' ')
        return x
    
    
    df.fillna('', inplace=True)

    df[columns[2]] = df[columns[2]].apply(clean_player)
    df[columns[4]] = df[columns[4]].apply(clean_nationality)
    df[columns[5]] = df[columns[5]].apply(clean_team)
    df[columns[6]] = df[columns[6]].apply(clean_school_club_team)
    
    df[columns[0]] = df[columns[0]].replace([''], -1)
    df[columns[1]] = df[columns[1]].replace([''], -1)

    
    df[columns[0]] = df[columns[0]].astype(int, errors='ignore')
    df[columns[1]] = df[columns[1]].astype(int, errors='ignore')
    
    return df

In [61]:
# Creating Dataset Directory

# Directory
directory = "csv"
  
# Parent Directory path
parent_dir = "../"
  
# Path
path = os.path.join(parent_dir, directory)
  
# Create the directory
try:
    os.mkdir(path)
    print("Dataset Directory '% s' created" % directory)
except FileExistsError:
    print("Dataset Directory '% s' already exists." % directory)

Dataset Directory 'csv' already exists.


In [65]:
# Connect to WIKI Page, load data into pandas dataframe and save as .csv file.

dataframe = pd.DataFrame()

for draft_year in draft_years:
    URL = 'https://en.wikipedia.org/wiki/'+str(draft_year)+'_NBA_draft'

    # headers = {}

    wiki_page = requests.get(URL, timeout=5, verify=True)
    df = pd.read_html(wiki_page.text)
    if draft_year != 2022:
        df = df[3]
    else:
        df = df[1]
    df.columns = columns
    create_new_columns(df)
    clean_data(df)
    df.to_csv('../csv/nba_draft_'+'{}'.format(draft_year)+'.csv', index=False)
    df['Draft year'] = draft_year
    dataframe = dataframe.append(df)
    print('NBA Draft '+'{}'.format(draft_year)+' data Successfully loaded and transformed...')

dataframe.to_csv('../csv/allNBADrafts.csv', index=False)
print('Extraction and Transformation Complete!')

NBA Draft 1989 data Successfully loaded and transformed...
NBA Draft 1990 data Successfully loaded and transformed...
NBA Draft 1991 data Successfully loaded and transformed...
NBA Draft 1992 data Successfully loaded and transformed...
NBA Draft 1993 data Successfully loaded and transformed...
NBA Draft 1994 data Successfully loaded and transformed...
NBA Draft 1995 data Successfully loaded and transformed...
NBA Draft 1996 data Successfully loaded and transformed...
NBA Draft 1997 data Successfully loaded and transformed...
NBA Draft 1998 data Successfully loaded and transformed...
NBA Draft 1999 data Successfully loaded and transformed...
NBA Draft 2000 data Successfully loaded and transformed...
NBA Draft 2001 data Successfully loaded and transformed...
NBA Draft 2002 data Successfully loaded and transformed...
NBA Draft 2003 data Successfully loaded and transformed...
NBA Draft 2004 data Successfully loaded and transformed...
NBA Draft 2005 data Successfully loaded and transformed.