In [1]:
import numpy as np
import urllib.request
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
import lxml.html
from lxml import etree
import re
import time
import pandas as pd
from functools import reduce
from operator import itemgetter

In [2]:
def ConvertDataFrame(df):
    if 'team' in df.columns:
        cols = df.columns.drop(['lineup_name', 'team', 'code', 'sorted_code'])
    else:
        cols = df.columns.drop(['lineup_name', 'code', 'sorted_code'])
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)
    df['year'] = df['year'].astype('int')
    df = df.drop_duplicates(subset=['code'], keep=False)
    return df

In [3]:
def OldFetchStatsTables(urls, years, col_list, cols_after_name, buttons):
    arr = []
    driver = webdriver.Chrome()
    driver.implicitly_wait(120)
    for i,url in enumerate(urls):
        year = years[i]
        driver.get(url)
        time.sleep(10)
        for button in buttons:
            print("Fetching lineup stats for the", button, "from the", year, "season...")
            sel = Select(driver.find_element_by_name('TeamID'))
            time.sleep(1)
            sel.select_by_visible_text(button)
            time.sleep(3)
            sel2 = Select(driver.find_element_by_class_name('stats-table-pagination__select'))
            time.sleep(1)
            sel2.select_by_visible_text("All")
            time.sleep(3)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            time.sleep(1)
            #(driver.page_source).encode('utf-8')
            results = driver.find_elements_by_xpath("//*[@class='nba-stat-table__overflow']//table/tbody/tr")
            time.sleep(1)

            counter = 0
            for result in results:
                item = result.text
                data = item.split()
                data[0:-cols_after_name] = [' '.join(data[0:-cols_after_name])]
                team = str(data[1])
                if float(data[2]) == 0. or float(data[3]) == 0.:
                    continue
                data = list(itemgetter(*col_list)(data))
                data[0] = data[0].replace('.', '')
                code_arr = [s.replace(' ', '') for s in data[0].split(',')]
                code = ''.join(code_arr) + team + str(year)
                data.insert(1, str(code))
                data.insert(2, int(year))
                arr.append(data)
                counter += 1
                #print(data)

            print("Fetched stats for", counter, "NBA player lineups.")
            time.sleep(1)
        
        print("Fetched lineup stats for all teams in the", year, "season.\n")
        time.sleep(1)
        
    driver.quit()
    time.sleep(1)
    #driver.quit()
    #print(arr)
    return np.array(arr)

In [4]:
def FetchStatsTables(urls, years, col_list, buttons):
    arr = []
    opt = FirefoxOptions()
    opt.add_argument("--headless")
    driver = webdriver.Firefox(options=opt)
    for i,url in enumerate(urls):
        year = years[i]
        driver.get(url)
        time.sleep(5)
        wait = WebDriverWait(driver, 60)
        for button in buttons:
            print("Fetching lineup stats for the", button, "from the", year, "season...")
            
            wait.until(EC.presence_of_element_located((By.XPATH, "//select[@name='TeamID']")))
            sel = Select(driver.find_element_by_name('TeamID'))
            time.sleep(1)
            sel.select_by_visible_text(button)
            time.sleep(1)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            wait.until(EC.presence_of_element_located((By.XPATH, "//select[contains(@class, 'stats-table-pagination__select')]")))
            sel2 = Select(driver.find_element_by_class_name('stats-table-pagination__select'))
            sel2.select_by_visible_text("All")
            time.sleep(1)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            
            root = lxml.html.fromstring(driver.page_source)
            results = root.xpath("//*[@class='nba-stat-table__overflow']//table/tbody/tr")
            
            counter = 0
            for result in results:
                item = result.xpath("./td//text()")
                item = [re.sub('\n +', '', x) for x in item]
                data = [x for x in item if x != '' and x != '\n']
                team = str(data[1])
                if float(data[2]) == 0. or float(data[3]) == 0.:
                    continue
                data = list(itemgetter(*col_list)(data))
                data = [s.strip('%') for s in data]
                data[0] = data[0].replace('.', '')
                #code_arr = [s.replace(' ', '') for s in data[0].split(',')]
                code_arr = [s.replace(' ', '') for s in data[0].split(', ')]
                sorted_code_arr = [s.replace(' ', '') for s in sorted(data[0].split(', '))]
                code = ''.join(code_arr) + team + str(year)
                sorted_code = ''.join(sorted_code_arr) + team + str(year)              
                data.insert(1, str(code))
                data.insert(2, str(sorted_code))
                data.insert(3, int(year))
                arr.append(data)
                counter += 1
                #print(data)

            print("Fetched stats for", counter, "NBA player lineups.")
            time.sleep(1)
        
        print("Fetched lineup stats for all teams in the", year, "season.\n")
        time.sleep(1)
        
    driver.quit()
    #print(arr)
    return np.array(arr)

In [5]:
# Establish the years for which we want to fetch lineup data
ya = [str(n).zfill(2) for n in range(16, 20)]
yb = [str(n).zfill(2) for n in range(17, 21)]
years = [int("20"+y) for y in yb]
#teams = ['Atlanta Hawks', 'Brooklyn Nets']
teams = ['Atlanta Hawks', 'Brooklyn Nets', 'Boston Celtics', 'Charlotte Hornets', 'Chicago Bulls', 'Cleveland Cavaliers', 'Dallas Mavericks', 'Denver Nuggets', 'Detroit Pistons', 'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers', 'LA Clippers', 'Los Angeles Lakers', 'Memphis Grizzlies', 'Miami Heat', 'Milwaukee Bucks', 'Minnesota Timberwolves', 'New Orleans Pelicans', 'New York Knicks', 'Oklahoma City Thunder', 'Orlando Magic', 'Philadelphia 76ers', 'Phoenix Suns', 'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs', 'Toronto Raptors', 'Utah Jazz', 'Washington Wizards']


In [6]:
# Create URLs for the available years of NBA.com traditional/basic lineup data (per 36 minutes), 
# fetch the data in 2D array format, and put into a Pandas dataframe
#urls = [ "https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&PerMode=Per36" ]
urls = [ "https://stats.nba.com/lineups/traditional/?Season=20{0}-{1}&SeasonType=Regular%20Season&PerMode=Per36".format(ya[i], yb[i]) for i in range(len(ya)) ]
#print(urls)
np_arr_basic = FetchStatsTables(urls, years, [i for i in np.arange(25)], teams)


Fetching lineup stats for the Atlanta Hawks from the 2017 season...
Fetched stats for 479 NBA player lineups.
Fetching lineup stats for the Brooklyn Nets from the 2017 season...
Fetched stats for 564 NBA player lineups.
Fetching lineup stats for the Boston Celtics from the 2017 season...
Fetched stats for 431 NBA player lineups.
Fetching lineup stats for the Charlotte Hornets from the 2017 season...
Fetched stats for 259 NBA player lineups.
Fetching lineup stats for the Chicago Bulls from the 2017 season...
Fetched stats for 436 NBA player lineups.
Fetching lineup stats for the Cleveland Cavaliers from the 2017 season...
Fetched stats for 456 NBA player lineups.
Fetching lineup stats for the Dallas Mavericks from the 2017 season...
Fetched stats for 476 NBA player lineups.
Fetching lineup stats for the Denver Nuggets from the 2017 season...
Fetched stats for 404 NBA player lineups.
Fetching lineup stats for the Detroit Pistons from the 2017 season...
Fetched stats for 224 NBA player li

Fetching lineup stats for the LA Clippers from the 2019 season...
Fetched stats for 373 NBA player lineups.
Fetching lineup stats for the Los Angeles Lakers from the 2019 season...
Fetched stats for 595 NBA player lineups.
Fetching lineup stats for the Memphis Grizzlies from the 2019 season...
Fetched stats for 686 NBA player lineups.
Fetching lineup stats for the Miami Heat from the 2019 season...
Fetched stats for 449 NBA player lineups.
Fetching lineup stats for the Milwaukee Bucks from the 2019 season...
Fetched stats for 569 NBA player lineups.
Fetching lineup stats for the Minnesota Timberwolves from the 2019 season...
Fetched stats for 437 NBA player lineups.
Fetching lineup stats for the New Orleans Pelicans from the 2019 season...
Fetched stats for 589 NBA player lineups.
Fetching lineup stats for the New York Knicks from the 2019 season...
Fetched stats for 611 NBA player lineups.
Fetching lineup stats for the Oklahoma City Thunder from the 2019 season...
Fetched stats for 35

In [9]:
# Convert the dataframe to one with appropriate data types
df_basic = pd.DataFrame(np_arr_basic, columns=['lineup_name', 'code', 'sorted_code', 'year', 'team', 'GPT', 'MPT', 'PTST_PT', 'FGMT_PT', 'FGAT_PT', 'FGPT_PT', '3PMT_PT', '3PAT_PT', '3PPT_PT', 'FTMT_PT', 'FTAT_PT', 'FTPT_PT', 'ORBT_PT', 'DRBT_PT', 'TRBT_PT', 'ASTT_PT', 'TOVT_PT', 'STLT_PT', 'BLKT_PT', 'BLKAT_PT', 'PFT_PT', 'PFDT_PT', 'PMT_PT'])
df_basic = ConvertDataFrame(df_basic)
print(df_basic)


                                             lineup_name  \
0      D Howard, T Sefolosha, P Millsap, K Bazemore, ...   
1      D Howard, P Millsap, K Bazemore, D Schroder, T...   
2      K Korver, D Howard, P Millsap, K Bazemore, D S...   
3      D Howard, T Sefolosha, P Millsap, D Schroder, ...   
4      D Howard, E Ilyasova, D Schroder, T Hardaway J...   
...                                                  ...   
45199  D Bertans, J McRae, T Bryant, R Hachimura, I B...   
45200  I Mahinmi, D Bertans, I Thomas, J McRae, T Bro...   
45201  I Mahinmi, I Smith, B Beal, T Brown Jr, A Scho...   
45202  I Mahinmi, I Smith, D Bertans, T Brown Jr, R H...   
45203  I Smith, J McRae, G Payton II, T Brown Jr, I B...   

                                                    code  \
0      DHowardTSefoloshaPMillsapKBazemoreDSchroderATL...   
1      DHowardPMillsapKBazemoreDSchroderTHardawayJrAT...   
2        KKorverDHowardPMillsapKBazemoreDSchroderATL2017   
3      DHowardTSefoloshaPMillsapDSchrod

In [10]:
# Create URLs for the available years of NBA.com traditional/basic lineup data (per 100 possessions), 
# fetch the data in 2D array format, and put into a Pandas dataframe
urls = [ "https://stats.nba.com/lineups/traditional/?Season=20{0}-{1}&SeasonType=Regular%20Season&PerMode=Per100Possessions".format(ya[i], yb[i]) for i in range(len(ya)) ]
np_arr_basic2 = FetchStatsTables(urls, years, [i for i in np.arange(25) if i != 1 and i != 2], teams)


Fetching lineup stats for the Atlanta Hawks from the 2017 season...
Fetched stats for 511 NBA player lineups.
Fetching lineup stats for the Brooklyn Nets from the 2017 season...
Fetched stats for 613 NBA player lineups.
Fetching lineup stats for the Boston Celtics from the 2017 season...
Fetched stats for 456 NBA player lineups.
Fetching lineup stats for the Charlotte Hornets from the 2017 season...
Fetched stats for 271 NBA player lineups.
Fetching lineup stats for the Chicago Bulls from the 2017 season...
Fetched stats for 463 NBA player lineups.
Fetching lineup stats for the Cleveland Cavaliers from the 2017 season...
Fetched stats for 493 NBA player lineups.
Fetching lineup stats for the Dallas Mavericks from the 2017 season...
Fetched stats for 519 NBA player lineups.
Fetching lineup stats for the Denver Nuggets from the 2017 season...
Fetched stats for 430 NBA player lineups.
Fetching lineup stats for the Detroit Pistons from the 2017 season...
Fetched stats for 243 NBA player li

Fetching lineup stats for the LA Clippers from the 2019 season...
Fetched stats for 431 NBA player lineups.
Fetching lineup stats for the Los Angeles Lakers from the 2019 season...
Fetched stats for 630 NBA player lineups.
Fetching lineup stats for the Memphis Grizzlies from the 2019 season...
Fetched stats for 736 NBA player lineups.
Fetching lineup stats for the Miami Heat from the 2019 season...
Fetched stats for 479 NBA player lineups.
Fetching lineup stats for the Milwaukee Bucks from the 2019 season...
Fetched stats for 609 NBA player lineups.
Fetching lineup stats for the Minnesota Timberwolves from the 2019 season...
Fetched stats for 476 NBA player lineups.
Fetching lineup stats for the New Orleans Pelicans from the 2019 season...
Fetched stats for 635 NBA player lineups.
Fetching lineup stats for the New York Knicks from the 2019 season...
Fetched stats for 663 NBA player lineups.
Fetching lineup stats for the Oklahoma City Thunder from the 2019 season...
Fetched stats for 37

In [11]:
# Convert the dataframe to one with appropriate data types
df_basic2 = pd.DataFrame(np_arr_basic2, columns=['lineup_name', 'code', 'sorted_code', 'year', 'MPT_PH', 'PTST_PH', 'FGMT_PH', 'FGAT_PH', 'FGPT_PH', '3PMT_PH', '3PAT_PH', '3PPT_PH', 'FTMT_PH', 'FTAT_PH', 'FTPT_PH', 'ORBT_PH', 'DRBT_PH', 'TRBT_PH', 'ASTT_PH', 'TOVT_PH', 'STLT_PH', 'BLKT_PH', 'BLKAT_PH', 'PFT_PH', 'PFDT_PH', 'PMT_PH'])
df_basic2 = ConvertDataFrame(df_basic2)
print(df_basic2)


                                             lineup_name  \
0      D Howard, J Calderon, M Muscala, T Hardaway Jr...   
1      K Korver, M Scott, K Bazemore, M Muscala, T Pr...   
2      M Dunleavy, D Howard, P Millsap, K Bazemore, M...   
3      M Dunleavy, K Humphries, M Muscala, T Hardaway...   
4      K Korver, D Howard, T Sefolosha, P Millsap, K ...   
...                                                  ...   
48556  I Mahinmi, B Beal, G Payton II, T Brown Jr, I ...   
48557   I Smith, D Bertans, I Thomas, B Beal, T Brown Jr   
48558    C Miles, B Beal, T Bryant, R Hachimura, I Bonga   
48559  D Bertans, B Beal, M Wagner, R Hachimura, C Ch...   
48560  D Bertans, B Beal, A Pasecniks, T Brown Jr, I ...   

                                                    code  \
0      DHowardJCalderonMMuscalaTHardawayJrDBembryATL2017   
1           KKorverMScottKBazemoreMMuscalaTPrinceATL2017   
2       MDunleavyDHowardPMillsapKBazemoreMDelaneyATL2017   
3      MDunleavyKHumphriesMMuscalaTHard

In [13]:
# Create URLs for the available years of NBA.com advanced lineup data, 
# fetch the data in 2D array format, and put into a Pandas dataframe
#urls = [ "https://stats.nba.com/lineups/advanced/?Season=2018-19&SeasonType=Regular%20Season" ]
urls = [ "https://stats.nba.com/lineups/advanced/?Season=20{0}-{1}&SeasonType=Regular%20Season".format(ya[i], yb[i]) for i in range(len(ya)) ]
np_arr_adv = FetchStatsTables(urls, years, [0,4,5,6,7,8,9,10,11,12,13,14,15,16,17], teams)


Fetching lineup stats for the Atlanta Hawks from the 2017 season...
Fetched stats for 479 NBA player lineups.
Fetching lineup stats for the Brooklyn Nets from the 2017 season...
Fetched stats for 564 NBA player lineups.
Fetching lineup stats for the Boston Celtics from the 2017 season...
Fetched stats for 431 NBA player lineups.
Fetching lineup stats for the Charlotte Hornets from the 2017 season...
Fetched stats for 259 NBA player lineups.
Fetching lineup stats for the Chicago Bulls from the 2017 season...
Fetched stats for 436 NBA player lineups.
Fetching lineup stats for the Cleveland Cavaliers from the 2017 season...
Fetched stats for 456 NBA player lineups.
Fetching lineup stats for the Dallas Mavericks from the 2017 season...
Fetched stats for 476 NBA player lineups.
Fetching lineup stats for the Denver Nuggets from the 2017 season...
Fetched stats for 404 NBA player lineups.
Fetching lineup stats for the Detroit Pistons from the 2017 season...
Fetched stats for 224 NBA player li

Fetching lineup stats for the LA Clippers from the 2019 season...
Fetched stats for 373 NBA player lineups.
Fetching lineup stats for the Los Angeles Lakers from the 2019 season...
Fetched stats for 595 NBA player lineups.
Fetching lineup stats for the Memphis Grizzlies from the 2019 season...
Fetched stats for 686 NBA player lineups.
Fetching lineup stats for the Miami Heat from the 2019 season...
Fetched stats for 449 NBA player lineups.
Fetching lineup stats for the Milwaukee Bucks from the 2019 season...
Fetched stats for 569 NBA player lineups.
Fetching lineup stats for the Minnesota Timberwolves from the 2019 season...
Fetched stats for 437 NBA player lineups.
Fetching lineup stats for the New Orleans Pelicans from the 2019 season...
Fetched stats for 589 NBA player lineups.
Fetching lineup stats for the New York Knicks from the 2019 season...
Fetched stats for 611 NBA player lineups.
Fetching lineup stats for the Oklahoma City Thunder from the 2019 season...
Fetched stats for 35

In [14]:
# Convert the dataframe to one with appropriate data types
df_adv = pd.DataFrame(np_arr_adv, columns=['lineup_name', 'code', 'sorted_code', 'year', 'OFFRTGT', 'DEFRTGT', 'NETRTGT', 'ASTPT', 'ATRT', 'ASTRT', 'ORBRT', 'DRBRT', 'TRBRT', 'TORT', 'EFGPT', 'TST', 'PACET', 'PIET'])
df_adv = ConvertDataFrame(df_adv)
print(df_adv)


                                             lineup_name  \
0      D Howard, T Sefolosha, P Millsap, K Bazemore, ...   
1      D Howard, P Millsap, K Bazemore, D Schroder, T...   
2      K Korver, D Howard, P Millsap, K Bazemore, D S...   
3      D Howard, T Sefolosha, P Millsap, D Schroder, ...   
4      D Howard, E Ilyasova, D Schroder, T Hardaway J...   
...                                                  ...   
45199  I Thomas, J McRae, A Pasecniks, I Bonga, A Sch...   
45200  I Thomas, J McRae, A Pasecniks, I Bonga, G Mat...   
45201  I Thomas, J McRae, A Pasecniks, J Williams, G ...   
45202  I Thomas, J McRae, A Pasecniks, T Brown Jr, J ...   
45203  J McRae, A Pasecniks, T Brown Jr, A Schofield,...   

                                                    code  \
0      DHowardTSefoloshaPMillsapKBazemoreDSchroderATL...   
1      DHowardPMillsapKBazemoreDSchroderTHardawayJrAT...   
2        KKorverDHowardPMillsapKBazemoreDSchroderATL2017   
3      DHowardTSefoloshaPMillsapDSchrod

In [19]:
# Merge lineup dataframes on the sorted lineup code
#dfb = df_basic.drop([''], axis=1)
dfb = df_basic.copy()
dfb2 = df_basic2.drop(['lineup_name', 'code', 'year'], axis=1)
dfa = df_adv.drop(['lineup_name', 'code', 'year'], axis=1)
df = pd.merge(dfb, dfb2, on=['sorted_code'], how='inner')
df = pd.merge(df, dfa, on=['sorted_code'], how='inner')

#print(df.columns)

# Add any extra columns
df['2PAT_PT'] = df['FGAT_PT']-df['3PAT_PT']
df['2PMT_PT'] = df['FGMT_PT']-df['3PMT_PT']
df['2PPT_PT'] = df['2PMT_PT']/df['2PAT_PT']
df['2PAT_PH'] = df['FGAT_PH']-df['3PAT_PH']
df['2PMT_PH'] = df['FGMT_PH']-df['3PMT_PH']
df['2PPT_PH'] = df['2PMT_PH']/df['2PAT_PH']
df['2PPT_PT'].fillna(0, inplace=True)
df['2PPT_PH'].fillna(0, inplace=True)

# Add minutes-per-game played together for lineups
df['MPGT'] = df['MPT']/df['GPT']


                                             lineup_name  \
0      D Howard, T Sefolosha, P Millsap, K Bazemore, ...   
1      D Howard, P Millsap, K Bazemore, D Schroder, T...   
2      K Korver, D Howard, P Millsap, K Bazemore, D S...   
3      D Howard, T Sefolosha, P Millsap, D Schroder, ...   
4      D Howard, E Ilyasova, D Schroder, T Hardaway J...   
...                                                  ...   
45219  D Bertans, J McRae, T Bryant, R Hachimura, I B...   
45220  I Mahinmi, D Bertans, I Thomas, J McRae, T Bro...   
45221  I Mahinmi, I Smith, B Beal, T Brown Jr, A Scho...   
45222  I Mahinmi, I Smith, D Bertans, T Brown Jr, R H...   
45223  I Smith, J McRae, G Payton II, T Brown Jr, I B...   

                                                    code  \
0      DHowardTSefoloshaPMillsapKBazemoreDSchroderATL...   
1      DHowardPMillsapKBazemoreDSchroderTHardawayJrAT...   
2        KKorverDHowardPMillsapKBazemoreDSchroderATL2017   
3      DHowardTSefoloshaPMillsapDSchrod

In [20]:
df.to_csv("NBALineupStats.csv")
print(df)


                                             lineup_name  \
0      D Howard, T Sefolosha, P Millsap, K Bazemore, ...   
1      D Howard, P Millsap, K Bazemore, D Schroder, T...   
2      K Korver, D Howard, P Millsap, K Bazemore, D S...   
3      D Howard, T Sefolosha, P Millsap, D Schroder, ...   
4      D Howard, E Ilyasova, D Schroder, T Hardaway J...   
...                                                  ...   
45219  D Bertans, J McRae, T Bryant, R Hachimura, I B...   
45220  I Mahinmi, D Bertans, I Thomas, J McRae, T Bro...   
45221  I Mahinmi, I Smith, B Beal, T Brown Jr, A Scho...   
45222  I Mahinmi, I Smith, D Bertans, T Brown Jr, R H...   
45223  I Smith, J McRae, G Payton II, T Brown Jr, I B...   

                                                    code  \
0      DHowardTSefoloshaPMillsapKBazemoreDSchroderATL...   
1      DHowardPMillsapKBazemoreDSchroderTHardawayJrAT...   
2        KKorverDHowardPMillsapKBazemoreDSchroderATL2017   
3      DHowardTSefoloshaPMillsapDSchrod