### Import Modules and Settings

In [1]:
import stats_nba_scraper as nba

import os
import path

import requests
requests.__path__
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from datetime import datetime, timedelta
import string
import re
from functools import reduce
import time
import random
 
import numpy as np
import pandas as pd
# Notebook-wide pandas display settings.
pd.options.display.max_rows = 500
pd.options.display.precision = 3
pd.options.display.max_columns = None
pd.set_option("display.float_format", "{:.2f}".format)

# Data directories, all rooted at the current working directory.
# Each path keeps its trailing slash because file names are appended to it by plain concatenation.
current_season_path = '{}/data/2019-20_raw/'.format(os.getcwd())
raw_path = '{}/data/raw/'.format(os.getcwd())
fc_path = '{}/data/external/'.format(os.getcwd())  # fantasycruncher csv files
fc_pickled_path = '{}/data/external/pickled_version/'.format(os.getcwd())  # pickled versions of FC .csv files
processed_path = '{}/data/processed/'.format(os.getcwd())  # fuzzy_matched files
model_path = '{}/data/model/'.format(os.getcwd())  # datasets to be used for modeling

# Data Acquisition

### stats.nba.com scraper

In [2]:
# Initiate driver instance
# Starts a Chrome session through Selenium using the chromedriver binary at driver_path.
# NOTE(review): the path is machine-specific — point it at the local chromedriver before running.
driver_path = '/usr/bin/chromedriver'
driver = webdriver.Chrome(executable_path = driver_path)

In [5]:
# Run the project scraper over game IDs 21800001 through 21801230 with the Selenium driver.
# NOTE(review): the third argument (1) and the role of current_season_path are defined in
# stats_nba_scraper — presumably the latter is where the pickles are written; confirm there.
nba.generate_game_log_range(21800001, 21801230, 1, driver, current_season_path)

Attempt 1: Game ID 0021801034 pickled successfully
Attempt 1: Game ID 0021801035 pickled successfully


### Automated FantasyCruncher.com CSV download

In [None]:
# Generate date ranges to automate csv download process
def generate_date_range(starting_game, ending_game):
    """Return every calendar date from starting_game through ending_game, inclusive.

    Parameters
    ----------
    starting_game, ending_game : int or str
        Dates written as YYYYMMDD (for example 20181016).

    Returns
    -------
    list of str
        Dates formatted 'YYYY-MM-DD' in ascending order. The list is empty
        when ending_game falls before starting_game.
    """
    first_day = datetime.strptime(str(starting_game), '%Y%m%d')
    last_day = datetime.strptime(str(ending_game), '%Y%m%d')
    span_in_days = (last_day - first_day).days

    # One entry per day offset; the +1 makes the end date part of the result.
    return [
        (first_day + timedelta(days = offset)).strftime('%Y-%m-%d')
        for offset in range(span_in_days + 1)
    ]

In [None]:
# Grabbing csv files from fantasy cruncher. You will need your own login information
# Credentials are read from the FC_EMAIL and FC_PASSWORD environment variables so they never
# have to be typed into (or committed with) the notebook.
starting_game = 20181016
ending_game = 20190410
date_list = generate_date_range(starting_game, ending_game)

fc_email = os.environ.get('FC_EMAIL')
fc_password = os.environ.get('FC_PASSWORD')
if not fc_email or not fc_password:
    # Fail before touching the browser: submitting the form with blank fields cannot log in.
    raise RuntimeError('Set the FC_EMAIL and FC_PASSWORD environment variables to your '
                       'FantasyCruncher login before running this cell.')

driver.get('https://www.fantasycruncher.com/lineup-rewind/')
time.sleep(1)
username = driver.find_element_by_id('user_email')
username.clear()
username.send_keys(fc_email)

password = driver.find_element_by_id('user_password')
password.clear()
password.send_keys(fc_password)

driver.find_element_by_id('submit').click()

# Visit the DraftKings NBA lineup-rewind page for each date and trigger its table action.
for date in date_list:
    driver.get('https://www.fantasycruncher.com/lineup-rewind/draftkings/NBA/{}'.format(date))
    time.sleep(3)  # fixed pause to let the page render before clicking

    driver.find_element_by_id('table-actions').click()
    # NOTE(review): index 3 is presumably the CSV export entry of the actions menu — verify
    # against the live page, since the position can change if the menu is reordered.
    driver.find_elements_by_class_name('table-actions-option')[3].click()
    time.sleep(2)  # fixed pause before moving on to the next date

In [12]:
# Generate date-list
starting_game = 20181016
ending_game = 20190410

# Parse the YYYYMMDD values first and do all arithmetic on real dates. Subtracting 1 from the
# integer would produce an invalid date whenever the start is the first day of a month.
starting_date = datetime.strptime(str(starting_game), '%Y%m%d')
ending_date = datetime.strptime(str(ending_game), '%Y%m%d')
season_length = (ending_date - starting_date).days

# Every date from starting_game through ending_game, inclusive, as 'YYYY-MM-DD'.
date_list = [
    (starting_date + timedelta(days = day_num)).strftime('%Y-%m-%d')
    for day_num in range(season_length + 1)
]

# This creates the dataframe for fantasy-cruncher csv files.
# Load one pickled frame per date, then concatenate and save once. Doing both inside the loop
# re-copies every earlier frame and rewrites the output file on each iteration.
daily_dfs = [
    pd.read_pickle(fc_pickled_path + 'draftkings_NBA_{}_players'.format(game_date))
    for game_date in date_list
]
df_fc = pd.concat(daily_dfs, axis = 0, ignore_index = True)
df_fc.to_pickle(fc_pickled_path + 'fc_2018-19')

# DataFrame Cleaning and Merging

In [55]:
# Reload the combined FantasyCruncher frame from the 'fc_2018-19' pickle under fc_pickled_path.
df_fc = pd.read_pickle(fc_pickled_path + 'fc_2018-19')

In [56]:
# Build the game-log frame for game IDs 21800001 through 21801230 with the project scraper module.
# NOTE(review): presumably this reads the per-game pickles stored under raw_path — confirm in stats_nba_scraper.
df = nba.merge_game_logs(21800001, 21801230, raw_path)

In [57]:
# NOTE: This is outdated code. The scraper for stats.nba.com was adjusted for changes to the
# site and was streamlined; the functions found in stats_nba_scraper.py are the updated ones.
# Rather than running the scraper again, the original pickled files are used for the merge
# since the information is still the same.

# Raw "did not play" descriptions (found in the FGM column) mapped to their short DND_TAG codes.
_DNP_TAGS = {
    'DND - Injury/Illness': 'DND_INJ',
    'DNP - Injury/Illness': 'DNP_INJ',
    'NWT - Suspended': 'NWT_SUS',
    'NWT - Personal': 'NWT_PER',
    'NWT - Injury/Illness': 'NWT_INJ',
    'DND - Rest': 'DND_REST',
    'NWT - Trade Pending': 'TR_PEN',
    'DND - Personal': 'DND_PER',
    'DND_COACH': 'DND_COA',
    'NWT_COACH': 'NWT_COA',
    'DNP - Coach\'s Decision': 'DNP_COA',
}

# Cell values that pd.to_numeric cannot parse and that are therefore zeroed out.
# NOTE(review): the comma-formatted numbers are presumably scrape artifacts — confirm their origin.
_NON_NUMERIC_VALUES = [
    '3,200.00', '9,600.00', '1,440.06', '14,405.76', '1,200.00',
    '2,400.00', '1,066.67', '1,440.01', '1,107.73', '-',
]


def _minutes_from_clock(clock):
    """Convert an 'MM:SS' string to decimal minutes, rounded to 4 places."""
    # Parsed by hand rather than with strptime('%M:%S') so that totals of 60 minutes or more
    # (possible in multi-overtime games) are accepted as well.
    minutes, seconds = clock.split(':')
    return round(int(minutes) + int(seconds) / 60, 4)


def clean_merged_game_log(df):
    """Clean a merged game-log frame and return the cleaned copy.

    The input frame is not modified. Steps, in order:

    1. Rename the column at position 10 to 'MP', drop 'MIN_y', 'MIN' and the
       empty-named column, and drop duplicated column names.
    2. Insert 'DND_TAG' at position 11: the short code for the did-not-play
       description found in 'FGM', or 'Active' when there is none.
    3. Replace did-not-play descriptions and known unparseable values with 0
       everywhere, and fill missing values with 0.
    4. Strip a trailing ' F', ' C' or ' G' from 'Player' and parse 'Date'.
    5. Normalise 'MP' (empty or missing -> '00:00', ':60' -> ':59') and insert
       'MIN' (decimal minutes) at position 11.
    6. Convert 'Team_Score', 'Opp_Score' and every column from position 13
       onward with pd.to_numeric, then upper-case all column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged game log. Must contain the columns named above; the column
        positions used here depend on the layout of the merged frame.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with upper-case column names.
    """
    # rename()/drop() return new frames, so the caller's DataFrame is left untouched.
    df = df.rename(columns = {df.columns[10]: 'MP'})
    df = df.drop(columns = ['MIN_y', 'MIN', ''])
    df = df.loc[:, ~df.columns.duplicated()].copy()

    # Tag each row before the descriptions are overwritten with 0 below.
    df.insert(11, 'DND_TAG', [_DNP_TAGS.get(value, 'Active') for value in df['FGM']])

    df['MP'] = df['MP'].map(lambda clock: '00:00' if clock == '' else clock)

    df = df.replace(list(_DNP_TAGS) + _NON_NUMERIC_VALUES, 0)
    df = df.fillna(0)

    position_tag_cleaning = [' F', ' C', ' G']
    df['Player'] = df['Player'].map(lambda name: name[:-2] if name[-2:] in position_tag_cleaning else name)

    # infer_datetime_format is not passed: it is deprecated in pandas 2.x, where it has no effect.
    df['Date'] = pd.to_datetime(df['Date'])

    # A missing MP is 0 (not a string) after fillna; treat it as no time played.
    # ':60' is rewritten to ':59' so the clock string stays a valid MM:SS value.
    df['MP'] = df['MP'].map(lambda clock: clock.replace(':60', ':59') if isinstance(clock, str) else '00:00')
    df.insert(11, 'MIN', [_minutes_from_clock(clock) for clock in df['MP']])

    # Convert all required columns to numeric dtypes. to_numeric will decide on int or float
    columns_to_numeric = ['Team_Score', 'Opp_Score'] + list(df.columns[13:])
    df[columns_to_numeric] = df[columns_to_numeric].apply(pd.to_numeric)

    return df.rename(columns = lambda column: column.upper())

In [None]:
# Clean the merged game log. The result replaces df, and the cleaned frame has upper-case
# column names, so re-running this cell first requires re-running the merge cell.
df = clean_merged_game_log(df)

In [59]:
# Collect the distinct player names from each source so spelling mismatches can be compared.
positions = ['F', 'G', 'C']
player_names_uncleaned = sorted(df['PLAYER'].unique())

# A name whose last character is a position letter loses its final two characters;
# every name is then trimmed of surrounding whitespace.
# NOTE(review): any name that merely ends in F, G or C is shortened as well — confirm this is intended.
cleaned_list = [
    (player[:-2] if player[-1:] in positions else player).strip()
    for player in player_names_uncleaned
]
unique_nba_names = sorted(set(cleaned_list))

fc_names_uncleaned = sorted(df_fc['Player'].unique())
unique_fc_names = [name.strip() for name in fc_names_uncleaned]

# One single-column frame per source (the column is labelled 0).
name_df = pd.DataFrame(unique_nba_names)
fc_name_df = pd.DataFrame(unique_fc_names)

In [60]:
# Label every name with the source it came from so the outer merge shows where it was found.
name = ['NBA'] * len(name_df)
name_df['source'] = name

fc_name = ['FC'] * len(fc_name_df)
fc_name_df['source'] = fc_name

# Outer join on the name column (labelled 0); a name missing from one source gets NaN there.
name_merge_df = name_df.merge(fc_name_df, how = 'outer', on = 0, suffixes = ('_n', '_f'))

# Keep only the names that are absent from at least one of the two sources.
found_in_both = (name_merge_df['source_n'] == 'NBA') & (name_merge_df['source_f'] == 'FC')
unmatched_names = name_merge_df[~found_in_both]

unmatched_names.sort_values(by = 0)

In [61]:
# Manual name reconciliation: keys are player names as they appear in the FantasyCruncher
# frame (df_fc), values are the spellings they are rewritten to before the two sources are merged.
# Names without an entry here are left unchanged by the mapping step.
fc_to_nba_names = {
    'B.J. Johnson': 'BJ Johnson',
    'Bruce Brown Jr.': 'Bruce Brown Jr',
    'C.J. McCollum': 'CJ McCollum',
    'C.J. Miles': 'CJ Miles',
    'Cristiano Da Silva Felicio': 'Cristiano Felicio',
    'D.J. Stephens': 'DJ Stephens',
    'Danuel House': 'Danuel House Jr.',
    'DeAndre Bembry': 'DeAndre\' Bembry',
    'Frank Mason III': 'Frank Mason',
    'Guillermo Hernangomez': 'Willy Hernangomez',
    'Harry Giles': 'Harry Giles III',
    'Ishmael Smith': 'Ish Smith',
    'J.J. Redick': 'JJ Redick',
    'J.R. Smith': 'JR Smith',
    'Jacob Evans III': 'Jacob Evans',
    'James Ennis': 'James Ennis III',
    'Juan Hernangomez': 'Juancho Hernangomez',
    'Juan Jose Barea': 'J.J. Barea',
    'Louis Williams': 'Lou Williams',
    'Melvin Frazier': 'Melvin Frazier Jr.',
    'Mitch Creek': 'Mitchell Creek',
    'Mohamed Bamba': 'Mo Bamba',
    'Naz Long': 'Naz Mitrou-Long',
    'Nene Hilario': 'Nene',
    'Otto Porter': 'Otto Porter Jr.',
    'P.J. Tucker': 'PJ Tucker',
    'Patrick Mills': 'Patty Mills',
    'R.J. Hunter': 'RJ Hunter',
    'Raulzinho Neto': 'Raul Neto',
    'Robert Williams': 'Robert Williams III',
    'Troy Brown': 'Troy Brown Jr.',
    'Wade Baldwin': 'Wade Baldwin IV',
    'Walt Lemon Jr.': 'Walter Lemon Jr.',
    'Wesley Iwundu': 'Wes Iwundu',
}



In [62]:
# Rewrite FantasyCruncher player names through fc_to_nba_names; names without an entry map
# to NaN and are restored from the original column by fillna.
df_fc['Player'] = df_fc['Player'].map(fc_to_nba_names).fillna(df_fc['Player'])

# Make both date columns datetime so they format identically when the merge key is built.
# infer_datetime_format is not passed: it is deprecated in pandas 2.x, where it has no effect.
df_fc['Date'] = pd.to_datetime(df_fc['Date'])
df['DATE'] = pd.to_datetime(df['DATE'])

In [63]:
# Trim leading/trailing whitespace from every player name so the string merge key matches exactly.
df['PLAYER'] = df['PLAYER'].apply(lambda x: x.strip())

In [64]:
# Build a "<player> <mm-dd-yy>" key on both frames, then left-join the FantasyCruncher
# columns onto the game log through that key.
merge_key_date_format = '%m-%d-%y'
df['merge'] = df['PLAYER'] + ' ' + df['DATE'].map(
    lambda game_day: datetime.strftime(game_day, merge_key_date_format))
df_fc['merge'] = df_fc['Player'] + ' ' + df_fc['Date'].map(
    lambda game_day: datetime.strftime(game_day, merge_key_date_format))

# These FantasyCruncher columns are removed before the join. The drop is in place,
# so this cell cannot be re-run without reloading df_fc first.
df_fc.drop(columns = ['Player', 'Team', 'Opp', 'Date', 'FGA', 'Asts', 'Mins'], inplace = True)
df = pd.merge(df, df_fc, how = 'left', on = 'merge')

In [65]:
# Persist the merged frame as a pickle under model_path.
df.to_pickle(model_path + '2018-19_season_fc_merged_v2')