In [80]:
import pandas as pd
import os.path
import time
from os import path
from pybaseball import playerid_lookup
from pybaseball import pitching_stats_range
from datetime import datetime, timedelta

In [106]:
# Fixed parameters for the whole notebook – don't change.
lookback = 30  # look-back window, in days
# Pitching columns to pull (limits file size); not referenced by the cells visible here.
pitch_pull_cols = [
    'H', 'BB', 'SO', 'HR', 'HBP', 'AB', '2B',
    '3B', 'IBB', 'SF', 'SB', 'GB/FB', 'LD', 'SO/W',
]
# Earliest game date kept for each season, as a YYYYMMDD integer.
cutoff_18, cutoff_17, cutoff_16 = 20180501, 20170501, 20160501

## Starting with 2018

In [16]:
# The 2018 game log has no header row, so its columns are addressed by integer position.
bs_2018 = pd.read_csv(filepath_or_buffer='data/GL2018.TXT', header=None)

In [17]:
# Keep games that were completed the same day: take column 0 (the game date)
# from the rows whose column 13 is empty (NaN).
dates_18 = bs_2018[0][bs_2018[13].isna()]

In [19]:
# Keep the distinct dates on or after the 2018 cutoff, in order of first appearance.
# pd.unique() accepts both a Series and an ndarray, so this cell can be re-run
# after dates_18 has already been turned into an array (Series.unique() could not).
dates_18 = pd.unique(dates_18[dates_18 >= cutoff_18])

`dates_18` now holds every 2018 date that I need to query for.

In [86]:
# Tracker of which dates have been dealt with (local: 0 = pending, 1 = dealt with).
track_18 = pd.DataFrame(columns=['date', 'local'])
track_18['date'] = dates_18
if not os.path.exists('track_18.csv'):  # first run only: create the tracking file
    track_18['local'] = 0
    track_18.to_csv('track_18.csv', index=False)

In [91]:
def pitching_stats(date, lookback=30):
    """Fetch pitching stats for the window leading up to ``date``.

    ``str(date)`` must be in ``YYYYMMDD`` form.  ``pitching_stats_range`` is
    called with two ``YYYY-MM-DD`` strings: the day ``lookback`` days before
    ``date`` and the day before ``date``.

    Returns whatever ``pitching_stats_range`` returns.  If that call raises
    ``IndexError``, a message is printed and ``None`` is returned.
    """
    iso = '%Y-%m-%d'
    target = datetime.strptime(str(date), '%Y%m%d')
    window_start = (target - timedelta(days=lookback)).strftime(iso)
    window_end = (target - timedelta(days=1)).strftime(iso)
    try:
        return pitching_stats_range(window_start, window_end)
    except IndexError:
        print('No pitching stats within range!')
        return None

# Reload the tracker from disk so flags set by an earlier run are picked up.
track_18 = pd.read_csv('track_18.csv')

def localize(track=track_18, start=0, end=500, timeout=2.0,
             track_path='track_18.csv', out_dir='pitch'):
    """Download and save pitching stats for every date not yet dealt with.

    Walks the rows of ``track`` selected by ``[start:end]`` (row positions,
    not index labels).  For each row whose second column is 0, the stats for
    the date in the first column are fetched with ``pitching_stats`` and
    written to ``<out_dir>/<date>.csv``; the row is then flagged 1 and the
    tracker is saved to ``track_path``, so an interrupted run can be resumed.

    Parameters
    ----------
    track : DataFrame
        Date in column 0, 0/1 flag in column 1.  Modified in place.
    start, end : int
        Slice bounds over the row positions of ``track``.
    timeout : float
        Seconds to sleep after each request.
    track_path : str
        CSV file the tracker is written to after each saved date.
    out_dir : str
        Directory that receives the per-date CSV files; created if missing.

    A date for which ``pitching_stats`` returns ``None`` is reported as
    skipped and keeps its 0 flag, so a later run tries it again.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Iterate positions so that iloc is correct whatever index `track` carries.
    for pos in range(len(track))[start:end]:
        rdate = track.iloc[pos, 0]
        if track.iloc[pos, 1] != 0:
            continue  # already flagged by an earlier run

        pitch_stats = pitching_stats(rdate)
        time.sleep(timeout)  # pause after every request, successful or not
        if pitch_stats is None:
            # pitching_stats found nothing; do not flag the date.
            print('{} Skipped!'.format(str(rdate)))
            continue

        pitch_stats.to_csv(path_or_buf='{}/{}.csv'.format(out_dir, str(rdate)), index=False)
        track.iloc[pos, 1] = 1
        # Persist progress only after the data file has been written.
        track.to_csv(path_or_buf=track_path, index=False)
        print('{} Done!'.format(str(rdate)))
    print('All Done!')

In [93]:
# Run the download loop with its defaults (the 2018 tracker, rows 0-500, 2 s pause).
# NOTE: localize is redefined further down for 2017 and 2016; re-run the
# definition cell directly above before re-running this one.
localize()

20180501 Done!
20180502 Done!
20180503 Done!
20180504 Done!
20180505 Done!
20180506 Done!
20180507 Done!
20180508 Done!
20180509 Done!
20180510 Done!
20180511 Done!
20180512 Done!
20180513 Done!
20180514 Done!
20180515 Done!
20180516 Done!
20180517 Done!
20180518 Done!
20180519 Done!
20180520 Done!
20180521 Done!
20180522 Done!
20180523 Done!
20180524 Done!
20180525 Done!
20180526 Done!
20180527 Done!
20180528 Done!
20180529 Done!
20180530 Done!
20180531 Done!
20180601 Done!
20180602 Done!
20180603 Done!
20180604 Done!
20180605 Done!
20180606 Done!
20180607 Done!
20180608 Done!
20180609 Done!
20180610 Done!
20180611 Done!
20180612 Done!
20180613 Done!
20180614 Done!
20180615 Done!
20180616 Done!
20180617 Done!
20180618 Done!
20180619 Done!
20180620 Done!
20180621 Done!
20180622 Done!
20180623 Done!
20180624 Done!
20180625 Done!
20180626 Done!
20180627 Done!
20180628 Done!
20180629 Done!
20180630 Done!
20180701 Done!
20180702 Done!
20180703 Done!
20180704 Done!
20180705 Done!
20180706 D

## Now deal with 2017

In [102]:
# The 2017 game log has no header row, so its columns are addressed by integer position.
bs_2017 = pd.read_csv(filepath_or_buffer='data/GL2017.TXT', header=None)
# Keep games that were completed the same day (column 13 empty), take their
# date (column 0), apply the 2017 cutoff and reduce to distinct values.
dates_17 = bs_2017[0][bs_2017[13].isna()]
dates_17 = dates_17.loc[dates_17.ge(cutoff_17)].unique()

# Tracker of which dates have been dealt with (local: 0 = pending, 1 = dealt with).
track_17 = pd.DataFrame(columns=['date', 'local'])
track_17['date'] = dates_17
if not os.path.exists('track_17.csv'):  # first run only: create the tracking file
    track_17['local'] = 0
    track_17.to_csv('track_17.csv', index=False)

In [103]:
def pitching_stats(date, lookback=30):
    """Fetch pitching stats for the window leading up to ``date``.

    ``str(date)`` must be in ``YYYYMMDD`` form.  ``pitching_stats_range`` is
    called with two ``YYYY-MM-DD`` strings: the day ``lookback`` days before
    ``date`` and the day before ``date``.

    Returns whatever ``pitching_stats_range`` returns.  If that call raises
    ``IndexError``, a message is printed and ``None`` is returned.
    """
    iso = '%Y-%m-%d'
    target = datetime.strptime(str(date), '%Y%m%d')
    window_start = (target - timedelta(days=lookback)).strftime(iso)
    window_end = (target - timedelta(days=1)).strftime(iso)
    try:
        return pitching_stats_range(window_start, window_end)
    except IndexError:
        print('No pitching stats within range!')
        return None

# Reload the tracker from disk so flags set by an earlier run are picked up.
track_17 = pd.read_csv('track_17.csv')

def localize(track=track_17, start=0, end=500, timeout=2.0,
             track_path='track_17.csv', out_dir='pitch'):
    """Download and save pitching stats for every date not yet dealt with.

    Walks the rows of ``track`` selected by ``[start:end]`` (row positions,
    not index labels).  For each row whose second column is 0, the stats for
    the date in the first column are fetched with ``pitching_stats`` and
    written to ``<out_dir>/<date>.csv``; the row is then flagged 1 and the
    tracker is saved to ``track_path``, so an interrupted run can be resumed.

    Parameters
    ----------
    track : DataFrame
        Date in column 0, 0/1 flag in column 1.  Modified in place.
    start, end : int
        Slice bounds over the row positions of ``track``.
    timeout : float
        Seconds to sleep after each request.
    track_path : str
        CSV file the tracker is written to after each saved date.
    out_dir : str
        Directory that receives the per-date CSV files; created if missing.

    A date for which ``pitching_stats`` returns ``None`` is reported as
    skipped and keeps its 0 flag, so a later run tries it again.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Iterate positions so that iloc is correct whatever index `track` carries.
    for pos in range(len(track))[start:end]:
        rdate = track.iloc[pos, 0]
        if track.iloc[pos, 1] != 0:
            continue  # already flagged by an earlier run

        pitch_stats = pitching_stats(rdate)
        time.sleep(timeout)  # pause after every request, successful or not
        if pitch_stats is None:
            # pitching_stats found nothing; do not flag the date.
            print('{} Skipped!'.format(str(rdate)))
            continue

        pitch_stats.to_csv(path_or_buf='{}/{}.csv'.format(out_dir, str(rdate)), index=False)
        track.iloc[pos, 1] = 1
        # Persist progress only after the data file has been written.
        track.to_csv(path_or_buf=track_path, index=False)
        print('{} Done!'.format(str(rdate)))
    print('All Done!')

In [104]:
# Run the download loop with its defaults (the 2017 tracker, rows 0-500, 2 s pause).
# NOTE: localize is also defined for 2018 above and 2016 below; re-run the
# definition cell directly above before re-running this one.
localize()

20170501 Done!
20170502 Done!
20170503 Done!
20170504 Done!
20170505 Done!
20170506 Done!
20170507 Done!
20170508 Done!
20170509 Done!
20170510 Done!
20170511 Done!
20170512 Done!
20170513 Done!
20170514 Done!
20170515 Done!
20170516 Done!
20170517 Done!
20170518 Done!
20170519 Done!
20170520 Done!
20170521 Done!
20170522 Done!
20170523 Done!
20170524 Done!
20170525 Done!
20170526 Done!
20170527 Done!
20170528 Done!
20170529 Done!
20170530 Done!
20170531 Done!
20170601 Done!
20170602 Done!
20170603 Done!
20170604 Done!
20170605 Done!
20170606 Done!
20170607 Done!
20170608 Done!
20170609 Done!
20170610 Done!
20170611 Done!
20170612 Done!
20170613 Done!
20170614 Done!
20170615 Done!
20170616 Done!
20170617 Done!
20170618 Done!
20170619 Done!
20170620 Done!
20170621 Done!
20170622 Done!
20170623 Done!
20170624 Done!
20170625 Done!
20170626 Done!
20170627 Done!
20170628 Done!
20170629 Done!
20170630 Done!
20170701 Done!
20170702 Done!
20170703 Done!
20170704 Done!
20170705 Done!
20170706 D

## Finally, deal with 2016

In [107]:
# The 2016 game log has no header row, so its columns are addressed by integer position.
bs_2016 = pd.read_csv(filepath_or_buffer='data/GL2016.TXT', header=None)
# Keep games that were completed the same day (column 13 empty), take their
# date (column 0), apply the 2016 cutoff and reduce to distinct values.
dates_16 = bs_2016[0][bs_2016[13].isna()]
dates_16 = dates_16.loc[dates_16.ge(cutoff_16)].unique()

# Tracker of which dates have been dealt with (local: 0 = pending, 1 = dealt with).
track_16 = pd.DataFrame(columns=['date', 'local'])
track_16['date'] = dates_16
if not os.path.exists('track_16.csv'):  # first run only: create the tracking file
    track_16['local'] = 0
    track_16.to_csv('track_16.csv', index=False)

In [108]:
def pitching_stats(date, lookback=30):
    """Fetch pitching stats for the window leading up to ``date``.

    ``str(date)`` must be in ``YYYYMMDD`` form.  ``pitching_stats_range`` is
    called with two ``YYYY-MM-DD`` strings: the day ``lookback`` days before
    ``date`` and the day before ``date``.

    Returns whatever ``pitching_stats_range`` returns.  If that call raises
    ``IndexError``, a message is printed and ``None`` is returned.
    """
    iso = '%Y-%m-%d'
    target = datetime.strptime(str(date), '%Y%m%d')
    window_start = (target - timedelta(days=lookback)).strftime(iso)
    window_end = (target - timedelta(days=1)).strftime(iso)
    try:
        return pitching_stats_range(window_start, window_end)
    except IndexError:
        print('No pitching stats within range!')
        return None

# Reload the tracker from disk so flags set by an earlier run are picked up.
track_16 = pd.read_csv('track_16.csv')

def localize(track=track_16, start=0, end=500, timeout=2.0,
             track_path='track_16.csv', out_dir='pitch'):
    """Download and save pitching stats for every date not yet dealt with.

    Walks the rows of ``track`` selected by ``[start:end]`` (row positions,
    not index labels).  For each row whose second column is 0, the stats for
    the date in the first column are fetched with ``pitching_stats`` and
    written to ``<out_dir>/<date>.csv``; the row is then flagged 1 and the
    tracker is saved to ``track_path``, so an interrupted run can be resumed.

    Parameters
    ----------
    track : DataFrame
        Date in column 0, 0/1 flag in column 1.  Modified in place.
    start, end : int
        Slice bounds over the row positions of ``track``.
    timeout : float
        Seconds to sleep after each request.
    track_path : str
        CSV file the tracker is written to after each saved date.
    out_dir : str
        Directory that receives the per-date CSV files; created if missing.

    A date for which ``pitching_stats`` returns ``None`` is reported as
    skipped and keeps its 0 flag, so a later run tries it again.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Iterate positions so that iloc is correct whatever index `track` carries.
    for pos in range(len(track))[start:end]:
        rdate = track.iloc[pos, 0]
        if track.iloc[pos, 1] != 0:
            continue  # already flagged by an earlier run

        pitch_stats = pitching_stats(rdate)
        time.sleep(timeout)  # pause after every request, successful or not
        if pitch_stats is None:
            # pitching_stats found nothing; do not flag the date.
            print('{} Skipped!'.format(str(rdate)))
            continue

        pitch_stats.to_csv(path_or_buf='{}/{}.csv'.format(out_dir, str(rdate)), index=False)
        track.iloc[pos, 1] = 1
        # Persist progress only after the data file has been written.
        track.to_csv(path_or_buf=track_path, index=False)
        print('{} Done!'.format(str(rdate)))
    print('All Done!')

In [109]:
# Run the download loop with its defaults (the 2016 tracker, rows 0-500, 2 s pause).
# NOTE: localize is also defined for 2018 and 2017 above; re-run the
# definition cell directly above before re-running this one.
localize()

20160501 Done!
20160502 Done!
20160503 Done!
20160504 Done!
20160505 Done!
20160506 Done!
20160507 Done!
20160508 Done!
20160509 Done!
20160510 Done!
20160511 Done!
20160512 Done!
20160513 Done!
20160514 Done!
20160515 Done!
20160516 Done!
20160517 Done!
20160518 Done!
20160519 Done!
20160520 Done!
20160521 Done!
20160522 Done!
20160523 Done!
20160524 Done!
20160525 Done!
20160526 Done!
20160527 Done!
20160528 Done!
20160529 Done!
20160530 Done!
20160531 Done!
20160601 Done!
20160602 Done!
20160603 Done!
20160604 Done!
20160605 Done!
20160606 Done!
20160607 Done!
20160608 Done!
20160609 Done!
20160610 Done!
20160611 Done!
20160612 Done!
20160613 Done!
20160614 Done!
20160615 Done!
20160616 Done!
20160617 Done!
20160618 Done!
20160619 Done!
20160620 Done!
20160621 Done!
20160622 Done!
20160623 Done!
20160624 Done!
20160625 Done!
20160626 Done!
20160627 Done!
20160628 Done!
20160629 Done!
20160630 Done!
20160701 Done!
20160702 Done!
20160703 Done!
20160704 Done!
20160705 Done!
20160706 D