In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from haversine import haversine, Unit
import itertools
import datetime
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)

### Read in DFs and Clean

In [2]:
# Track table whose lat/long columns (`lat_Google_Maps`, `long_Google_Maps`) were taken from Google Maps.
# The file name suggests NASCAR/ARCA tracks for 1980-2020 -- TODO confirm coverage.
tracks_df = pd.read_excel('../input_data/Nascar -Arca _1980_2020_updated.xlsx')

In [3]:
# Daily lead readings per monitor; later cells read its `latitude`, `longitude`, `date1`,
# `monitorID` and `Pb_mean` columns.
daily_lead_df = pd.read_csv('../input_data/daily_lead_80_20.csv')

In [4]:
# Pair each reading's coordinates into a (latitude, longitude) tuple.
daily_lead_df['point'] = list(zip(daily_lead_df['latitude'], daily_lead_df['longitude']))

In [5]:
# Parse the raw `date1` column into datetimes so readings can be matched against race dates.
daily_lead_df['date'] = pd.to_datetime(daily_lead_df['date1'])

In [6]:
# Monitor table providing the `point` and `monitorID` columns used for distance matching.
# NOTE(review): this pickle lives outside this project's input_data folder (../../21_6_7/...), so a
# fresh checkout cannot reproduce the notebook without it. Unpickling can execute arbitrary code --
# only load this file if its origin is trusted.
filt_lead_df = pd.read_pickle("../../21_6_7/notebooks/daily_lead_unique.pkl")

In [7]:
# Pair each track's Google Maps coordinates into a (latitude, longitude) tuple.
tracks_df['point'] = list(zip(tracks_df['lat_Google_Maps'], tracks_df['long_Google_Maps']))

In [8]:
# Independent copies of the track table, one per distance analysis, so that columns added
# for one analysis do not leak into the other.
tracks_df_4km = tracks_df.copy()
tracks_df_50mi = tracks_df.copy()

### Find All Stations Within X km of Tracks

In [9]:
# Parallel lists: monitor_points[i] is the coordinate of the monitor whose ID is monitor_ids[i].
# `pass_dist_thresh` reads both as module-level globals.
monitor_points = list(filt_lead_df['point'])
monitor_ids = list(filt_lead_df["monitorID"])

In [10]:
def get_stations_within_thresh(point, points, monitor_ids, distance_thresh_min, distance_thresh_max, dist_unit):
    """Return the monitors whose distance from `point` lies in a closed distance window.

    Parameters
    ----------
    point : tuple
        (latitude, longitude) of the reference location (a track).
    points : sequence of tuple
        (latitude, longitude) of every candidate monitor.
    monitor_ids : sequence
        Monitor identifiers, positionally aligned with `points`.
    distance_thresh_min, distance_thresh_max : number
        Inclusive lower/upper bounds of the window, in `dist_unit`.
    dist_unit : str
        'km' for kilometers or 'mi' for miles.

    Returns
    -------
    tuple of three lists
        Coordinates, IDs and distances (rounded to 4 decimals) of the monitors
        inside the window, in the same order as `points`.

    Raises
    ------
    ValueError
        If `dist_unit` is neither 'km' nor 'mi'.
    """
    # Choose the distance function once instead of duplicating the loop per unit.
    if dist_unit == 'km':
        def measure(other):
            return haversine(point, other)
    elif dist_unit == 'mi':
        def measure(other):
            return haversine(point, other, unit=Unit.MILES)
    else:
        raise ValueError('wrong dist unit')

    # Distance from the reference point to every monitor.
    rounded = [round(measure(other), 4) for other in points]

    kept_points = []
    kept_ids = []
    kept_dists = []
    for position, value in enumerate(rounded):
        if distance_thresh_min <= value <= distance_thresh_max:
            kept_points.append(points[position])
            kept_ids.append(monitor_ids[position])
            kept_dists.append(value)

    return kept_points, kept_ids, kept_dists

In [11]:
def pass_dist_thresh(df, distance_thresh_min, distance_thresh_max, dist_unit):
    """Attach to each track the monitors lying inside a distance window.

    Relies on the module-level `monitor_points` / `monitor_ids` lists and on
    `get_stations_within_thresh`.

    Parameters
    ----------
    df : pandas.DataFrame
        Track table with a `point` column of (latitude, longitude) tuples.
        The input is not modified.
    distance_thresh_min, distance_thresh_max : number
        Inclusive window bounds, in `dist_unit`.
    dist_unit : str
        'km' or 'mi'; forwarded to `get_stations_within_thresh`.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` restricted to tracks with at least one monitor in the
        window, with added columns: `points_ids_dist`, `points_within_<label>`,
        `ids`, `distances`, `num_stations`, and mean/std/max/min of the
        distances suffixed with the unit.

    Notes
    -----
    NOTE(review): `<label>` is only defined for an upper bound of 50, 6 or 4;
    every other window yields a column literally named `points_within_`.
    The label also ignores the lower bound and the unit. Kept as-is so the
    returned column names do not change.
    """
    known_labels = {50: '50mi', 6: '4-6km', 4: '4km'}
    within_col = f"points_within_{known_labels.get(distance_thresh_max, '')}"

    out = df.copy()
    out['points_ids_dist'] = [
        get_stations_within_thresh(track_point, monitor_points, monitor_ids,
                                   distance_thresh_min, distance_thresh_max, dist_unit)
        for track_point in out['point']
    ]

    # Unpack the (points, ids, distances) triple into one column each.
    out[within_col] = out['points_ids_dist'].apply(lambda triple: triple[0])
    out['ids'] = out['points_ids_dist'].apply(lambda triple: triple[1])
    out['distances'] = out['points_ids_dist'].apply(lambda triple: triple[2])
    out['num_stations'] = [len(found) for found in out[within_col]]

    # Tracks without any monitor in the window are dropped before summarising.
    out = out.query("num_stations != 0").copy()

    # Order matters: it fixes the column order of the result.
    summaries = [
        ('mean', lambda d: round(np.array(d).mean(), 4)),
        # pd.array (not np.array) is what the notebook has always used for the spread.
        ('std', lambda d: round(pd.array(d).std(), 4)),
        ('max_dist', max),
        ('min_dist', min),
    ]
    for stat, summarise in summaries:
        out[f'{stat} ({dist_unit})'] = [summarise(d) for d in out['distances'].values]

    return out

### Find All Stations Within 4 km of Tracks

In [12]:
# Tracks that have at least one monitor within 0-4 km; the raw result tuple column is dropped.
stations_4km_df = (
    pass_dist_thresh(tracks_df_4km, 0, 4, 'km')
    .drop(columns=['points_ids_dist'])
    .reset_index(drop=True)
    .copy()
)

### Get Avg Pb Readings Pre and Post Races With Distance From Each Monitor to Track (km)

In [13]:
# Race schedule table; later cells use its `Date` column and its first (unnamed) column as the track name.
all_race_dates = pd.read_excel('../input_data/race_dates_4_6.xlsx')

In [14]:
# errors='coerce' turns unparseable dates into NaT instead of raising.
all_race_dates['Date'] = pd.to_datetime(all_race_dates['Date'], errors='coerce')

In [15]:
# The spreadsheet's first column has no header; it is used as the track name from here on.
all_race_dates = all_race_dates.rename(columns = {'Unnamed: 0' : 'Track'})

In [16]:
# Track names as spelled in the race-dates spreadsheet.
# NOTE(review): not referenced by any other cell visible in this notebook -- confirm it is still needed.
track_names_04 = ['Autoclub Speedway, CA', 'Chicago Motor Speedway', 'Indiana State Fairgrounds (ARCA)',
'Indianapolis Motor Speedway', 'Kansas Speedway', 'Nazareth','Bristol Speedway',
"Nashville Int'l Raceway/Nashville Fairgrounds Speedway", 'Richmond']

In [17]:
# The three tracks analysed below, spelled as in the race-dates spreadsheet's `Track` column.
track_names_4km = ['Autoclub Speedway, CA', 'Bristol Speedway', 'Richmond']

In [18]:
def get_tracks(track_names, all_race_dates):
    """Split the race schedule into one DataFrame per requested track.

    Parameters
    ----------
    track_names : iterable of str
        Names to look up in the `Track` column (exact match).
    all_race_dates : pandas.DataFrame
        Race schedule with a `Track` column.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of `track_names`, in the same order, each with a
        fresh 0..n-1 index. A name without any race yields an empty frame.
    """
    return [
        all_race_dates.loc[all_race_dates['Track'] == track].reset_index(drop=True)
        for track in track_names
    ]

In [19]:
def readings_pre_post_race(stations_df, daily_lead_df, check_date, ids):
    """Look up the lead (Pb) readings taken on one date by a track's nearby monitors.

    Parameters
    ----------
    stations_df : pandas.DataFrame
        Single-track frame; the first row's `distances` entry is a sequence
        positionally aligned with `ids`.
    daily_lead_df : pandas.DataFrame
        Daily readings with `date`, `monitorID` and `Pb_mean` columns.
    check_date : datetime-like
        Date to look up (compared for equality against `date`).
    ids : sequence
        Monitor IDs near the track.

    Returns
    -------
    tuple
        `(out_ids, out_distances, pb_lvl)` where `out_ids` / `out_distances`
        list the monitors that reported on `check_date` and their distance to
        the track, and `pb_lvl` is the mean of the per-monitor `Pb_mean`
        averages, rounded to 5 decimals. `(-1, -1, -1)` when no monitor
        reported on that date.

    Notes
    -----
    Earlier revisions overwrote `pb_lvl` on every loop iteration, so with
    several reporting monitors only the last one's value was returned even
    though all of their ids and distances were reported. Every reporting
    monitor now contributes equally; with a single reporting monitor the
    result is unchanged.
    """
    readings_on_date = daily_lead_df.loc[daily_lead_df['date'] == check_date]

    out_ids = []
    out_distances = []
    monitor_means = []

    # enumerate() keeps the distance aligned with this exact position in `ids`;
    # ids.index(ID) would return the first occurrence if an ID were repeated.
    for idx, ID in enumerate(ids):
        lead_id_df = readings_on_date[readings_on_date['monitorID'] == ID]
        if lead_id_df.empty:
            continue

        out_ids.append(ID)
        out_distances.append(stations_df['distances'].iloc[0][idx])
        # A monitor may have several rows for one date; collapse them to one value.
        monitor_means.append(lead_id_df['Pb_mean'].mean())

    if not out_ids:
        # Sentinel understood by get_each_race.
        return -1, -1, -1

    pb_lvl = round(sum(monitor_means) / len(monitor_means), 5)
    return out_ids, out_distances, pb_lvl

In [25]:
def _collect_window_readings(stations_df, daily_lead_df, start_date, step, station_ids, n_checks=10):
    """Probe `n_checks` dates (start_date, start_date + step, ...) and gather the readings found."""
    dates = []
    ids_seen = set()
    dists_seen = set()
    pb_levels = []

    check_date = start_date
    for _ in range(n_checks):
        ids, distances, means = readings_pre_post_race(stations_df, daily_lead_df, check_date, station_ids)
        if ids != -1:  # -1 is the "no reading on this date" sentinel
            ids_seen.update(ids)
            dists_seen.update(distances)
            pb_levels.append(means)
            dates.append(check_date.strftime('%Y-%m-%d'))
        check_date = check_date + step

    return dates, ids_seen, dists_seen, pb_levels


def get_each_race(race_dates_dfs, stations_df, daily_lead_df):
    """Attach pre- and post-race lead readings to every race of every track.

    Parameters
    ----------
    race_dates_dfs : pandas.DataFrame
        Race schedule with `Track` and `Date` columns (a single frame, despite
        the plural name; it is grouped by `Track` here).
    stations_df : pandas.DataFrame
        One row per track with `track name`, `ids` and `distances` columns.
    daily_lead_df : pandas.DataFrame
        Daily readings, passed through to `readings_pre_post_race`.

    Returns
    -------
    pandas.DataFrame
        The race rows of all tracks found in `stations_df`. Races for which a
        pre-race reading exists get the columns `pre_dates`, `pre_ids`,
        `pre_distances_km`, `pre_pb_readings`, `post_pb_readings`,
        `post_dates`, `post_ids`, `post_distances_km`; other races are kept
        without them (NaN after concatenation). An empty DataFrame is
        returned when no track matched (previously `pd.concat` raised).

    Notes
    -----
    Pre-race dates probed: race - 7 days, then 9 further steps of 6 days back.
    Post-race dates probed: race day, then 9 further steps of 6 days forward.
    Tracks are matched by the first four characters of the schedule's track
    name appearing literally in `track name`.
    """
    race_df_list = []

    days = datetime.timedelta(days=6)
    one_week = datetime.timedelta(days=7)

    for name, race_df in race_dates_dfs.groupby("Track"):
        # Literal (non-regex) prefix match; na=False keeps missing track names
        # from poisoning the boolean mask.
        is_this_track = stations_df["track name"].str.contains(name[:4], regex=False, na=False)
        filt_stations_df = stations_df[is_this_track].copy()

        if len(filt_stations_df) == 0:
            # No monitor inside the distance window for this track.
            continue

        assert len(filt_stations_df) == 1, f"prefix {name[:4]!r} matched more than one track"

        station_ids = filt_stations_df["ids"].iloc[0]

        for race_date in set(race_df["Date"]):
            filt_race_date_df = race_df[race_df["Date"] == race_date].copy()
            n_rows = len(filt_race_date_df)

            pre_dates = []
            if n_rows > 0:
                pre_dates, pre_ids, pre_dist, pre_pb = _collect_window_readings(
                    filt_stations_df, daily_lead_df, race_date - one_week, -days, station_ids)

            # Post-race columns are only attached to races that have pre-race data,
            # so the post scan is skipped otherwise.
            if n_rows > 0 and len(pre_dates) > 0:
                post_dates, post_ids, post_dist, post_pb = _collect_window_readings(
                    filt_stations_df, daily_lead_df, race_date, days, station_ids)

                # Assignment order fixes the output column order.
                filt_race_date_df['pre_dates'] = [pre_dates] * n_rows
                filt_race_date_df["pre_ids"] = [pre_ids] * n_rows
                filt_race_date_df["pre_distances_km"] = [pre_dist] * n_rows
                filt_race_date_df["pre_pb_readings"] = [pre_pb] * n_rows
                filt_race_date_df["post_pb_readings"] = [post_pb] * n_rows
                filt_race_date_df["post_dates"] = [post_dates] * n_rows
                filt_race_date_df["post_ids"] = [post_ids] * n_rows
                filt_race_date_df["post_distances_km"] = [post_dist] * n_rows

            race_df_list.append(filt_race_date_df)

    if not race_df_list:
        # pd.concat([]) raises ValueError; hand back an empty frame instead.
        return pd.DataFrame()

    out_df = pd.concat(race_df_list)

    return out_df

### Recalculate Post-lead means for: 0-4K, 0-5K, 0-6K, 0-7K, 0-8K, 0-9K, 0-10K, 10K - max for our 3 tracks

In [26]:
# Restrict the track table to the three tracks under study.
is_autoclub = tracks_df_4km["track name"].str.contains("Auto Club Speedway")
is_bristol = tracks_df_4km["track name"].str.contains("Bristol")
is_richmond = tracks_df_4km["track name"].str.contains("Richmond")
filt_stations_4km_df = (
    tracks_df_4km[is_autoclub | is_bristol | is_richmond]
    .reset_index(drop=True)
    .copy()
)

race_dates_df = pd.concat(get_tracks(track_names_4km, all_race_dates)).copy()

all_dfs = {}
all_df_list = []

# Distance windows in km: 1 km rings 0-1 ... 9-10, cumulative discs 0-4 ... 0-10, and 10-80.
distances = (
    [[lo, lo + 1] for lo in range(10)]
    + [[0, hi] for hi in range(4, 11)]
    + [[10, 80]]
)

for dist in tqdm(distances):
    dist_min, dist_max = dist

    monitors_in_dist_df = (
        pass_dist_thresh(filt_stations_4km_df, dist_min, dist_max, 'km')
        .drop(columns=['points_ids_dist'])
        .reset_index(drop=True)
        .copy()
    )

    # Nothing to compute when no track has a monitor inside this window.
    if monitors_in_dist_df.empty:
        continue

    final_df = get_each_race(race_dates_df, monitors_in_dist_df, daily_lead_df)
    final_df["distance_window"] = [dist] * len(final_df)

    all_df_list.append(final_df)
    all_dfs[repr(dist)] = final_df

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [27]:
# Stack the per-window results, drop races lacking any value (e.g. no pre/post readings),
# and store the window as its string form (e.g. "[0, 4]") so it can be used as a group key.
all_df = (
    pd.concat(all_df_list)
    .dropna()
    .reset_index(drop=True)
    .assign(distance_window=lambda frame: frame["distance_window"].map(repr))
)

In [28]:
def filt_for_date(df, year1, year2):
    """Keep one row per race date between Jan 1 of `year1` and Dec 31 of `year2`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime `Date` column.
    year1, year2 : int
        First and last calendar year to keep (both inclusive).

    Returns
    -------
    pandas.DataFrame
        Matching rows sorted by `Date`, de-duplicated on `Date` (first row
        kept) and re-indexed from 0. When nothing matches, a message is
        printed and a completely empty DataFrame (no columns) is returned.

    Notes
    -----
    An earlier revision asserted `len(result) < len(df)`, which raised
    AssertionError for perfectly valid input in which every row lies inside
    the range and all dates are unique. The result can never be longer than
    the input, so the check has been removed.
    """
    in_window = (df['Date'] >= f"{year1}-1-1") & (df['Date'] <= f"{year2}-12-31")
    filt_df = (df[in_window].sort_values('Date')
                            .drop_duplicates(subset='Date')
                            .reset_index(drop=True))

    if len(filt_df) == 0:
        print("Empty DF filt for date", year1, year2)
        return pd.DataFrame()

    return filt_df

### Results

In [30]:
### note:
##### parentheses represent days before / after
##### brackets represent distance window

### All dfs to csv

In [31]:
# Write one CSV per (year range, distance window) combination.
for year1, year2 in [(1990,2006), (2008, 2015)]:
    # Group by the column label itself rather than a one-element list: with a list key,
    # pandas >= 2.0 yields 1-tuples such as ('[0, 4]',) as group names, which would end up
    # in the output file names. A scalar key gives the plain string on every pandas version.
    for name, df in all_df.groupby("distance_window"):

        filt_df = filt_for_date(df, year1, year2)
        if len(filt_df)==0:
            # Report which distance window produced no rows for this year range.
            print(name)
            continue

        out_name = f"{year1}-{year2}_pre_post_race_lead_{name}.csv"
        # NOTE: the target directory must already exist; to_csv does not create it.
        out_path = "pre_post_race_lead-2021-08-10/" + out_name

        # Keep only races with at least one post-race reading. Only the one column is
        # stringified instead of casting the whole frame.
        has_post_readings = filt_df['post_pb_readings'].astype(str) != '[]'
        all_df_final = filt_df[has_post_readings]

        # Drop race-result details that are irrelevant to the lead analysis.
        filt_df_final = (all_df_final.drop(columns=['Cars', 'Winner(s)', 'St', 'Make / Model', 'Len', 'Sfc', 'Miles',
                                              'Purse', 'Pole', 'Cau', 'Laps', 'Speed', 'LC'])).reset_index(drop=True)

        filt_df_final.to_csv(out_path)

Empty DF filt for date 1990 2006
[2, 3]
Empty DF filt for date 2008 2015
[2, 3]
