In [1]:
import pandas as pd
import numpy as np
import heapq
from haversine import haversine, Unit
pd.set_option('display.max_columns', None)

### Read and Clean Data

In [2]:
# Load the track table from the Excel workbook; later cells read its
# 'lat' and 'long' columns. The path is relative to the working directory.
tracks_df = pd.read_excel('../input_data/Nascar -Arca _1980_2020_A.xlsx')

In [3]:
# Load the monitor records CSV (decoded as latin1); later cells read its
# 'date1', 'monitorID', 'latitude' and 'longitude' columns.
daily_lead_df = pd.read_csv('../input_data/daily_lead_80_20.csv', encoding = 'latin1')

In [4]:
# Parse the raw date strings into a proper datetime column on the full table.
daily_lead_df['date'] = pd.to_datetime(daily_lead_df['date1'])

# One row per monitor (first occurrence kept) is enough to know where each
# monitor is located.
filt_lead_df = daily_lead_df.drop_duplicates(subset=["monitorID"]).copy()
for coord in ('latitude', 'longitude'):
    filt_lead_df[coord] = filt_lead_df[coord].astype(float)

# (latitude, longitude) tuples in the order haversine() is called with below.
filt_lead_df['point'] = list(zip(filt_lead_df['latitude'], filt_lead_df['longitude']))

In [5]:
# Two longitude cells contain a zero-width space (U+200B) right after the minus
# sign, so they are strings that float() cannot parse. Map each one to its
# numeric value, keeping the minus sign that is present in the source text
# (the second entry previously lost it and flipped the hemisphere).
# The result is assigned back to the column rather than using inplace=True on
# a column selection, which can operate on a temporary object and leave
# tracks_df unchanged.
tracks_df["long"] = tracks_df["long"].replace(
    {"-\u200b110.790919": -110.790919, "-\u200b95.686195": -95.686195}
)

In [6]:
# Force both coordinate columns to float, then pair them up as (lat, long).
for coord in ('lat', 'long'):
    tracks_df[coord] = tracks_df[coord].astype(float)
tracks_df['point'] = list(zip(tracks_df['lat'], tracks_df['long']))

### Find Stations within 4 km

In [7]:
def get_stations_within_thresh(point, points, monitor_ids, distance_thresh=4):
    """Return the monitors lying within ``distance_thresh`` of ``point``.

    Parameters
    ----------
    point : tuple
        Reference coordinate, passed as the first argument to ``haversine``.
    points : sequence of tuple
        Candidate coordinates, one per monitor.
    monitor_ids : sequence
        Identifiers positionally aligned with ``points``.
    distance_thresh : number, default 4
        Inclusive cut-off, in whatever unit ``haversine`` returns when called
        with its default arguments (kilometres per the haversine package).

    Returns
    -------
    tuple of three lists
        ``(coordinates, ids, distances)`` of the matching monitors, in the
        order they appear in ``points``. Distances are rounded to 4 decimals
        before being compared with the threshold.
    """
    # Distance from the reference point to every candidate, rounded first so
    # the comparison and the reported value agree.
    distances = [round(haversine(point, candidate), 4) for candidate in points]

    matched_points, matched_ids, matched_dist = [], [], []
    for pos, km in enumerate(distances):
        if km <= distance_thresh:
            matched_points.append(points[pos])
            matched_ids.append(monitor_ids[pos])
            matched_dist.append(km)

    return matched_points, matched_ids, matched_dist

In [8]:
# Plain Python lists, positionally aligned: coordinate i belongs to ID i.
monitor_points = filt_lead_df['point'].tolist()
monitor_ids = filt_lead_df["monitorID"].tolist()

In [9]:
distance_thresh = 4

# For every track, collect the (coordinates, ids, distances) triple of the
# monitors that fall inside the threshold.
_track_matches = []
for track_point in tracks_df['point']:
    _track_matches.append(
        get_stations_within_thresh(track_point, monitor_points, monitor_ids, distance_thresh)
    )
tracks_df['points_ids_dist'] = _track_matches

In [10]:
# Split the triple stored in 'points_ids_dist' into one list-valued column
# per component: position 0 = coordinates, 1 = monitor IDs, 2 = distances.
for position, column in enumerate(["points_within_4km", "ids_within_4km", "distances (km)"]):
    tracks_df[column] = tracks_df["points_ids_dist"].apply(
        lambda triple, position=position: triple[position]
    )

In [11]:
# Keep only the tracks that have at least one monitor inside the threshold.
stations_within_4km = (
    tracks_df
    .loc[tracks_df['ids_within_4km'].map(len) > 0]
    .reset_index(drop=True)
    .copy()
)

In [12]:
# The combined triple column is no longer needed once it has been unpacked.
stations_within_4km_df = stations_within_4km.drop(columns=['points_ids_dist']).copy()

In [13]:
# Independent copy of the table while it still holds the list-valued columns,
# taken before the closest_* columns are added. It is not referenced again in
# the visible part of the notebook.
stations_within_4km_list = stations_within_4km_df.copy()

In [14]:
# Column names for up to five matched monitor coordinates per track.
point_cols = [f"closest_point_{slot}" for slot in range(1, 6)]

In [15]:
# Spread each track's list of coordinates over the closest_point_* columns;
# shorter lists are padded with missing values by the DataFrame constructor.
# The assignment requires the longest list to have exactly len(point_cols) items.
stations_within_4km_df[point_cols] = pd.DataFrame(
    list(stations_within_4km_df["points_within_4km"])
)

In [16]:
# Column names for up to five matched monitor IDs per track.
id_cols = [f"closest_id_{slot}" for slot in range(1, 6)]

In [17]:
# Spread each track's list of monitor IDs over the closest_id_* columns;
# shorter lists are padded with missing values by the DataFrame constructor.
stations_within_4km_df[id_cols] = pd.DataFrame(
    list(stations_within_4km_df["ids_within_4km"])
)

In [18]:
# Column names for up to five matched monitor distances per track.
dist_cols = [f"closest_dist_{slot}" for slot in range(1, 6)]

In [19]:
# Spread each track's list of distances over the closest_dist_* columns;
# shorter lists are padded with missing values by the DataFrame constructor.
stations_within_4km_df[dist_cols] = pd.DataFrame(
    list(stations_within_4km_df["distances (km)"])
)

In [20]:
# Display only: the result is not assigned, so the list-valued columns remain
# in stations_within_4km_df itself.
stations_within_4km_df.drop(columns=['points_within_4km', 'ids_within_4km', 'distances (km)'])

Unnamed: 0,track name,H_R,City,State,lat,long,NASCAR_truck,NASCAR,ARCA,point,closest_point_1,closest_point_2,closest_point_3,closest_point_4,closest_point_5,closest_id_1,closest_id_2,closest_id_3,closest_id_4,closest_id_5,closest_dist_1,closest_dist_2,closest_dist_3,closest_dist_4,closest_dist_5
0,Auto Club Speedway - California Speedway,1,Fontana,CA,34.088243,-117.499426,1,1,0,"(34.088243, -117.499426)","(34.106121, -117.48199)","(34.100021000000005, -117.49201000000001)","(34.093979, -117.52725)",,,2235,2154.0,2092.0,,,2.5553,1.477,2.6404,,
1,Chicago Motor Speedway,1,Cicero,IL,41.824997,-87.742497,1,0,0,"(41.8249967, -87.74249703)","(41.859753000000005, -87.750336)",,,,,2378,,,,,3.9189,,,,
2,Chicagoland Speedway,1,Joliet,IL,41.474411,-88.058763,1,1,1,"(41.474411, -88.058763)","(41.505032, -88.06867199999999)",,,,,693,,,,,3.5035,,,,
3,Indiana State Fairgrounds,1,Indianapolis,IN,39.829722,-86.133889,0,0,1,"(39.8297222, -86.1338889)","(39.802818, -86.120262)","(39.810833, -86.114441)",,,,1324,1524.0,,,,3.21,2.6777,,,
4,Indianapolis Motor Speedway,1,Speedway,IN,39.796248,-86.234816,0,1,0,"(39.796248, -86.234816)","(39.771709, -86.214157)",,,,,1332,,,,,3.2499,,,,
5,Kansas Speedway,0,Kansas City,KS,39.115812,-94.830604,1,1,0,"(39.115812, -94.830604)","(39.118336, -94.798576)",,,,,937,,,,,2.7773,,,,
6,Portland International Raceway,0,Portland,OR,45.591498,-122.690831,1,0,0,"(45.591497634, -122.69083057)","(45.561371, -122.6679)",,,,,1639,,,,,3.7957,,,,
7,"Portland Speedway, Rose City Speeeway",0,Portland,OR,45.5378,-122.6059,1,0,0,"(45.5378, -122.6059)","(45.527306, -122.60951999999999)","(45.523140000000005, -122.64674)",,,,1534,1096.0,,,,1.2005,3.5746,,,
8,Rambi Raceway,1,Myrtle Beach,SC,33.68906,-78.88669,0,1,0,"(33.68906, -78.88669)","(33.7094, -78.877449)","(33.70277, -78.87748)",,,,145,301.0,,,,2.4179,1.7464,,,
9,Bristol Motor Speedway - Bristol International...,1,Bristol,TN,36.515699,-82.257008,1,1,1,"(36.515699, -82.257008)","(36.524723, -82.268059)","(36.525555, -82.273331)","(36.524723, -82.268059)","(36.528057000000004, -82.268333)","(36.524433, -82.272614)",2338,2261.0,2279.0,2286.0,2290.0,1.4079,1.8245,1.4079,1.7066,1.6994


In [21]:
# Reshape wide -> long: emit one record per (track, slot) pair, carrying the
# track attributes plus the slot's coordinate, monitor ID and distance.
# Slots a track does not use come through as missing values.
_track_fields = ["track name", 'H_R', 'City', 'State', 'lat', 'long', 'NASCAR_truck', 'NASCAR', 'ARCA']
long = [
    row[_track_fields + [f"closest_point_{slot}", f"closest_id_{slot}", f"closest_dist_{slot}"]]
    .values.flatten().tolist()
    for _, row in stations_within_4km_df.iterrows()
    for slot in range(1, 6)
]

In [22]:
# Header for the long-format records: track attributes first, then the three
# per-monitor fields.
df_cols = (
    ["track name", 'H_R', 'City', 'State', 'lat', 'long', 'NASCAR_truck', 'NASCAR', 'ARCA']
    + ['closest_stations', 'closest_ids', 'closest_dist (km)']
)

In [23]:
# Turn the list of records into a table using the header defined above.
stations_4km = pd.DataFrame(data=long, columns=df_cols)

In [24]:
# Discard the padding records, i.e. slots that hold no monitor coordinate.
stations_4km = stations_4km[stations_4km['closest_stations'].notna()]

In [25]:
# Give the three per-monitor columns their final names.
stations_within_4km = stations_4km.rename(
    {
        'closest_stations': 'coordinates_within_4k',
        'closest_ids': 'station_IDs',
        'closest_dist (km)': 'distance_from_track (km)',
    },
    axis='columns',
).copy()

In [26]:
# Store the monitor IDs as 32-bit integers in a fresh copy of the table.
stations_within_4km_df = stations_within_4km.assign(
    station_IDs=lambda frame: frame['station_IDs'].astype('int32')
).copy()

In [27]:
# Renumber the rows 0..n-1 (the old index is discarded), closing the gaps left
# by the rows dropped earlier.
stations_within_4km_df = stations_within_4km_df.reset_index(drop=True).copy()

In [28]:
# Preview the first rows of the long-format table (one row per track/monitor pair).
stations_within_4km_df.head()

Unnamed: 0,track name,H_R,City,State,lat,long,NASCAR_truck,NASCAR,ARCA,coordinates_within_4k,station_IDs,distance_from_track (km)
0,Auto Club Speedway - California Speedway,1,Fontana,CA,34.088243,-117.499426,1,1,0,"(34.106121, -117.48199)",2235,2.5553
1,Auto Club Speedway - California Speedway,1,Fontana,CA,34.088243,-117.499426,1,1,0,"(34.100021000000005, -117.49201000000001)",2154,1.477
2,Auto Club Speedway - California Speedway,1,Fontana,CA,34.088243,-117.499426,1,1,0,"(34.093979, -117.52725)",2092,2.6404
3,Chicago Motor Speedway,1,Cicero,IL,41.824997,-87.742497,1,0,0,"(41.859753000000005, -87.750336)",2378,3.9189
4,Chicagoland Speedway,1,Joliet,IL,41.474411,-88.058763,1,1,1,"(41.505032, -88.06867199999999)",693,3.5035


In [29]:
# Write the long-format table as CSV into the current working directory.
# The row index is written too, since `index` is left at its default.
stations_within_4km_df.to_csv('stations_within_4km_df.csv')

In [30]:
# Also pickle the table into another notebooks folder. The path is relative
# to the working directory, so that folder must already exist.
stations_within_4km_df.to_pickle("../../21_6_14/notebooks/stations_within_4km_df.pkl")

### Exploring station data within 4km of each track

In [31]:
# All monitor IDs that appear in the long-format table, in row order.
id_list = stations_within_4km_df['station_IDs'].tolist()

In [32]:
# Show the collected monitor IDs.
id_list

[2235,
 2154,
 2092,
 2378,
 693,
 1324,
 1524,
 1332,
 937,
 1639,
 1534,
 1096,
 145,
 301,
 2338,
 2261,
 2279,
 2286,
 2290,
 303]

In [33]:
# Pull every record of the selected monitors out of the full table.
# NOTE(review): only 12 of the 20 IDs listed in id_list are selected here;
# confirm that leaving the others out is intentional.
station_data_4km = (
    daily_lead_df
    .loc[daily_lead_df['monitorID'].isin(
        [303, 693, 2235, 2154, 2092, 2338, 2261, 2279, 2290, 2286, 1332, 937]
    )]
    .reset_index(drop=True)
    .copy()
)

In [34]:
# Number of rows per monitor in the filtered data, most frequent first.
station_data_4km['monitorID'].value_counts()

2279    4735
2286    2329
2261    2308
2290    1150
2338     941
303      540
2154     333
2092     261
2235     129
1332      44
693       18
937       16
Name: monitorID, dtype: int64

### How many times does each station record per N years over the 40-year period?

In [35]:
n_years = 1

# Bucket the rows into n_years-wide periods counted from 1980, then count the
# rows per monitor inside each period.
(
    station_data_4km
    .groupby((station_data_4km['year'] - 1980) // n_years)['monitorID']
    .value_counts()
    .rename('count')
    .reset_index()
    .rename(columns={'year': 'time_period'})
)

Unnamed: 0,time_period,monitorID,count
0,0,303,54
1,0,2235,53
2,0,1332,44
3,0,693,12
4,1,303,59
...,...,...,...
115,37,2290,86
116,37,2092,39
117,38,2290,113
118,39,2290,114


### How many times does each station record per day over the 40-year period?

In [36]:
# Rows per monitor for each calendar date, as a flat date / monitorID / count table.
daily_records_df = (
    station_data_4km
    .groupby('date')['monitorID']
    .value_counts()
    .rename('count')
    .reset_index()
)

In [37]:
# Busiest monitor-days first, with rows renumbered from 0.
daily_records_df.sort_values(by='count', ascending=False, ignore_index=True)

Unnamed: 0,date,monitorID,count
0,2009-02-09,2279,4
1,2009-03-29,2279,4
2,2009-03-23,2279,4
3,2009-03-20,2279,4
4,2009-03-17,2279,4
...,...,...,...
9842,2004-05-12,2261,1
9843,1996-06-08,2338,1
9844,2004-05-09,2338,1
9845,2004-05-06,2286,1


In [38]:
# Same ranking, restricted to dates strictly between 2004-01-01 and 2009-08-31.
(
    daily_records_df
    .loc[lambda d: (d['date'] > '2004-01-01') & (d['date'] < '2009-08-31')]
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,date,monitorID,count
0,2009-03-29,2279,4
1,2009-02-03,2279,4
2,2009-01-25,2279,4
3,2009-02-27,2279,4
4,2009-03-02,2279,4
...,...,...,...
2234,2006-04-05,2338,1
2235,2006-04-03,2286,1
2236,2006-04-03,2261,1
2237,2006-03-30,2338,1
