# Target and Feature Generation

## Imports

In [21]:
import numpy as np
import pandas as pd

import geopandas as gpd
from shapely.geometry import Point
import dask_geopandas
import dask.dataframe as dd

import matplotlib.pyplot as plt
import plotly.express as px
from plotly.tools import mpl_to_plotly as ggplotly
from plotnine import *

from geopy.distance import geodesic
from geopy.distance import great_circle
from vincenty import vincenty

from concurrent.futures import ThreadPoolExecutor

In [22]:
def npshift(arr, num, fill_value=np.nan):
    result = np.empty_like(arr)
    if num > 0:
        result[:num] = fill_value
        result[num:] = arr[:-num]
    elif num < 0:
        result[num:] = fill_value
        result[:num] = arr[-num:]
    else:
        result[:] = arr
    return result

def vincenty_robust(start, finish):
    """Multiply units by 1000 to get meters"""
    try:
        return vincenty(tuple(start), tuple(finish))*1000
    except:
        return np.nan

def get_angle_robust(point1, point2):
    try:
        vector_1 = np.array(point1)
        vector_2 = np.array(point2)
        unit_vector_1 = vector_1 / np.linalg.norm(vector_1)
        unit_vector_2 = vector_2 / np.linalg.norm(vector_2)
        dot_product = np.dot(unit_vector_1, unit_vector_2)
        angle = np.arccos(dot_product)
        return angle
    except:
        return np.nan

## Create Target

In [23]:
complete_cols = ['track_id', 'race_date', 'race_number', 'program_number', 'trakus_index', 'latitude', 'longitude', 'distance_id', 'course_type',
                 'track_condition', 'run_up_distance', 'race_type', 'purse', 'post_time', 'weight_carried', 'jockey', 'odds', 'position_at_finish']

complete = pd.read_csv(
    "../data/nyra_2019_complete.csv", 
    header=None,
    names=complete_cols,
    dtype={
        'program_number': str,
        'track_condition': str,
        'race_type':str,
        'post_time':str
        }
                    
    )

complete.head()


Unnamed: 0,track_id,race_date,race_number,program_number,trakus_index,latitude,longitude,distance_id,course_type,track_condition,run_up_distance,race_type,purse,post_time,weight_carried,jockey,odds,position_at_finish
0,AQU,2019-01-01,9,6,72,40.672902,-73.827607,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
1,AQU,2019-01-01,9,6,73,40.672946,-73.827587,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
2,AQU,2019-01-01,9,6,74,40.67299,-73.827568,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
3,AQU,2019-01-01,9,6,63,40.67251,-73.827781,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
4,AQU,2019-01-01,9,6,64,40.672553,-73.827762,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8


In [24]:
complete.dtypes

track_id               object
race_date              object
race_number             int64
program_number         object
trakus_index            int64
latitude              float64
longitude             float64
distance_id             int64
course_type            object
track_condition        object
run_up_distance         int64
race_type              object
purse                 float64
post_time              object
weight_carried          int64
jockey                 object
odds                    int64
position_at_finish      int64
dtype: object

## Create Features

### Create Distances

In [25]:
(complete
    .groupby(
        [
        'track_id',
        'race_date',
        'race_number',
        'program_number'
        ]
    )
    [[
    'track_id',
    'race_date',
    'race_number',
    'program_number'
    ]]
    .ngroup
)

<bound method GroupBy.ngroup of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000016FE2EA9300>>

In [26]:
complete = (    
    complete
    .sort_values(
        [
            'track_id',
            'race_date',
            'race_number',
            'program_number',
            'trakus_index'
        ]
    )
    .reset_index(drop=True)
)


In [27]:
#ddf = dask_geopandas.from_geopandas(complete_geo, npartitions=5)

In [28]:
# latlon_array = np.array(list(zip(complete['latitude'], complete['longitude'])))
# latlon_array_shift = npshift(latlon_array, -1)

In [29]:
complete['latlon'] = np.array(zip(complete['latitude'], complete['longitude']))
complete['latlon'].head()

0    (40.6694007433791, -73.8292054206592)
1    (40.6694051526183, -73.8292030605591)
2    (40.6694113624261, -73.8292000780746)
3    (40.6694207810975, -73.8291957066611)
4    (40.6694325233872, -73.8291904199145)
Name: latlon, dtype: object

In [30]:
complete['latlon_shift'] = (
    complete
    .groupby(
        [
            'track_id',
            'race_date',
            'race_number',
            'program_number'
        ]
    )
    ['latlon']
    .shift(-1)
)

In [31]:
with ThreadPoolExecutor(500) as executor:
    res = executor.map(vincenty_robust, complete['latlon'], complete['latlon_shift'])
complete['segment_distance'] = list(res)

### Create Angles

In [33]:
with ThreadPoolExecutor(500) as executor:
    res = executor.map(get_angle_robust, complete['latlon'], complete['latlon_shift'])
complete['segment_angle'] = list(res)



In [34]:
complete['segment_angle'] = complete['segment_angle'].apply(lambda x: np.nan if type(x) == np.ndarray else x)

### Create Speeds

In [36]:
complete['segment_speed'] = complete['segment_distance']*4

### Create Accelerations

In [47]:
complete['segment_acceleration'] = (
    complete
        .groupby(
            [
            'track_id',
            'race_date',
            'race_number',
            'program_number'
            ]
        )
        ['segment_speed']
        .diff()
)

### Save

In [49]:
complete.to_csv('../data/nyra_2019_complete_saad.csv', index = False)