In [1]:
import numpy as np
import pandas as pd
import os
import gzip
import dill
from os.path import exists as file_exists
import geopandas
from gpx_converter import Converter
from shapely.geometry import LineString, MultiPoint, Point
from shapely.ops import split

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, RobustScaler, OneHotEncoder

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS = STOP_WORDS.union({'ll', 've', 'pron'})

In [3]:
from sklearn import set_config
set_config(display='diagram')
pd.options.mode.chained_assignment = None  # default='warn'

# Motorcycle Road Recommendation Engine

## Data input

In [22]:
# calculates the sinuosity of each route from its gpx file of lat/lon coordinates
def calcluate_sinuosity(gpx_file_num):
    gpx_file = f'../gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_array = Converter(input_file=gpx_file).gpx_to_numpy_array()
        except Exception:
            return -1
        
        splits = 4
        subsets = np.array_split(gpx_array, splits)
        subset_sinuosities = []
        
        for subset in subsets:
            start_pt = subset[0]
            end_pt = subset[-1]
            route = LineString(subset)
            route_SL = LineString((start_pt, end_pt))
            route_sinuosity = route.length / route_SL.length
            subset_sinuosities.append(route_sinuosity)
        return sum(subset_sinuosities)/splits
    else:
        return -2

def get_route_coords(gpx_file_num):
    gpx_file = f'../gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_df = Converter(input_file=gpx_file).gpx_to_dataframe()
#             return gpx_df
            route_line = LineString(list(zip(gpx_df.longitude, gpx_df.latitude)))
            return route_line
        except Exception:
            return None

valid_states = ['Alabama', 'California', 'Georgia', 'Missouri', 'Illinois', 'Ohio',
       'Kentucky', 'Colorado', 'United States', 'Indiana', 'New York',
       'Vermont', 'Texas', 'Florida', 'Minnesota', 'Virginia',
       'Oklahoma', 'Arkansas', 'Maryland', 'West Virginia',
       'Michigan', 'North Carolina', 'Oregon', 'Pennsylvania',
       'Washington', 'New Jersey', 'Alaska',
       'South Carolina', 'Utah', 'New Hampshire', 'Iowa', 'Louisiana',
       'Mississippi', 'Wisconsin',
       'South Dakota', 'Wyoming', 'Massachusetts', 'New Mexico',
       'Montana', 'Idaho', 'Nevada', 'Arizona',
       'Kansas', 'Northeast', 'Southwest', 'Golf Coast', 'Southeast',
       'Tennessee', 'Nebraska', 'Delaware', 'Pacific Coast',
       'Appalachian Mountains', 'Maine', 'Rhode Island', 'Connecticut',
       'North Dakota', 'Hawaii']

from math import radians, cos, sin, asin, sqrt

# Calculates distance between 2 GPS coordinates
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 3956 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [23]:
#read in data
df_raw = pd.read_csv('../data/route_data2.csv')#.drop('files',axis=1)
df_raw['state_prop_rank'] = df_raw.state_rank / df_raw.num_state_routes
df_raw['sinuosity'] = [calcluate_sinuosity(x) for x in df_raw['gpx_file_num']]
df = df_raw.loc[df_raw.sinuosity>=0].reset_index(drop=True)

In [25]:
route_coords = {
    'gpx_file_num': [x for x in df['gpx_file_num']],
    'geometry': [get_route_coords(x) for x in df['gpx_file_num']]
}

route_gdf = geopandas.GeoDataFrame(route_coords, crs='EPSG:2163')
# route_gdf['user_rating'] = df.user_rating
# route_gdf['route_name'] = df.name
# route_gdf['state'] = df.state

In [26]:
route_gdf

Unnamed: 0,gpx_file_num,geometry
0,6652,"LINESTRING (-86.330 34.681, -86.330 34.683, -8..."
1,34749,"LINESTRING (-86.329 34.664, -86.329 34.664, -8..."
2,34750,"LINESTRING (-86.632 32.840, -86.629 32.836, -8..."
3,34751,"LINESTRING (-87.895 34.879, -87.895 34.879, -8..."
4,34752,"LINESTRING (-86.826 33.521, -86.826 33.518, -8..."
...,...,...
2043,67128,"LINESTRING (-120.389 35.656, -120.394 35.656, ..."
2044,67147,"LINESTRING (-100.765 34.103, -100.766 34.113, ..."
2045,67232,"LINESTRING (-122.873 38.608, -122.874 38.609, ..."
2046,67234,"LINESTRING (-122.515 37.945, -122.514 37.945, ..."


In [27]:
#length of route in miles
distances = []
for line in route_gdf.geometry:
    numCoords = len(line.coords) - 1
    distance = 0
    for i in range(0, numCoords):
        point1 = line.coords[i]
        point2 = line.coords[i + 1]
        distance += haversine(point1[0], point1[1], point2[0], point2[1])
    distances.append(distance)
df['route_length'] = distances

In [28]:
#representative point for each route
rep_point = route_gdf.geometry.centroid
route_gdf['rep_point'] = rep_point
df['loc_lat'] = rep_point.y
df['loc_lon'] = rep_point.x

In [29]:
df.head()

Unnamed: 0,gpx_file_num,name,state,route_length,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,...,author,author_points,scenery_description,drive_enjoyment_description,tourism_description,files,state_prop_rank,sinuosity,loc_lat,loc_lon
0,6652,Paint Rock Valley Loop,United States,39.727942,0.0,0,0,0,3,2,...,admin,109,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,,0.625,1.429574,34.726485,-86.198479
1,34749,Paint Rock Valley,Alabama,23.627018,4.0,6,6,3,5,4,...,,0,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,,0.461538,1.137782,34.770065,-86.189154
2,34750,Central Alabama Country Tour - Highway 22,Alabama,87.583394,3.5,3,6,3,4,4,...,,0,Drive through a nice slide of Central Alabama ...,Smooth asphalt the entire way. Maybe one or tw...,Only an occasional store along the way. Roanok...,,0.538462,1.134919,32.94998,-85.993627
3,34751,County Road 14,Tennessee,18.31719,0.0,0,3,0,4,3,...,,0,County Rd 14 west of the Natchez Trace runs al...,"Road quality is fair, though I remember the Al...",Really not much to do off the bike ... Waterlo...,,0.888889,1.155947,34.961208,-88.036771
4,34752,"Alabama's ""Mini-Dragon"" to 29 Dreams",Alabama,27.065368,2.33,9,11,6,4,5,...,,0,On this route you will find several homes and ...,The road is nicely paved and marked and has nu...,Upon getting to the Leeds exit from interstate...,,0.576923,1.581907,33.529228,-86.651468


In [33]:
route_gdf = route_gdf.merge(df, on='gpx_file_num').drop('files',axis=1)

In [56]:
df.head()

Unnamed: 0,gpx_file_num,name,state,route_length,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,...,author,author_points,scenery_description,drive_enjoyment_description,tourism_description,files,state_prop_rank,sinuosity,loc_lat,loc_lon
0,6652,Paint Rock Valley Loop,United States,39.727942,0.0,0,0,0,3,2,...,admin,109,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,,0.625,1.429574,34.726485,-86.198479
1,34749,Paint Rock Valley,Alabama,23.627018,4.0,6,6,3,5,4,...,,0,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,,0.461538,1.137782,34.770065,-86.189154
2,34750,Central Alabama Country Tour - Highway 22,Alabama,87.583394,3.5,3,6,3,4,4,...,,0,Drive through a nice slide of Central Alabama ...,Smooth asphalt the entire way. Maybe one or tw...,Only an occasional store along the way. Roanok...,,0.538462,1.134919,32.94998,-85.993627
3,34751,County Road 14,Tennessee,18.31719,0.0,0,3,0,4,3,...,,0,County Rd 14 west of the Natchez Trace runs al...,"Road quality is fair, though I remember the Al...",Really not much to do off the bike ... Waterlo...,,0.888889,1.155947,34.961208,-88.036771
4,34752,"Alabama's ""Mini-Dragon"" to 29 Dreams",Alabama,27.065368,2.33,9,11,6,4,5,...,,0,On this route you will find several homes and ...,The road is nicely paved and marked and has nu...,Upon getting to the Leeds exit from interstate...,,0.576923,1.581907,33.529228,-86.651468


In [54]:
# Write files
with gzip.open('../data/route_gdf.pkl', 'wb') as f:
        dill.dump(route_gdf, f)
with gzip.open('../data/route_df.pkl', 'wb') as f:
        dill.dump(df, f)

# Read files
# with gzip.open('../data/route_gdf.pkl', 'rb') as f:
#     route_gdf = dill.load(f)
# with gzip.open('../data/route_df.pkl', 'rb') as f:
#     df = dill.load(f)

# Feature processing

In [108]:
# functions for transformers
def calc_row_sum(cols):
    return pd.DataFrame(cols.apply(lambda x: x.sum(), axis=1))

In [109]:
numeric_features = ['route_length','state_prop_rank']#,'scenery_rating','drive_enjoyment_rating','tourism_rating']

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", RobustScaler())]
)

#tfidf vectorizer for each route's descriptions
description_transformer = TfidfVectorizer(stop_words=STOP_WORDS, 
                                   ngram_range=(1,2),
                                   min_df=.01
                                  )

# Route 'engagement' by MR site users: 
# num_user_reviews + num_users_rode + num_users_want2ride
engagement_transformer = FunctionTransformer(calc_row_sum)


preprocessor = ColumnTransformer([
    ('route_engagement',engagement_transformer, ['num_user_reviews','num_users_rode','num_users_want2ride']), 
    ('scenery', description_transformer, 'scenery_description'),
    ('drive_enjoyment', description_transformer, 'drive_enjoyment_description'),
    ('tourism', description_transformer, 'tourism_description'),
    ('state', OneHotEncoder(handle_unknown='ignore'), ['state']),
    ('numeric_features',numeric_transformer, numeric_features),
    ('locale','passthrough',['loc_lat','loc_lon'])
])

features = preprocessor.fit_transform(df)

In [110]:
engine_pipe = Pipeline([
    ('preprocessor', preprocessor),#preprocessor to deal to transform/generate each feature
    ('nn', NearestNeighbors(n_neighbors=10)),#Nearest Neighbors
])

engine_pipe.fit(df)

## Model Exploration

A few roads I've been on (to test functionality):

In [111]:
df.iloc[[37,53]]

Unnamed: 0,gpx_file_num,name,state,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,tourism_rating,state_rank,num_state_routes,scenery_description,drive_enjoyment_description,tourism_description,state_prop_rank,sinuosity,route_length,loc_lat,loc_lon
37,34786,Pacific Coast Cruise; Hwy 1,California,4.76,51,95,64,5,4,5,2,131,This world-class scenic route takes you along ...,Overall the road quality is very good and the ...,You can plan a weeks vacation along this route...,0.015267,1.174615,104.417306,35.95458,-121.48141
53,34802,Napa to the Shores of Lake Berryessa,California,3.5,10,13,11,4,5,3,22,131,This is the hills of the Napa Valley. Windy ro...,While there are some parts in disrepair the ov...,There's really not much along the way but on S...,0.167939,1.385423,62.733844,38.64477,-122.28779


In [113]:
n=998

dists, indices = engine_pipe[1].kneighbors(features[n])
df.iloc[indices[0]]

Unnamed: 0,gpx_file_num,name,state,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,tourism_rating,state_rank,num_state_routes,scenery_description,drive_enjoyment_description,tourism_description,state_prop_rank,sinuosity,route_length,loc_lat,loc_lon
998,35763,Allegan Dam - Lakeshore Drive Loop,Michigan,0.0,0,7,0,5,4,5,39,45,Terrific scenery going from hardwood forests t...,"Good twisties around the Allegan Dam, smoothin...","If you start off early, Village Inn in Allegan...",0.866667,7.613904,59.372764,42.54997,-86.02628
1483,36251,Richland to Shelbyville,Michigan,4.5,2,4,1,4,4,4,23,45,This is one of my go-to rides when I want to g...,"There are a couple tight 90 degree turns, and ...",Gas is available at the start of the route in ...,0.511111,1.311982,31.917887,42.63854,-85.45884
161,34910,South Bend to Four Winds Casino Run,Michigan,4.0,1,4,1,4,4,4,30,45,A lot of farm country including some Amish Ter...,"Most are excellent, but you may encounter some...",Four Winds Casino is actually over the border ...,0.666667,1.247998,37.186444,41.65797,-86.56587
791,35551,"Dexter Trail - Gregory to Mason, MI",Michigan,4.0,1,5,1,3,5,2,31,45,Not really any special scenery here but this r...,"Curves, low traffic and very few intersection ...",Not many amenities on the route itself. In fac...,0.688889,1.20587,25.786966,42.50709,-84.25737
1562,36330,Pine River Road Southwest of Midland,Michigan,4.0,3,0,4,4,3,1,17,45,This route follows the Pine River. While on th...,Lots of twisties the whole way. Towards the en...,"No tourist attractions, and only one gas stati...",0.377778,1.118409,24.912892,43.53976,-84.431
156,34905,US Hwy 6,Indiana,3.0,5,0,1,4,3,2,41,53,Completely rural countryside with some residen...,Two-lane highway traveling through the country...,"Few small towns offer gas, a convenience store...",0.773585,1.072387,38.983023,41.51747,-86.74991
238,34989,Michigan's West Coast Winery Tour,Michigan,4.0,1,4,0,4,3,4,33,45,Routed along the old connector between Detroit...,"Hey, it's Michigan. If you want twisties, go t...","It's a short run with plenty of ""bergs"" to sto...",0.733333,1.309658,97.258111,42.1403,-86.26213
1418,36186,Route 10 to Kerstings Harley and Motorcycle Mu...,Indiana,3.5,4,1,2,3,3,3,16,53,"Riding west on 10 you will have some curves, s...",Mostly straight yet has some curves along the ...,"Bass lake, Kerstings Motorcycle Museum, small...",0.301887,1.145102,50.72272,41.22599,-86.44228
165,34914,Stagecoach Road,Indiana,1.5,2,4,1,4,3,1,42,53,A short jaunt through the heart of the Indiana...,This road is relatively flat but has many twis...,"This road is only a couple miles long, so ther...",0.792453,1.225171,2.491881,41.60762,-87.20306
159,34908,"Goldring Road, La Porte, IN",Indiana,3.67,7,1,0,4,4,1,21,53,Scenery is limited to the wooded area the road...,"I call this road the ""Dragon"" of Northwest Ind...",Country road... nothing to offer other than a ...,0.396226,1.179809,8.286688,41.61037,-86.7972
