In [1]:
import numpy as np
import pandas as pd
from os.path import exists as file_exists
import geopandas
from gpx_converter import Converter
from shapely.geometry import LineString, MultiPoint
from shapely.ops import split

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV, LinearRegression, SGDRegressor, Ridge
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, RobustScaler, OneHotEncoder

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS = STOP_WORDS.union({'ll', 've', 'pron'})

In [28]:
from sklearn import set_config
set_config(display='diagram')
pd.options.mode.chained_assignment = None  # default='warn'

# Motorcycle Road Recommendation Engine

In [4]:
# calculates the sinuosity of each route from its gpx file of lat/lon coordinates
def calcluate_sinuosity(gpx_file_num):
    gpx_file = f'gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_array = Converter(input_file=gpx_file).gpx_to_numpy_array()
        except Exception:
            return -1
        
        splits = 4
        subsets = np.array_split(gpx_array, splits)
        subset_sinuosities = []
        
        for subset in subsets:
            start_pt = subset[0]
            end_pt = subset[-1]
            route = LineString(subset)
            route_SL = LineString((start_pt, end_pt))
            route_sinuosity = route.length / route_SL.length
            subset_sinuosities.append(route_sinuosity)
        return sum(subset_sinuosities)/splits
    else:
        return -2

def get_route_coords(gpx_file_num):
    gpx_file = f'gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_df = Converter(input_file=gpx_file).gpx_to_dataframe()
            route_line = LineString(list(zip(gpx_df.longitude, gpx_df.latitude)))
            return route_line
        except Exception:
            return None

valid_states = ['Alabama', 'California', 'Georgia', 'Missouri', 'Illinois', 'Ohio',
       'Kentucky', 'Colorado', 'United States', 'Indiana', 'New York',
       'Vermont', 'Texas', 'Florida', 'Minnesota', 'Virginia',
       'Oklahoma', 'Arkansas', 'Maryland', 'West Virginia',
       'Michigan', 'North Carolina', 'Oregon', 'Pennsylvania',
       'Washington', 'New Jersey', 'Alaska',
       'South Carolina', 'Utah', 'New Hampshire', 'Iowa', 'Louisiana',
       'Mississippi', 'Wisconsin',
       'South Dakota', 'Wyoming', 'Massachusetts', 'New Mexico',
       'Montana', 'Idaho', 'Nevada', 'Arizona',
       'Kansas', 'Northeast', 'Southwest', 'Golf Coast', 'Southeast',
       'Tennessee', 'Nebraska', 'Delaware', 'Pacific Coast',
       'Appalachian Mountains', 'Maine', 'Rhode Island', 'Connecticut',
       'North Dakota', 'Hawaii']

In [38]:
#read in data
df_raw = pd.read_csv('route_data_RAW.csv')
df_raw['state_prop_rank'] = df_raw.state_rank / df_raw.num_state_routes
df_raw['sinuosity'] = [calcluate_sinuosity(x) for x in df_raw['gpx_file_num']]
df = df_raw.loc[df_raw.sinuosity>=0].reset_index(drop=True)

In [95]:
route_coords = {
    'gpx_file_num': [x for x in df['gpx_file_num']],
    'geometry': [get_route_coords(x) for x in df['gpx_file_num']]
}

route_gdf = geopandas.GeoDataFrame(route_coords, crs='EPSG:2163')
df['route_length'] = route_gdf.geometry.length

In [106]:
rough_locale = route_gdf.geometry.representative_point()
df['loc_lat'] = rough_locale.x
df['loc_lon'] = rough_locale.y

In [107]:
df.head()

Unnamed: 0,gpx_file_num,name,state,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,tourism_rating,...,num_state_routes,scenery_description,drive_enjoyment_description,tourism_description,state_prop_rank,sinuosity,route_length,rough_locale,loc_lat,loc_lon
0,6652,Paint Rock Valley Loop,United States,0.0,0,0,0,3,2,1,...,24,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,0.666667,1.429574,0.910715,POINT (-86.246 34.752),-86.24631,34.75214
1,34749,Paint Rock Valley,Alabama,4.0,6,6,3,5,4,2,...,26,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,0.461538,1.137782,0.644267,POINT (-86.243 34.764),-86.24329,34.76445
2,34750,Central Alabama Country Tour - Highway 22,Alabama,3.5,3,6,3,4,4,2,...,26,Drive through a nice slide of Central Alabama ...,Smooth asphalt the entire way. Maybe one or tw...,Only an occasional store along the way. Roanok...,0.5,1.134919,1.485195,POINT (-85.974 32.931),-85.97433,32.93111
3,34751,County Road 14,Tennessee,0.0,0,3,0,4,3,2,...,72,County Rd 14 west of the Natchez Trace runs al...,"Road quality is fair, though I remember the Al...",Really not much to do off the bike ... Waterlo...,1.0,1.155947,0.402356,POINT (-88.062 34.943),-88.06226,34.94301
4,34752,"Alabama's ""Mini-Dragon"" to 29 Dreams",Alabama,2.33,9,11,6,4,5,2,...,26,On this route you will find several homes and ...,The road is nicely paved and marked and has nu...,Upon getting to the Leeds exit from interstate...,0.5,1.581907,0.466264,POINT (-86.651 33.545),-86.6507,33.54515


### Feature processing

In [42]:
# functions for transformers
def calc_row_sum(cols):
    return pd.DataFrame(cols.apply(lambda x: x.sum(), axis=1))

In [114]:
numeric_features = ['route_length','state_prop_rank']#,'scenery_rating','drive_enjoyment_rating','tourism_rating']

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", RobustScaler())]
)

#tfidf vectorizer for each route's descriptions
description_transformer = TfidfVectorizer(stop_words=STOP_WORDS, 
                                   ngram_range=(1,2),
                                   min_df=.01
                                  )

# Route 'engagement' by MR site users: 
# num_user_reviews + num_users_rode + num_users_want2ride
engagement_transformer = FunctionTransformer(calc_row_sum)

state_ranker = FunctionTransformer(calc_state_rank)


preprocessor = ColumnTransformer([
    ('route_engagement',engagement_transformer, ['num_user_reviews','num_users_rode','num_users_want2ride']), 
    ('scenery', description_transformer, 'scenery_description'),
    ('drive_enjoyment', description_transformer, 'drive_enjoyment_description'),
    ('tourism', description_transformer, 'tourism_description'),
    ('state', OneHotEncoder(handle_unknown='ignore'), ['state']),
    ('state_rank',state_ranker,['state_rank','num_state_routes']),
    ('numeric_features',numeric_transformer, numeric_features),
    ('locale','passthrough',['loc_lat','loc_lon'])
])

features = preprocessor.fit_transform(df)

In [115]:
engine_pipe = Pipeline([
    ('preprocessor', preprocessor),#preprocessor to deal to transform/generate each feature
    ('nn', NearestNeighbors(n_neighbors=10)),#Nearest Neighbors
])

engine_pipe.fit(df)

## Model Exploration

A few roads I've been on (to test functionality):

In [88]:
df.iloc[[37,53]]

Unnamed: 0,gpx_file_num,name,state,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,tourism_rating,state_rank,num_state_routes,scenery_description,drive_enjoyment_description,tourism_description,state_prop_rank,sinuosity,route_length
37,34786,Pacific Coast Cruise; Hwy 1,California,4.76,51,95,64,5,4,5,2,131,This world-class scenic route takes you along ...,Overall the road quality is very good and the ...,You can plan a weeks vacation along this route...,0.015267,1.174615,1.927261
53,34802,Napa to the Shores of Lake Berryessa,California,3.5,10,13,11,4,5,3,22,131,This is the hills of the Napa Valley. Windy ro...,While there are some parts in disrepair the ov...,There's really not much along the way but on S...,0.167939,1.385423,1.13445


In [117]:
n=37

dists, indices = engine_pipe[1].kneighbors(features[n])
df.iloc[indices[0]]

Unnamed: 0,gpx_file_num,name,state,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,tourism_rating,...,num_state_routes,scenery_description,drive_enjoyment_description,tourism_description,state_prop_rank,sinuosity,route_length,rough_locale,loc_lat,loc_lon
37,34786,Pacific Coast Cruise; Hwy 1,California,4.76,51,95,64,5,4,5,...,131,This world-class scenic route takes you along ...,Overall the road quality is very good and the ...,You can plan a weeks vacation along this route...,0.015267,1.174615,1.927261,POINT (-121.481 35.955),-121.48141,35.95458
41,34790,Twisty Road - Next 140 miles!!! (California Ro...,California,4.7,61,99,39,4,5,1,...,131,some pretty decent view's from higher elevatio...,most likely the craziest elevation changes you...,i seem to remember only 2 gas stations ( neith...,0.007634,1.189029,2.349427,POINT (-123.225 40.395),-123.22469,40.39475
591,35348,Chief Joseph Scenic Highway-Bighorn Mountains,Wyoming,4.62,61,62,76,5,5,2,...,22,This is a great full day ride when combined wi...,Endless switch-backs Vertical ascents to 10'00...,"Lots of pullouts on these roads. However, ther...",0.090909,1.29294,3.353796,POINT (-108.499 44.797),-108.49898,44.79688
279,35032,Going-To-The-Sun Road,Montana,4.62,55,64,73,5,3,4,...,29,"For mountain climbers, reaching Mt Everest is ...",Much of the road quality depends on the crowds...,On this trip you'll want to make sure to bring...,0.068966,1.319421,0.90668,POINT (-113.724 48.698),-113.72417,48.69801
73,34822,The Walden Loop (courtesy of Greeley HD&Yamaha),Colorado,4.74,47,126,56,5,4,5,...,77,Absolutely breathtaking is the only way to des...,The roads are great with few rough areas. May ...,Many stops and pull offs are provided so you c...,0.025974,1.886016,3.750915,POINT (-105.756 40.444),-105.75628,40.44398
386,35140,Talimena National Scenic Byway,Oklahoma,4.38,61,63,74,5,4,4,...,40,"The Talimena National Scenic Byway or, for sho...","As mentioned above, the road is filled with lo...","Access can be at Talihina, OK on the west end ...",0.025,1.1782,1.005985,POINT (-94.639 34.711),-94.6389,34.71128
17,34766,Coronado Trail,Arizona,4.78,63,77,42,5,5,3,...,45,This road brings you to a whole new world in A...,A true 5-star road for motorcycles. They say t...,There is a lot of camping and hiking trails al...,0.022222,1.594885,1.71927,POINT (-109.309 33.537),-109.30876,33.53686
79,34828,Peak to Peak Highway,Colorado,4.36,43,87,54,5,3,4,...,77,Depending on which canyon you start with you w...,The roads will have some sand in the early spr...,"Several stops along the way to take pictures, ...",0.038961,1.624538,0.94304,POINT (-105.503 40.076),-105.50311,40.07564
433,35187,Central Hills Loop (includes the Needles Highway),South Dakota,4.77,62,88,87,5,5,4,...,17,"From the quaint, historic town of Keystone, pa...",This route is a motorcyclists dream. Incredibl...,"Check out the shops in Keystone, visit Mount R...",0.058824,1.892865,0.915218,POINT (-103.465 43.818),-103.46462,43.81794
613,35371,"""The Snake"" - Hwy 421 & 34",Tennessee,4.89,35,111,48,4,5,3,...,72,"This route takes you from Mountain City, TN to...","The Snake is said to have ""489 curves, 3 mount...",Mountain City and Holston Valley have everythi...,0.069444,1.402735,0.388273,POINT (-81.937 36.524),-81.93663,36.52356
