In [1]:
import numpy as np
import pandas as pd
from os.path import exists as file_exists
import geopandas
from gpx_converter import Converter
from shapely.geometry import LineString, MultiPoint
from shapely.ops import split

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV, LinearRegression, SGDRegressor, Ridge
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, RobustScaler, OneHotEncoder

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS = STOP_WORDS.union({'ll', 've', 'pron'})

In [28]:
from sklearn import set_config
set_config(display='diagram')
pd.options.mode.chained_assignment = None  # default='warn'

# Motorcycle Road Recommendation Engine

In [4]:
# calculates the sinuosity of each route from its gpx file of lat/lon coordinates
def calcluate_sinuosity(gpx_file_num):
    gpx_file = f'gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_array = Converter(input_file=gpx_file).gpx_to_numpy_array()
        except Exception:
            return -1
        
        splits = 4
        subsets = np.array_split(gpx_array, splits)
        subset_sinuosities = []
        
        for subset in subsets:
            start_pt = subset[0]
            end_pt = subset[-1]
            route = LineString(subset)
            route_SL = LineString((start_pt, end_pt))
            route_sinuosity = route.length / route_SL.length
            subset_sinuosities.append(route_sinuosity)
        return sum(subset_sinuosities)/splits
    else:
        return -2

def get_route_coords(gpx_file_num):
    gpx_file = f'gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_df = Converter(input_file=gpx_file).gpx_to_dataframe()
            route_line = LineString(list(zip(gpx_df.longitude, gpx_df.latitude)))
            return route_line
        except Exception:
            return None

valid_states = ['Alabama', 'California', 'Georgia', 'Missouri', 'Illinois', 'Ohio',
       'Kentucky', 'Colorado', 'United States', 'Indiana', 'New York',
       'Vermont', 'Texas', 'Florida', 'Minnesota', 'Virginia',
       'Oklahoma', 'Arkansas', 'Maryland', 'West Virginia',
       'Michigan', 'North Carolina', 'Oregon', 'Pennsylvania',
       'Washington', 'New Jersey', 'Alaska',
       'South Carolina', 'Utah', 'New Hampshire', 'Iowa', 'Louisiana',
       'Mississippi', 'Wisconsin',
       'South Dakota', 'Wyoming', 'Massachusetts', 'New Mexico',
       'Montana', 'Idaho', 'Nevada', 'Arizona',
       'Kansas', 'Northeast', 'Southwest', 'Golf Coast', 'Southeast',
       'Tennessee', 'Nebraska', 'Delaware', 'Pacific Coast',
       'Appalachian Mountains', 'Maine', 'Rhode Island', 'Connecticut',
       'North Dakota', 'Hawaii']

In [38]:
#read in data
df_raw = pd.read_csv('route_data_RAW.csv')
df_raw['state_prop_rank'] = df_raw.state_rank / df_raw.num_state_routes
df_raw['sinuosity'] = [calcluate_sinuosity(x) for x in df_raw['gpx_file_num']]
df = df_raw.loc[df_raw.sinuosity>=0].reset_index(drop=True)

In [95]:
route_coords = {
    'gpx_file_num': [x for x in df['gpx_file_num']],
    'geometry': [get_route_coords(x) for x in df['gpx_file_num']]
}

route_gdf = geopandas.GeoDataFrame(route_coords, crs='EPSG:2163')


In [106]:
#length of route in CRS units
df['route_length'] = route_gdf.geometry.length

#representative point for each route
rough_locale = route_gdf.geometry.representative_point()
df['loc_lat'] = rough_locale.x
df['loc_lon'] = rough_locale.y

In [107]:
df.head()

Unnamed: 0,gpx_file_num,name,state,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,tourism_rating,...,num_state_routes,scenery_description,drive_enjoyment_description,tourism_description,state_prop_rank,sinuosity,route_length,rough_locale,loc_lat,loc_lon
0,6652,Paint Rock Valley Loop,United States,0.0,0,0,0,3,2,1,...,24,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,0.666667,1.429574,0.910715,POINT (-86.246 34.752),-86.24631,34.75214
1,34749,Paint Rock Valley,Alabama,4.0,6,6,3,5,4,2,...,26,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,0.461538,1.137782,0.644267,POINT (-86.243 34.764),-86.24329,34.76445
2,34750,Central Alabama Country Tour - Highway 22,Alabama,3.5,3,6,3,4,4,2,...,26,Drive through a nice slide of Central Alabama ...,Smooth asphalt the entire way. Maybe one or tw...,Only an occasional store along the way. Roanok...,0.5,1.134919,1.485195,POINT (-85.974 32.931),-85.97433,32.93111
3,34751,County Road 14,Tennessee,0.0,0,3,0,4,3,2,...,72,County Rd 14 west of the Natchez Trace runs al...,"Road quality is fair, though I remember the Al...",Really not much to do off the bike ... Waterlo...,1.0,1.155947,0.402356,POINT (-88.062 34.943),-88.06226,34.94301
4,34752,"Alabama's ""Mini-Dragon"" to 29 Dreams",Alabama,2.33,9,11,6,4,5,2,...,26,On this route you will find several homes and ...,The road is nicely paved and marked and has nu...,Upon getting to the Leeds exit from interstate...,0.5,1.581907,0.466264,POINT (-86.651 33.545),-86.6507,33.54515


### Feature processing

In [42]:
# functions for transformers
def calc_row_sum(cols):
    return pd.DataFrame(cols.apply(lambda x: x.sum(), axis=1))

In [114]:
numeric_features = ['route_length','state_prop_rank']#,'scenery_rating','drive_enjoyment_rating','tourism_rating']

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", RobustScaler())]
)

#tfidf vectorizer for each route's descriptions
description_transformer = TfidfVectorizer(stop_words=STOP_WORDS, 
                                   ngram_range=(1,2),
                                   min_df=.01
                                  )

# Route 'engagement' by MR site users: 
# num_user_reviews + num_users_rode + num_users_want2ride
engagement_transformer = FunctionTransformer(calc_row_sum)

state_ranker = FunctionTransformer(calc_state_rank)


preprocessor = ColumnTransformer([
    ('route_engagement',engagement_transformer, ['num_user_reviews','num_users_rode','num_users_want2ride']), 
    ('scenery', description_transformer, 'scenery_description'),
    ('drive_enjoyment', description_transformer, 'drive_enjoyment_description'),
    ('tourism', description_transformer, 'tourism_description'),
    ('state', OneHotEncoder(handle_unknown='ignore'), ['state']),
    ('state_rank',state_ranker,['state_rank','num_state_routes']),
    ('numeric_features',numeric_transformer, numeric_features),
    ('locale','passthrough',['loc_lat','loc_lon'])
])

features = preprocessor.fit_transform(df)

In [115]:
engine_pipe = Pipeline([
    ('preprocessor', preprocessor),#preprocessor to deal to transform/generate each feature
    ('nn', NearestNeighbors(n_neighbors=10)),#Nearest Neighbors
])

engine_pipe.fit(df)

## Model Exploration

A few roads I've been on (to test functionality):

In [88]:
df.iloc[[37,53]]

Unnamed: 0,gpx_file_num,name,state,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,tourism_rating,state_rank,num_state_routes,scenery_description,drive_enjoyment_description,tourism_description,state_prop_rank,sinuosity,route_length
37,34786,Pacific Coast Cruise; Hwy 1,California,4.76,51,95,64,5,4,5,2,131,This world-class scenic route takes you along ...,Overall the road quality is very good and the ...,You can plan a weeks vacation along this route...,0.015267,1.174615,1.927261
53,34802,Napa to the Shores of Lake Berryessa,California,3.5,10,13,11,4,5,3,22,131,This is the hills of the Napa Valley. Windy ro...,While there are some parts in disrepair the ov...,There's really not much along the way but on S...,0.167939,1.385423,1.13445


In [118]:
n=53

dists, indices = engine_pipe[1].kneighbors(features[n])
df.iloc[indices[0]]

Unnamed: 0,gpx_file_num,name,state,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,tourism_rating,...,num_state_routes,scenery_description,drive_enjoyment_description,tourism_description,state_prop_rank,sinuosity,route_length,rough_locale,loc_lat,loc_lon
53,34802,Napa to the Shores of Lake Berryessa,California,3.5,10,13,11,4,5,3,...,131,This is the hills of the Napa Valley. Windy ro...,While there are some parts in disrepair the ov...,There's really not much along the way but on S...,0.167939,1.385423,1.13445,POINT (-122.288 38.645),-122.28779,38.64477
298,35052,"Old Hwy 50 - ""The Loneliest Road In America""",Nevada,4.0,14,10,10,4,4,3,...,14,"On this route, you will see the Nevada desert ...",Nevada has always been known for it awesome ro...,The biggest amenity of Old Highway 50 is Middl...,0.214286,1.044574,1.889525,POINT (-118.866 39.486),-118.86638,39.48558
56,34805,The Golden 49,California,4.36,14,14,9,5,0,4,...,131,Beautiful scenery of the Sierras. Mountain ove...,"Great road all-around. No pots, dips, or heavy...",Coulterville has small country stores and a fe...,0.122137,1.351761,0.34279,POINT (-120.133 37.603),-120.13261,37.6032
43,34792,"Highway 198, Sequoia National Park",California,4.63,9,15,9,5,5,4,...,131,This place could define the word scenery in a ...,Imagine the perfect layout with tight curves a...,There's a lack of gas stations and fast food j...,0.145038,2.201111,0.78499,POINT (-118.805 36.605),-118.80527,36.60472
297,35051,Reno to Lake Tahoe Loop,Nevada,4.38,8,23,7,5,5,4,...,14,"From peaceful cattle grazing meadows, with no ...",The roads are great. Newly paved 2 lane with t...,"Fabulous camp grounds, national forests restau...",0.285714,1.519524,2.106253,POINT (-119.977 39.650),-119.97727,39.65017
623,35381,Caliente to Kernville on County Rd 483,California,5.0,2,23,6,5,4,4,...,131,California rolling hills with two beautiful fa...,"This road has everything hills, down grades, t...",You will find great amenities at Lake Santa Is...,0.48855,1.331647,0.710544,POINT (-118.525 35.490),-118.52456,35.48954
801,35563,The Famous Rock Store Run (Mulholland Highway),California,4.0,6,13,14,4,5,5,...,131,The scenery on this ride is probably pretty ni...,The road is not perfect but the road twisty an...,"The only ""roadside amenity"" on this run is the...",0.29771,1.822287,0.040352,POINT (-118.796 34.100),-118.79557,34.0999
66,34815,Topanga Canyon Loop,California,3.0,8,19,6,0,5,4,...,131,If you want to see some of the only remaining ...,The roads going through these canyons are a bl...,There really isn't that much to do along this ...,0.496183,1.560881,0.392014,POINT (-118.596 34.081),-118.59593,34.08069
394,35148,The Estacada to Detroit Lake highway,Oregon,4.4,17,8,11,5,4,1,...,36,This route travel entirely through the Mt. Hoo...,Lots of twists and turns but portions of the r...,Not much till you get to Detroit Lake ...just ...,0.055556,1.309291,1.22201,POINT (-122.035 45.028),-122.03502,45.02821
1751,36589,Rim of the World Highway - CA SR 18 & 330,California,4.67,3,20,9,5,4,4,...,131,Incredible mountain vistas visible from this r...,"SR 18 is a well traveled road and in general, ...",Outstanding views with numerous pull-overs to ...,0.442748,2.135684,0.585945,POINT (-117.212 34.229),-117.21249,34.22913
