In [2]:
import numpy as np
import pandas as pd
import os
import gzip
import dill
from os.path import exists as file_exists
import geopandas
from gpx_converter import Converter
from shapely.geometry import LineString, MultiPoint, Point
from shapely.ops import split

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, RobustScaler, OneHotEncoder

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS = STOP_WORDS.union({'ll', 've', 'pron'})

In [4]:
from sklearn import set_config
set_config(display='diagram')
pd.options.mode.chained_assignment = None  # default='warn'

# Motorcycle Road Recommendation Engine

## Data input

In [22]:
# calculates the sinuosity of each route from its gpx file of lat/lon coordinates
def calcluate_sinuosity(gpx_file_num):
    gpx_file = f'../gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_array = Converter(input_file=gpx_file).gpx_to_numpy_array()
        except Exception:
            return -1
        
        splits = 4
        subsets = np.array_split(gpx_array, splits)
        subset_sinuosities = []
        
        for subset in subsets:
            start_pt = subset[0]
            end_pt = subset[-1]
            route = LineString(subset)
            route_SL = LineString((start_pt, end_pt))
            route_sinuosity = route.length / route_SL.length
            subset_sinuosities.append(route_sinuosity)
        return sum(subset_sinuosities)/splits
    else:
        return -2

def get_route_coords(gpx_file_num):
    gpx_file = f'../gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_df = Converter(input_file=gpx_file).gpx_to_dataframe()
#             return gpx_df
            route_line = LineString(list(zip(gpx_df.longitude, gpx_df.latitude)))
            return route_line
        except Exception:
            return None

valid_states = ['Alabama', 'California', 'Georgia', 'Missouri', 'Illinois', 'Ohio',
       'Kentucky', 'Colorado', 'United States', 'Indiana', 'New York',
       'Vermont', 'Texas', 'Florida', 'Minnesota', 'Virginia',
       'Oklahoma', 'Arkansas', 'Maryland', 'West Virginia',
       'Michigan', 'North Carolina', 'Oregon', 'Pennsylvania',
       'Washington', 'New Jersey', 'Alaska',
       'South Carolina', 'Utah', 'New Hampshire', 'Iowa', 'Louisiana',
       'Mississippi', 'Wisconsin',
       'South Dakota', 'Wyoming', 'Massachusetts', 'New Mexico',
       'Montana', 'Idaho', 'Nevada', 'Arizona',
       'Kansas', 'Northeast', 'Southwest', 'Golf Coast', 'Southeast',
       'Tennessee', 'Nebraska', 'Delaware', 'Pacific Coast',
       'Appalachian Mountains', 'Maine', 'Rhode Island', 'Connecticut',
       'North Dakota', 'Hawaii']

from math import radians, cos, sin, asin, sqrt

# Calculates distance between 2 GPS coordinates
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 3956 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [23]:
#read in data
df_raw = pd.read_csv('../data/route_data2.csv')#.drop('files',axis=1)
df_raw['state_prop_rank'] = df_raw.state_rank / df_raw.num_state_routes
df_raw['sinuosity'] = [calcluate_sinuosity(x) for x in df_raw['gpx_file_num']]
df = df_raw.loc[df_raw.sinuosity>=0].reset_index(drop=True)

In [25]:
route_coords = {
    'gpx_file_num': [x for x in df['gpx_file_num']],
    'geometry': [get_route_coords(x) for x in df['gpx_file_num']]
}

route_gdf = geopandas.GeoDataFrame(route_coords, crs='EPSG:2163')
# route_gdf['user_rating'] = df.user_rating
# route_gdf['route_name'] = df.name
# route_gdf['state'] = df.state

In [26]:
route_gdf

Unnamed: 0,gpx_file_num,geometry
0,6652,"LINESTRING (-86.330 34.681, -86.330 34.683, -8..."
1,34749,"LINESTRING (-86.329 34.664, -86.329 34.664, -8..."
2,34750,"LINESTRING (-86.632 32.840, -86.629 32.836, -8..."
3,34751,"LINESTRING (-87.895 34.879, -87.895 34.879, -8..."
4,34752,"LINESTRING (-86.826 33.521, -86.826 33.518, -8..."
...,...,...
2043,67128,"LINESTRING (-120.389 35.656, -120.394 35.656, ..."
2044,67147,"LINESTRING (-100.765 34.103, -100.766 34.113, ..."
2045,67232,"LINESTRING (-122.873 38.608, -122.874 38.609, ..."
2046,67234,"LINESTRING (-122.515 37.945, -122.514 37.945, ..."


In [27]:
#length of route in miles
distances = []
for line in route_gdf.geometry:
    numCoords = len(line.coords) - 1
    distance = 0
    for i in range(0, numCoords):
        point1 = line.coords[i]
        point2 = line.coords[i + 1]
        distance += haversine(point1[0], point1[1], point2[0], point2[1])
    distances.append(distance)
df['route_length'] = distances

In [28]:
#representative point for each route
rep_point = route_gdf.geometry.centroid
route_gdf['rep_point'] = rep_point
df['loc_lat'] = rep_point.y
df['loc_lon'] = rep_point.x

In [29]:
df.head()

Unnamed: 0,gpx_file_num,name,state,route_length,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,...,author,author_points,scenery_description,drive_enjoyment_description,tourism_description,files,state_prop_rank,sinuosity,loc_lat,loc_lon
0,6652,Paint Rock Valley Loop,United States,39.727942,0.0,0,0,0,3,2,...,admin,109,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,,0.625,1.429574,34.726485,-86.198479
1,34749,Paint Rock Valley,Alabama,23.627018,4.0,6,6,3,5,4,...,,0,The valley road winds beside the Paint Rock Ri...,Basically good asphalt...occasional fault here...,Not much to do...just lots of outdoors to see....,,0.461538,1.137782,34.770065,-86.189154
2,34750,Central Alabama Country Tour - Highway 22,Alabama,87.583394,3.5,3,6,3,4,4,...,,0,Drive through a nice slide of Central Alabama ...,Smooth asphalt the entire way. Maybe one or tw...,Only an occasional store along the way. Roanok...,,0.538462,1.134919,32.94998,-85.993627
3,34751,County Road 14,Tennessee,18.31719,0.0,0,3,0,4,3,...,,0,County Rd 14 west of the Natchez Trace runs al...,"Road quality is fair, though I remember the Al...",Really not much to do off the bike ... Waterlo...,,0.888889,1.155947,34.961208,-88.036771
4,34752,"Alabama's ""Mini-Dragon"" to 29 Dreams",Alabama,27.065368,2.33,9,11,6,4,5,...,,0,On this route you will find several homes and ...,The road is nicely paved and marked and has nu...,Upon getting to the Leeds exit from interstate...,,0.576923,1.581907,33.529228,-86.651468


In [33]:
route_gdf = route_gdf.merge(df, on='gpx_file_num').drop('files',axis=1)

In [59]:
route_gdf.route_length.min()

0.8788460902607798

In [5]:
# # Write files
# with gzip.open('../data/route_gdf.pkl', 'wb') as f:
#         dill.dump(route_gdf, f)
# with gzip.open('../data/route_df.pkl', 'wb') as f:
#         dill.dump(df, f)

# Read files
with gzip.open('../data/route_gdf.pkl', 'rb') as f:
    route_gdf = dill.load(f)
with gzip.open('../data/route_df.pkl', 'rb') as f:
    df = dill.load(f)

# Feature processing

In [6]:
# functions for transformers
def calc_row_sum(cols):
    return pd.DataFrame(cols.apply(lambda x: x.sum(), axis=1))

In [20]:
numeric_features = ['route_length','state_prop_rank']#,'scenery_rating','drive_enjoyment_rating','tourism_rating']

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", RobustScaler())]
)

#tfidf vectorizer for each route's descriptions
description_transformer = TfidfVectorizer(stop_words=STOP_WORDS, 
#                                    ngram_range=(1,2),
                                   min_df=.01
                                  )

# Route 'engagement' by MR site users: 
# num_user_reviews + num_users_rode + num_users_want2ride
engagement_transformer = FunctionTransformer(calc_row_sum)


preprocessor = ColumnTransformer([
    ('route_engagement',engagement_transformer, ['num_user_reviews','num_users_rode','num_users_want2ride']), 
#     ('scenery', description_transformer, 'scenery_description'),
#     ('drive_enjoyment', description_transformer, 'drive_enjoyment_description'),
#     ('tourism', description_transformer, 'tourism_description'),
    ('state', OneHotEncoder(handle_unknown='ignore'), ['state']),
    ('numeric_features',numeric_transformer, numeric_features),
#     ('locale','passthrough',['loc_lat','loc_lon'])
])

features = preprocessor.fit_transform(df)

In [21]:
features

<2048x73 sparse matrix of type '<class 'numpy.float64'>'
	with 8040 stored elements in Compressed Sparse Row format>

In [22]:
engine_pipe = Pipeline([
    ('preprocessor', preprocessor),#preprocessor to deal to transform/generate each feature
    ('nn', NearestNeighbors(n_neighbors=20)),#Nearest Neighbors
])

engine_pipe.fit(df)

## Model Exploration

A few roads I've been on (to test functionality):

In [23]:
df.iloc[[37,53]]
df[df.gpx_file_num == 35762]

Unnamed: 0,gpx_file_num,name,state,route_length,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,...,author,author_points,scenery_description,drive_enjoyment_description,tourism_description,files,state_prop_rank,sinuosity,loc_lat,loc_lon
989,35762,The Panoramic Highway,California,7.916804,4.5,2,18,6,5,3,...,silverwinggli,5,If you like twistys and Amazing mountain views...,Road is generally good. It is a narrow mounta...,"There is a small town along the way, but for a...",,0.481752,1.486539,37.901778,-122.595512


In [24]:
n=989

dists, indices = engine_pipe[1].kneighbors(features[n])
df.iloc[indices[0]]

Unnamed: 0,gpx_file_num,name,state,route_length,user_rating,num_user_reviews,num_users_rode,num_users_want2ride,scenery_rating,drive_enjoyment_rating,...,author,author_points,scenery_description,drive_enjoyment_description,tourism_description,files,state_prop_rank,sinuosity,loc_lat,loc_lon
989,35762,The Panoramic Highway,California,7.916804,4.5,2,18,6,5,3,...,silverwinggli,5,If you like twistys and Amazing mountain views...,Road is generally good. It is a narrow mounta...,"There is a small town along the way, but for a...",,0.481752,1.486539,37.901778,-122.595512
1121,35894,Las Flores Canyon Road to Piuma Road,California,14.987478,4.5,3,17,6,4,4,...,adhersin,11,Gorgeous view on the Valley and the ocean. You...,Very few bumps/holes. And the road surfaces of...,Don't expect to find any stores on the way. Yo...,,0.510949,2.235005,34.082005,-118.678348
869,35638,Through Death Valley on Badwater Rd,California,60.681495,4.8,5,15,6,4,4,...,steben53,15,I need to give this 4 stars because it's so mu...,The road starts out with some straight section...,For this stretch of road amenities are non-exi...,,0.291971,1.209418,36.08665,-116.661496
1318,36094,The Cuyahoga Valley National Park Run,Ohio,15.484292,3.2,5,15,6,5,4,...,Voodoo Daddy,6,The route begins riding down into a somewhat i...,The roads are mostly newly paved 2 lane street...,The ride is about 26 miles long but can easily...,,0.39,1.21788,41.243634,-81.572821
672,35432,The St Croix Trail,Minnesota,7.525871,3.67,6,10,10,5,4,...,MCR Contributor,1038,"You can see the St. Croix river bluff, some pr...","The route is fun, full of S curves going up an...","Stillwater is a hot spot for shopping, restaur...",,0.357143,1.095682,44.903679,-92.791397
757,35522,Kickapoo river valley,Wisconsin,18.489999,4.0,6,19,1,4,4,...,jhlemay,5,This route crosses the Kickapoo River at least...,"The main road is great, well paved and maintai...",Highlight of the road is the visitor center wh...,,0.313725,1.229419,43.765554,-90.552692
745,35510,The Highland Scenic Highway (SR 150),West Virginia,23.153531,4.75,4,15,7,5,5,...,Pouge,62,A beautiful ride through the highlands of W.Va...,Very smooth road. Much of the roads in the sur...,"Marlinton has food, gas and hotels. Marlinton ...",,0.354167,1.39255,38.259878,-80.207685
4,34752,"Alabama's ""Mini-Dragon"" to 29 Dreams",Alabama,27.065368,2.33,9,11,6,4,5,...,,0,On this route you will find several homes and ...,The road is nicely paved and marked and has nu...,Upon getting to the Leeds exit from interstate...,,0.576923,1.581907,33.529228,-86.651468
128,34878,Starved Rock Run on Rt 71,Illinois,12.150456,4.2,7,8,11,5,4,...,,0,The area is relatively very hilly for Illinois...,"Nicely paved, some areas are narrow for a two ...","Starved Rock State Park has a nice lodge, camp...",,0.2,1.201002,41.317982,-88.935659
909,35680,Up to the Ridges,North Carolina,38.035487,4.5,2,18,6,5,4,...,outdoorchild92,5,The road goes through Cherokee and Pisgah nati...,Many mountain curves and good climbs here and ...,There are maybe one or two restaurants and a m...,,0.567308,1.374555,36.278032,-81.938544
