In [1]:
import numpy as np
import pandas as pd
import altair as alt
from vega_datasets import data
import requests
import time
import geopandas
import dill
import gzip
from gpx_converter import Converter
from shapely.geometry import LineString
import matplotlib.pyplot as plt
from os.path import exists as file_exists
import warnings
warnings.filterwarnings(action='once')

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn import metrics, tree

In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS = STOP_WORDS.union({'ll', 've', 'pron'})


In [3]:
def get_data(data_path):
    """Return loaded data from disk."""
    with gzip.open(data_path, 'rb') as f:
        return dill.load(f)

# Data input & cleaning

In [2]:
# calculates the sinuosity of each route from its gpx file of lat/lon coordinates
def calcluate_sinuosity(gpx_file_num):
    gpx_file = f'gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_array = Converter(input_file=gpx_file).gpx_to_numpy_array()
        except Exception:
            return -1
        
        splits = 4
        subsets = np.array_split(gpx_array, splits)
        subset_sinuosities = []
        
        for subset in subsets:
            start_pt = subset[0]
            end_pt = subset[-1]
            route = LineString(subset)
            route_SL = LineString((start_pt, end_pt))
            route_sinuosity = route.length / route_SL.length
            subset_sinuosities.append(route_sinuosity)
        return sum(subset_sinuosities)/splits
    else:
        return -2

In [21]:
route_data_RAW = pd.read_csv('data/route_data2.csv').drop('files',axis=1)
# s = [calcluate_sinuosity(x) for x in route_data_RAW['gpx_file_num']]
# route_data_RAW['sinuosity'] = s
route_data_RAW.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2058 entries, 0 to 2057
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gpx_file_num                 2058 non-null   int64  
 1   name                         2058 non-null   object 
 2   state                        2058 non-null   object 
 3   route_length                 2058 non-null   int64  
 4   user_rating                  2058 non-null   float64
 5   num_user_reviews             2058 non-null   int64  
 6   num_users_rode               2058 non-null   int64  
 7   num_users_want2ride          2058 non-null   int64  
 8   scenery_rating               2058 non-null   int64  
 9   drive_enjoyment_rating       2058 non-null   int64  
 10  tourism_rating               2058 non-null   int64  
 11  state_rank                   2058 non-null   int64  
 12  num_state_routes             2058 non-null   int64  
 13  author            

We only want routes that have ratings in our training set.

In [None]:
comments = (
    pd.read_csv('comments.csv')
    .drop('files', axis=1)
    .dropna()
    .groupby('route_name', as_index=False)
    .agg(lambda x: ' '.join(x))
    .drop_duplicates()
)

rated_roads = (
    route_data_RAW.query('num_user_reviews > 0 and sinuosity >= 0')
    #.merge(comments, how='left', left_on='name',right_on='route_name')
    .fillna(' ')
)

And we will limit scope to routes in the US, including routes that cross state lines.

In [None]:
valid_states = ['Alabama', 'California', 'Georgia', 'Missouri', 'Illinois', 'Ohio',
       'Kentucky', 'Colorado', 'United States', 'Indiana', 'New York',
       'Vermont', 'Texas', 'Florida', 'Minnesota', 'Virginia',
       'Oklahoma', 'Arkansas', 'Maryland', 'West Virginia',
       'Michigan', 'North Carolina', 'Oregon', 'Pennsylvania',
       'Washington', 'New Jersey', 'Alaska',
       'South Carolina', 'Utah', 'New Hampshire', 'Iowa', 'Louisiana',
       'Mississippi', 'Wisconsin',
       'South Dakota', 'Wyoming', 'Massachusetts', 'New Mexico',
       'Montana', 'Idaho', 'Nevada', 'Arizona',
       'Kansas', 'Northeast', 'Southwest', 'Golf Coast', 'Southeast',
       'Tennessee', 'Nebraska', 'Delaware', 'Pacific Coast',
       'Appalachian Mountains', 'Maine', 'Rhode Island', 'Connecticut',
       'North Dakota', 'Hawaii']
us_route_data = rated_roads[rated_roads.state.isin(valid_states)]

In [None]:
#us_route_data['weighted_rating'] = us_route_data['user_rating'] * us_route_data['num_user_reviews']
us_route_data['description'] = us_route_data.agg(lambda x: f"{x['scenery_description']}, {x['drive_enjoyment_description']}, {x['tourism_description']}", axis=1)

In [None]:
# Make a geoDataFrame with route coords as the geometry
def get_route_coords(gpx_file_num):
    gpx_file = f'gpx_files/{str(gpx_file_num)}.gpx'
    if file_exists(gpx_file):
        try:
            gpx_df = Converter(input_file=gpx_file).gpx_to_dataframe()
            route_line = LineString(list(zip(gpx_df.longitude, gpx_df.latitude)))
            return route_line
        except Exception:
            return None

route_coords = {
    'gpx_file_num': [x for x in us_route_data['gpx_file_num']],
    'geometry': [get_route_coords(x) for x in us_route_data['gpx_file_num']]
}

#also used EPSG: 4326
route_coords_gdf = geopandas.GeoDataFrame(route_coords, crs="EPSG:4269").merge(us_route_data, on='gpx_file_num')

Dataset of NPS Park boundaries: https://irma.nps.gov/DataStore/Reference/Profile/2225713

In [None]:
# # # calculate a route's shortest distance to a NPS site 
park_data = geopandas.read_file("data/nps_boundary/nps_boundary.shp")

# route_coords_gdf['centroid'] = route_coords_gdf['geometry'].to_crs(epsg=4269).centroid
# park_data['centroid'] = park_data['geometry'].to_crs(epsg=4269).centroid

# us_route_data['distance2nps'] = route_coords_gdf.apply(lambda x: park_data['centroid'].distance(x['centroid']).min(),axis=1)

In [None]:
#Final dataset for analyses
route_coords_gdf.head()

# Visualizations

In [6]:
route_df = get_data('data/route_df.pkl')

In [7]:
route_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2063 entries, 0 to 2062
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gpx_file_num                 2063 non-null   int64  
 1   name                         2063 non-null   object 
 2   state                        2063 non-null   object 
 3   user_rating                  2063 non-null   float64
 4   num_user_reviews             2063 non-null   int64  
 5   num_users_rode               2063 non-null   int64  
 6   num_users_want2ride          2063 non-null   int64  
 7   scenery_rating               2063 non-null   int64  
 8   drive_enjoyment_rating       2063 non-null   int64  
 9   tourism_rating               2063 non-null   int64  
 10  state_rank                   2063 non-null   int64  
 11  num_state_routes             2063 non-null   int64  
 12  scenery_description          2063 non-null   object 
 13  drive_enjoyment_de

In [8]:
alt.Chart(route_df).mark_point().encode(
    alt.Y(alt.repeat("column"), type='quantitative'),
    alt.X(alt.repeat("row"), type='quantitative'),
#     color='tourism_rating:Q'
).properties(
    width=200,
    height=200
).repeat(
    row=['route_length', 'sinuosity'],
    column=['user_rating']
).interactive()


In [None]:
x = 'route_length'

rate = alt.Chart(route_df).mark_point().encode(
    x='route_length',
    y='user_rating',
    tooltip=['name','state','user_rating']
).interactive()

weighted = alt.Chart(route_df).mark_point().encode(
    x='sinuosity',
    y='user_rating',
    tooltip=['name','state','user_rating']
).interactive()

rateXpop = alt.Chart(route_df).mark_point().encode(
    x=x,
    y='rateXpop',
    tooltip=['name','state','user_rating']
).interactive()

alt.vconcat(rate,weighted) #| rateXpop

In [None]:
scenery_chart = alt.Chart(rated_roads).mark_circle().encode(
    alt.X('scenery_rating', bin=True),
    alt.Y('user_rating', bin=True),
    size='count()'
)

drive_enjoyment_chart = alt.Chart(rated_roads).mark_circle().encode(
    alt.X('drive_enjoyment_rating', bin=True),
    alt.Y('user_rating', bin=True),
    size='count()'
)

tourism_chart = alt.Chart(rated_roads).mark_circle().encode(
    alt.X('tourism_rating', bin=True),
    alt.Y('user_rating', bin=True),
    size='count()'
)

scenery_chart | drive_enjoyment_chart | tourism_chart

In [None]:
alt.Chart(us_route_data).mark_point().encode(
    x='distance2nps',
    y='user_rating',
    tooltip=['name','state','user_rating']
).interactive()

## Geospatial features

In [None]:
# Maps routes w/ parks

# US states background
states = alt.topo_feature(data.us_10m.url, feature='states')
background = alt.Chart(states).mark_geoshape(
    fill='lightgray',
    stroke='white'
).properties(
    width=1000,
    height=600
).project('albersUsa')

# MR routes
lines = alt.Chart(route_coords_gdf).mark_geoshape(
    filled=False,
    strokeWidth=1
).encode(color='user_rating')


# NPS sites
parks = alt.Chart(park_data).mark_geoshape(
        color='brown',
        filled=False,
        strokeWidth=1)


layered_map = background + lines + parks
layered_map.configure_view(strokeWidth=0)

# Analyses

## Numerical features & Bag of words

In [5]:
route_df = get_data('data/route_df.pkl')
route_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2063 entries, 0 to 2062
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gpx_file_num                 2063 non-null   int64  
 1   name                         2063 non-null   object 
 2   state                        2063 non-null   object 
 3   user_rating                  2063 non-null   float64
 4   num_user_reviews             2063 non-null   int64  
 5   num_users_rode               2063 non-null   int64  
 6   num_users_want2ride          2063 non-null   int64  
 7   scenery_rating               2063 non-null   int64  
 8   drive_enjoyment_rating       2063 non-null   int64  
 9   tourism_rating               2063 non-null   int64  
 10  state_rank                   2063 non-null   int64  
 11  num_state_routes             2063 non-null   int64  
 12  scenery_description          2063 non-null   object 
 13  drive_enjoyment_de

In [12]:
route_df['weighted_rating'] = route_df.user_rating * route_df.num_user_reviews
route_df['rateXpop'] = route_df.user_rating * (route_df.num_user_reviews + route_df.num_users_rode + route_df.num_users_want2ride)

In [25]:
# load/split data
X = route_df
y = route_df['user_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

# select features
numeric_features = ['sinuosity', 'route_length', 
                    'num_user_reviews', 'num_users_rode', 'num_users_want2ride', 
                    'loc_lat','loc_lon']
text_features = ['scenery_description','drive_enjoyment_description','tourism_description']

# preprocessing
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", MinMaxScaler())]
)
text_transformer = TfidfVectorizer(stop_words=STOP_WORDS.union({'10'}), 
                                   ngram_range=(1,2),
                                   max_features=500,
                                   min_df=.05  
                                  )

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    #('comments',text_transformer, 'comments'),
#     ('scenery', text_transformer, 'scenery_description'),
#     ('drive_enjoyment', text_transformer, 'drive_enjoyment_description'),
#     ('tourism', text_transformer, 'tourism_description'),
    ('state', OneHotEncoder(handle_unknown='ignore'), ['state'])
])


# pipeline

params = {
    "n_estimators": 80,
    "max_depth": 8,
    "min_samples_split": 10,
    "learning_rate": 0.05,
}

est = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RidgeCV())#GradientBoostingRegressor(**params))
])

# param_grid = {
#      "regressor__n_estimators": range(20,81,10),
#     "regressor__max_depth": np.linspace(1,10,10,dtype='int64'),
#     "regressor__min_samples_split": np.linspace(1,10,10,dtype='int64'),
#     "regressor__learning_rate": 0.01,
# }
# est = GridSearchCV(pipe,param_grid,cv=3, n_jobs=2, verbose=1)

est.fit(X_train,y_train)
y_pred = est.predict(X_train)
print("Mean absolute error:", metrics.mean_absolute_error(y_train, y_pred))
print("Mean squared error:", metrics.mean_squared_error(y_train, y_pred))
print("R^2 train:", metrics.r2_score(y_train, y_pred))

Mean absolute error: 1.1997400592926757
Mean squared error: 2.3225726916637135
R^2 train: 0.20999866894565


In [26]:
y_pred = est.predict(X_test)
print("Mean absolute error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean squared error:", metrics.mean_squared_error(y_test, y_pred))
print("R^2 test:", metrics.r2_score(y_test, y_pred))

Mean absolute error: 1.2166607720991822
Mean squared error: 2.423224016746279
R^2 test: 0.09947898353056561


In [155]:
est.

{'regressor__n_estimators': 80} 0.8787748641884322


## Sentiment Analysis

In [None]:
# split routes into hi and low ratings
sorted_ratings = us_route_data.sort_values(by=['user_rating','num_user_reviews'],ascending=False).reset_index()
hi,mid,low = np.split(sorted_ratings,[int(.3*len(sorted_ratings)), int(.6*len(sorted_ratings))])

# add rating labels
hi['rating_label'] = 'hi'
low['rating_label'] = 'low'

# merge labeled data
polar_data = pd.concat([hi,low],ignore_index=True)
polar_X = polar_data['comments']
polar_y = polar_data['rating_label']

# split data
X_train, X_test, y_train, y_test = train_test_split(polar_X, polar_y, test_size=0.2, random_state=17)


polar_pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=STOP_WORDS.union({'10'}), 
                                   ngram_range=(1,2),
                                   min_df=.1, 
                                   max_features = 500)),
    ('classifier', MultinomialNB())
])

polar_pipe.fit(X_train,y_train)

In [None]:
polar_pipe[0].get_feature_names_out()

In [None]:
vocab = polar_pipe.get_params()['vectorizer'].vocabulary_ 
                                                  
coeff_pos = polar_pipe.get_params()['classifier'].feature_log_prob_[0] 
coeff_neg = polar_pipe.get_params()['classifier'].feature_log_prob_[1]


from numpy import argsort

polarity = coeff_pos - coeff_neg
indices = argsort(polarity) # indices of the polarity list, sorted from least to greatest


print("Top Words \n-----")
for word in vocab:
    if vocab[word] in indices[-25:]:
        print(word)
        
# print("\nNegative Words \n-----")
# for word in vocab:
#     if vocab[word] in indices[:25]:
#         print(word) 

In [None]:
# topic modeling on comments to generate 'category' features for each route
# recommendation engine for routes

# Extra Code

In [12]:
# used to get gpx files from MotoRoads site
gpxs = pd.read_csv('data/route_data2.csv').gpx_file_num
for gpx in gpxs:
    gpx_file = f'gpx_files/{str(gpx)}.gpx'
    
    if file_exists(gpx_file):
        continue
    else:
        moto = requests.get('https://www.motorcycleroads.com/downloadgpx/' + str(gpx))
        out = moto.text
        name = f'gpx_files/{str(gpx)}.gpx'
        with open(name, 'w') as f:
            f.write(out)
        time.sleep(2)