In [6]:
import os
import datetime

import pandas as pd
import numpy as np

import gpxpy
import srtm

from geopy import distance
from scipy.signal import find_peaks

from src.utils import smooth_fn

In [7]:
def process_gpx(file):
    df = pd.DataFrame(columns=['distance', 'elevation', 'duration', ])
    
    gpx_data = gpxpy.parse(file)

    try:
        elevation_data = srtm.get_data()
        elevation_data.add_elevations(gpx_data, smooth=True)
    except: return None
    
    gpx_points = gpx_data.tracks[0].segments[0].points

    # loop through the points and append their attributes to the dataframe
    for idx in range(1, len(gpx_points)): # index will count from 1 to lenght of dataframe, beginning with the second row
        point = gpx_points[idx]
        
        if point.elevation is None: continue
        
        start = gpx_points[idx-1]
        end = gpx_points[idx]
        dist = distance.distance((start.latitude, start.longitude), (end.latitude, end.longitude)).m
        duration = (end.time - start.time).total_seconds()
        elevation = end.elevation - start.elevation

        df = pd.concat([df, pd.DataFrame({'distance' : dist, 'elevation' : elevation, 'duration':  duration}, index=[0])], ignore_index=True)
    
    df['cumulative_distance'] = df.distance.cumsum()
    df['cumulative_duration'] = df.duration.cumsum()

    window = 50

    smooth = smooth_fn(df.elevation.values, window_len=window)

    #find the maximums
    peaks_idx_max, _ = find_peaks(smooth, prominence = 0.1)

    #reciprocal, so mins will become max
    smooth_rec = 1 / smooth

    #find the mins now
    peaks_idx_mins, _ = find_peaks(smooth_rec, prominence = 0.0001)

    peaks = *peaks_idx_max, *peaks_idx_mins

    if len(peaks) == 0:
        return None
    
    peaks = sorted(np.concatenate([[0, len(df) - 1], peaks]))

    df['elevation'] = smooth
    df = df.iloc[peaks]
    df = df.diff().iloc[1:]
    df['distance'] = df['cumulative_distance']
    df['duration'] = df['cumulative_duration']
    df['cumulative_distance'] = df.distance.cumsum()
    df['cumulative_duration'] = df.duration.cumsum()
    df['cumulative_positive_elevation'] = df[df['elevation'] > 0]['elevation'].cumsum()
    df['cumulative_negative_elevation'] = df[df['elevation'] < 0]['elevation'].cumsum()
    df['grade'] = df['elevation'] / df['distance'] * 100

    df = df.fillna(method='ffill')
    df = df.fillna(0)

    return df

In [8]:
cwd = os.path.dirname(os.getcwd())
df = pd.DataFrame()
cpt = 0
for filename in os.listdir(f"{cwd}/data/raw/"):
    try:
        file = open(f"{cwd}/data/raw/{filename}/{filename}.gpx", 'r')
    except NotADirectoryError:
        continue
    tmp = process_gpx(file)
    df = pd.concat([df, tmp])

  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec = 1 / smooth
  smooth_rec

In [9]:
df

Unnamed: 0,distance,elevation,duration,cumulative_distance,cumulative_duration,cumulative_positive_elevation,cumulative_negative_elevation,grade
35,185.616937,0.127536,35.0,185.616937,35.0,0.127536,0.000000,0.068709
51,82.741191,-0.188660,16.0,268.358128,51.0,0.127536,-0.188660,-0.228012
81,100.946403,-0.166039,30.0,369.304531,81.0,0.127536,-0.354699,-0.164483
107,100.623084,0.159417,26.0,469.927615,107.0,0.286953,-0.354699,0.158430
129,113.985754,0.177879,22.0,583.913370,129.0,0.464832,-0.354699,0.156053
...,...,...,...,...,...,...,...,...
5235,154.106554,-0.133689,35.0,23048.990681,6274.0,7.111135,-7.284693,-0.086751
5264,148.313107,0.128953,29.0,23197.303787,6303.0,7.240088,-7.284693,0.086946
5293,124.108496,-0.001054,29.0,23321.412283,6332.0,7.240088,-7.285747,-0.000849
5305,53.683291,-0.023458,12.0,23375.095574,6344.0,7.240088,-7.309205,-0.043698


In [11]:
from xgboost import XGBRegressor
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold

import matplotlib.pyplot as plt

x, y = df.drop(columns='duration'), df.duration
x.columns = range(len(x.columns))
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.15, random_state=42)
xgbc = XGBRegressor(n_estimators=250, learning_rate=0.01, max_depth=3, random_state=42)

xgbc.fit(xtrain, ytrain)

# - cross validataion
scores = cross_val_score(xgbc, xtrain, ytrain, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())

kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xgbc, xtrain, ytrain, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

threshold = 0.65
# ypred = xgbc.predict(xtest)
ypred = (xgbc.predict_proba(xtest)[:, 1] > threshold).astype('float')
cm = confusion_matrix(ytest, ypred, normalize='all')
print(cm)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 9))
# plot_confusion_matrix(xgbc, x, y, ax=ax1, normalize='all') 
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax1)

ax2.bar(df.drop(columns='label').columns, xgbc.feature_importances_)




Mean cross-validation score: 0.29
K-fold CV average score: 0.29


AttributeError: 'XGBRegressor' object has no attribute 'predict_proba'

In [12]:
xgbc.predict(xtest)[:, 1]

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [13]:
xtest

Unnamed: 0,0,1,2,3,4,5,6
2858,140.776693,-0.001067,9791.536072,3272.0,3.158113,-3.241887,-0.000758
1664,165.022231,0.044996,5537.238392,1741.0,1.082906,-1.228949,0.027266
1436,145.539399,0.138603,4887.260574,2269.0,1.474549,-1.455158,0.095234
3670,122.468667,0.225614,13123.317788,3680.0,6.479509,-6.198895,0.184222
1889,67.289133,0.108689,6790.261840,1917.0,1.901620,-1.785744,0.161525
...,...,...,...,...,...,...,...
327,30.979446,-0.022095,1377.971728,327.0,1.092879,-0.935214,-0.071321
3057,67.373919,-0.060007,10048.440897,3185.0,3.535090,-3.660946,-0.089066
1682,104.694837,0.015732,9539.624733,6113.0,16.355740,-16.593633,0.015026
1795,142.739625,-0.109387,5689.009304,1867.0,1.374896,-1.498967,-0.076634
