In [5]:
import pandas as pd
import numpy as np
import scipy
from geopy.distance import geodesic
from stridze.db import engine

conn = engine.connect()
df = pd.read_sql_table('strava', conn)

In [6]:
print(len(df))

6023053


In [49]:
df.groupby('user_id').activity_id.nunique()

user_id
42265442    1436
73182431     307
Name: activity_id, dtype: int64

In [50]:
def calculate_distance(points):
    distance = 0.0
    previous_point = None

    for point in points:
        if previous_point is not None:
            distance += geodesic((previous_point[0], previous_point[1]),
                                    (point[0], point[1])).meters
        previous_point = point

    return distance

In [51]:
df.timestamp = pd.to_datetime(df.timestamp)
df['date'] = df.timestamp.dt.date

In [52]:
df = df.drop(columns=['id'])
df = df.drop_duplicates()

In [53]:
matching_activities = pd.DataFrame()

for date, elt in df.groupby('date'):
    nb_users = elt.user_id.nunique()

    if nb_users == 2:
        user_1_activities = elt[elt.user_id == elt.user_id.unique()[0]].activity_id.unique()
        user_2_activities = elt[elt.user_id == elt.user_id.unique()[1]].activity_id.unique()

        activities = [(x, y) for y in user_1_activities for x in user_2_activities]

        # activities = elt.activity_id.unique()

        # print(f"Found {len(activities)} activities for {date}")
        for activity in activities:
            new_activity = pd.DataFrame({
                'date': [date],
                'activity_id_1': [activity[0]],
                'activity_id_2': [activity[1]],
                'type': [elt.loc[elt.activity_id == activity[0], 'activity_type'].unique()[0]],
            })
            # print(f"New activity : {new_activity}")
            matching_activities = pd.concat([matching_activities, new_activity], ignore_index=True)
matching_activities = matching_activities.drop(columns=['date']).drop_duplicates()


In [54]:
result = pd.DataFrame()

for idx, row in matching_activities.iterrows():
    activity_1 = df.loc[df.activity_id == row.activity_id_1]
    activity_2 = df.loc[df.activity_id == row.activity_id_2]

    track1 = activity_1[['latitude', 'longitude']].values
    track2 = activity_2[['latitude', 'longitude']].values

    activity = pd.merge(activity_1, activity_2, on='timestamp', how='inner')
    if activity.empty:
        # print("No matching coordinates found")
        continue
    activity['distance'] = activity.apply(lambda row: geodesic((row.latitude_x, row.longitude_x), (row.latitude_y, row.longitude_y)).meters, axis=1)
    if activity.distance.median() < 500:
        new_activity = pd.DataFrame({
            'activity_id_1': [row.activity_id_1],
            'activity_id_2': [row.activity_id_2],
            'type': [row.type],
            'distance': [calculate_distance(activity[['latitude_x', 'longitude_x']].values) / 1000.],
        })
        result = pd.concat([result, new_activity], ignore_index=True)
    else:
        print(f"Median distance between {row.activity_id_1} and {row.activity_id_2} : {activity.distance.median()} meters")
    # for idx in range(gpx_len):
    #     distance = geodesic(track1[idx], track2[idx]).meters
    #     time_difference = abs((activity_1.timestamp.iloc[idx] - activity_2.timestamp.iloc[idx]).total_seconds())
    #     if (distance <= radius and time_difference <= delay):
    #         matching_coordinates.append(track1[idx])
    #     else: print(f"No match found for this point {distance} meters away, {time_difference} seconds away")
    # distance = calculate_distance(matching_coordinates)
    # print(f"Distance between {row.activity_id_1} and {row.activity_id_2} : {distance} meters")

Median distance between 4670838236 and 4670723147 : 143656.7594719834 meters
Median distance between 5157593429 and 5157585529 : 1962.3235129513291 meters
Median distance between 5340009315 and 5341315641 : 181316.91943952692 meters
Median distance between 5532926182 and 5524221230 : 1179.6821739352606 meters
Median distance between 5539489686 and 5524221230 : 1472.7917829824466 meters
Median distance between 5916329836 and 5917506587 : 496319.2901455315 meters
Median distance between 6094935414 and 6097618767 : 396668.37787461985 meters
Median distance between 6239447479 and 6239460793 : 16198.897453413125 meters
Median distance between 6582032274 and 6606037911 : 3838.754223754637 meters
Median distance between 6714347368 and 6713965757 : 82627.79012016782 meters
Median distance between 6714345574 and 6713965757 : 118569.1415803875 meters
Median distance between 6743337143 and 6743311070 : 740111.1624274104 meters
Median distance between 6903934431 and 6904003689 : 13168.632148962852

In [65]:
m = result.groupby('type').agg(['sum','count'])['distance']
# m = result.groupby('type').sum()[['distance']]
m = m.sort_values('sum', ascending=False)
m.loc['total']=m.sum()
m = m.rename(columns={'sum': 'Distance (km)', 'count': 'Activities (#)'})
m = m.astype(int)
m

Unnamed: 0_level_0,Distance (km),Activities (#)
type,Unnamed: 1_level_1,Unnamed: 2_level_1
Run,598,41
Ride,239,4
TrailRun,215,6
Hike,151,11
NordicSki,115,5
Canoeing,52,1
AlpineSki,23,1
Walk,20,2
total,1416,71


In [66]:
result.loc[result.type =='Walk']

Unnamed: 0,activity_id_1,activity_id_2,type,distance
1,4269030781,4421264323,Walk,7.580504
5,4434970701,4435056210,Walk,12.579076


In [33]:
import stravalib

In [193]:
client = stravalib.Client('2422d447c0c50c458349ba4280c26208efbdbf3e')

In [202]:
client.refresh_access_token('66078', '8f136bd4e4b40ce3591293ec8c282114c34e5219', '474ef6ea5b89d58c88de5303d21d69599748c94d')

{'access_token': '2422d447c0c50c458349ba4280c26208efbdbf3e',
 'refresh_token': '474ef6ea5b89d58c88de5303d21d69599748c94d',
 'expires_at': 1696882110}