# Explore GPS Data

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from datetime import date

today = date.today()
week_ago = today - dt.timedelta(days=7)
today = today.strftime("%d%m%Y")
week_ago = week_ago.strftime("%Y-%m-%d")

datapath = "/Users/leonahammelrath/FU_Psychoinformatik/Github/tiki_code/data/"
filepath = datapath + f"export_{today}.csv"

datapath1 = datapath + f"export_tiki_{today}/"
filepath_1 = datapath1 + "epoch_part0001.csv"
filepath_2 = datapath1 + "epoch_part0002.csv"
filepath_3 = datapath1 + "epoch_part0003.csv"

In [2]:
df_1 = pd.read_csv(filepath_1, encoding= "latin-1", low_memory=False)
df_2 = pd.read_csv(filepath_2, encoding= "latin-1", low_memory=False)
df_3 = pd.read_csv(filepath_3, encoding= "latin-1", low_memory=False)


In [3]:
df_location_1 = df_1[df_1.type.isin(["Latitude", "Longitude"])]
df_location_2 = df_2[df_2.type.isin(["Latitude", "Longitude"])]
df_location_3 = df_3[df_3.type.isin(["Latitude", "Longitude"])]

In [4]:
df_complete = pd.concat([df_location_1, df_location_2, df_location_3])

In [5]:
df_complete.head()

Unnamed: 0,customer,source,startTimestamp,endTimestamp,type,valueType,doubleValue,longValue,booleanValue,dateValue,stringValue,generation,trustworthiness,medicalGrade,userReliability,chronologicalExactness,timezoneOffset,createdAt
0,3ZqHxrSNghmWFFvs,CustomerApp,1687261609000,,Latitude,10,31.71553,,,,,,,,,,120.0,1687263407936
1,3ZqHxrSNghmWFFvs,CustomerApp,1687261609000,,Longitude,10,-71.224519,,,,,,,,,,120.0,1687263407936
2,3ZqHxrSNghmWFFvs,CustomerApp,1687261669000,,Latitude,10,31.71554,,,,,,,,,,120.0,1687263407937
3,3ZqHxrSNghmWFFvs,CustomerApp,1687261669000,,Longitude,10,-71.224509,,,,,,,,,,120.0,1687263407937
4,3ZqHxrSNghmWFFvs,CustomerApp,1687262177000,,Latitude,10,31.71556,,,,,,,,,,120.0,1687263407937


In [6]:
df_complete = df_complete[["customer", "startTimestamp", "type", "doubleValue", 
                           "timezoneOffset"]]

In [7]:
df_complete["customer"] = df_complete.customer.str.split("@").str.get(0)
df_complete["customer"] = df_complete["customer"].str[:4]

In [8]:
df_complete["timezoneOffset"] = df_complete["timezoneOffset"] * 60000

In [9]:
df_complete["timezone"] = df_complete["startTimestamp"] + df_complete["timezoneOffset"]

In [10]:
df_complete["timezone"] = (pd.to_datetime(df_complete["timezone"],unit='ms'))
df_complete["startTimestamp"] = (pd.to_datetime(df_complete["startTimestamp"],unit='ms'))

## Analyze GPS Data

In [11]:
df_int = df_complete.pivot(
    index=["customer", "startTimestamp"],
    columns="type",
    values="doubleValue")

In [12]:
df_int = df_int.rename_axis(None, axis=1).reset_index()

In [13]:
df_int.shape

(589584, 4)

In [14]:
df_int.head()

Unnamed: 0,customer,startTimestamp,Latitude,Longitude
0,0ePW,2023-07-03 11:44:07,4.065911,7.9798
1,0ePW,2023-07-03 11:44:09,4.065911,7.9798
2,0ePW,2023-07-03 12:27:31,4.068711,7.9797
3,0ePW,2023-07-03 12:27:39,4.068841,7.97965
4,0ePW,2023-07-03 12:34:52,4.072661,7.97652


In [15]:
df_int= df_int.sort_values(by=["customer", "startTimestamp"]).drop_duplicates(subset=["startTimestamp"], keep="last")

In [16]:
#df_int = df_int.sort_values(by="customer").drop_duplicates(subset=["startTimestamp"], keep="last")

In [17]:
df_int.shape

(567954, 4)

In [18]:
df_int.head()

Unnamed: 0,customer,startTimestamp,Latitude,Longitude
0,0ePW,2023-07-03 11:44:07,4.065911,7.9798
1,0ePW,2023-07-03 11:44:09,4.065911,7.9798
2,0ePW,2023-07-03 12:27:31,4.068711,7.9797
3,0ePW,2023-07-03 12:27:39,4.068841,7.97965
4,0ePW,2023-07-03 12:34:52,4.072661,7.97652


In [19]:
#df_int.reset_index(level="day", inplace=True)

In [20]:
df_int.dropna(subset = ['Latitude', 'Longitude', 'startTimestamp'], inplace=True)

In [21]:
df_int["day"] = df_int.startTimestamp.dt.strftime('%Y/%m/%d')
df_int["hours"] = df_int.startTimestamp.dt.hour


In [22]:
df_int["n_hours"] = df_int.groupby(["customer", "day"])["hours"].transform("nunique")

In [23]:
df_int["n_days"] = df_int.groupby("customer")["day"].transform("nunique")

In [24]:
df_int

Unnamed: 0,customer,startTimestamp,Latitude,Longitude,day,hours,n_hours,n_days
0,0ePW,2023-07-03 11:44:07,4.065911,7.979800,2023/07/03,11,2,77
1,0ePW,2023-07-03 11:44:09,4.065911,7.979800,2023/07/03,11,2,77
2,0ePW,2023-07-03 12:27:31,4.068711,7.979700,2023/07/03,12,2,77
3,0ePW,2023-07-03 12:27:39,4.068841,7.979650,2023/07/03,12,2,77
4,0ePW,2023-07-03 12:34:52,4.072661,7.976520,2023/07/03,12,2,77
...,...,...,...,...,...,...,...,...
589579,yv0Q,2023-09-24 22:17:52,65.982271,54.461462,2023/09/24,22,13,7
589580,yv0Q,2023-09-24 22:46:43,65.982211,54.461352,2023/09/24,22,13,7
589581,yv0Q,2023-09-24 23:20:55,65.982291,54.461522,2023/09/24,23,13,7
589582,yv0Q,2023-09-24 23:50:43,65.982271,54.461462,2023/09/24,23,13,7


In [25]:
df_valid = df_int.loc[(df_int.n_hours >= 15) & (df_int.n_days >= 3)]

In [26]:
df_valid

Unnamed: 0,customer,startTimestamp,Latitude,Longitude,day,hours,n_hours,n_days
53,0ePW,2023-07-05 00:03:39,4.075211,7.980170,2023/07/05,0,17,77
54,0ePW,2023-07-05 00:33:26,4.075221,7.980150,2023/07/05,0,17,77
55,0ePW,2023-07-05 01:04:23,4.075061,7.980070,2023/07/05,1,17,77
56,0ePW,2023-07-05 01:34:39,4.075211,7.980170,2023/07/05,1,17,77
57,0ePW,2023-07-05 02:04:42,4.075211,7.980170,2023/07/05,2,17,77
...,...,...,...,...,...,...,...,...
584910,vPEr,2023-08-22 18:48:40,-7.318533,-147.345473,2023/08/22,18,19,16
584911,vPEr,2023-08-22 18:49:15,-7.318503,-147.345543,2023/08/22,18,19,16
584912,vPEr,2023-08-22 18:50:44,-7.318493,-147.345453,2023/08/22,18,19,16
584913,vPEr,2023-08-22 18:52:26,-7.318283,-147.345423,2023/08/22,18,19,16


In [27]:
df_valid.sort_values(by=['customer', 'day'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [28]:
# Calculate 'day_index' by checking for changes in 'date.split' within each 'userID' group
df_valid['day_index'] = (df_valid['day'] != df_valid.groupby('customer')['day'].shift()).cumsum()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [29]:
df_valid.head()

Unnamed: 0,customer,startTimestamp,Latitude,Longitude,day,hours,n_hours,n_days,day_index
53,0ePW,2023-07-05 00:03:39,4.075211,7.98017,2023/07/05,0,17,77,1
54,0ePW,2023-07-05 00:33:26,4.075221,7.98015,2023/07/05,0,17,77,1
55,0ePW,2023-07-05 01:04:23,4.075061,7.98007,2023/07/05,1,17,77,1
56,0ePW,2023-07-05 01:34:39,4.075211,7.98017,2023/07/05,1,17,77,1
57,0ePW,2023-07-05 02:04:42,4.075211,7.98017,2023/07/05,2,17,77,1


In [30]:
df_valid['hourID'] = df_valid['customer'].astype(str) + df_valid['day_index']\
.astype(str) + df_valid['hours'].astype(str).str.zfill(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [31]:
df_valid.head()

Unnamed: 0,customer,startTimestamp,Latitude,Longitude,day,hours,n_hours,n_days,day_index,hourID
53,0ePW,2023-07-05 00:03:39,4.075211,7.98017,2023/07/05,0,17,77,1,0ePW1000
54,0ePW,2023-07-05 00:33:26,4.075221,7.98015,2023/07/05,0,17,77,1,0ePW1000
55,0ePW,2023-07-05 01:04:23,4.075061,7.98007,2023/07/05,1,17,77,1,0ePW1001
56,0ePW,2023-07-05 01:34:39,4.075211,7.98017,2023/07/05,1,17,77,1,0ePW1001
57,0ePW,2023-07-05 02:04:42,4.075211,7.98017,2023/07/05,2,17,77,1,0ePW1002


In [None]:
import geopandas as gpd
from sklearn.cluster import DBSCAN

# Assuming you have a GeoDataFrame 'geodata' with geometry and 'userID' column
# You should also have imported the necessary libraries

# Function for clustering
def db2(x):
    geodata = x.copy()
    cluster_20 = DBSCAN(eps=25, min_samples=180).fit(geodata.geometry.to_numpy().reshape(-1, 1))
    return pd.DataFrame({'cluster_20m': cluster_20.labels_})

# Apply function to cluster points
geodata_cluster_df = geodata.groupby('userID').apply(lambda x: db2(x)).reset_index(drop=True)

# Merge this with the main GeoDataFrame
geodata_clusters = gpd.GeoDataFrame(pd.concat([geodata, geodata_cluster_df], axis=1))
