In [1]:
# import necessary libraries/packages
import pandas as pd
import numpy as np
import os
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

User guide notes about data format
(provided with dataset download):
-----------------------------------
"Line 1…6 are useless in this dataset, and can be ignored. Points are described in following lines, one for each line.

- Field 1: Latitude in decimal degrees.
- Field 2: Longitude in decimal degrees.
- Field 3: All set to 0 for this dataset.
- Field 4: Altitude in feet (-777 if not valid).
- Field 5: Date - number of days (with fractional part) that have passed since 12/30/1899.
- Field 6: Date as a string.
- Field 7: Time as a string.

Note that field 5 and field 6&7 represent the same date/time in this dataset. You may use either of them."

In [2]:
# function for loading specific invidual PLT file given file_path
def load_plt_file(file_path):

    # load file as df while skipping first six rows (unneeded header info)
    plt_df = pd.read_csv(file_path, skiprows = 6, header = None)

    # assign columns
    plt_df.columns = [
        'latitude',
        'longitude', 
        'zero_field',
        'altitude_ft',
        'date_days',
        'date_string',
        'time_string'
    ]

    return plt_df

In [3]:
# function for adding plt data to existing df
def add_plt_data(df, file_path):

    # use load_plt_file on given file and extract data
    plt_df = load_plt_file(file_path)

    # add plt data to existing dataframe
    res_df = pd.concat([df, plt_df], axis = 0)

    return res_df

Filtering through all the folders and keeping track of the ones that have a 'labels.txt' file.

In [4]:
# define base directory holding the geolife data
geolife_dir = "../Data/GeoLife_Data"

# get all user folders within geolife directory
user_folders = [f for f in os.listdir(geolife_dir)
                if os.path.isdir(os.path.join(geolife_dir, f))
                and f.isdigit()]

# filter through the folders and check for labels.txt
folders_with_labels = []
for folder in user_folders:
    labels_path = os.path.join(geolife_dir, folder, "labels.txt")
    if os.path.exists(labels_path):
        folders_with_labels.append(folder)

folders_with_labels

['010',
 '020',
 '021',
 '052',
 '053',
 '056',
 '058',
 '059',
 '060',
 '062',
 '064',
 '065',
 '067',
 '068',
 '069',
 '073',
 '075',
 '076',
 '078',
 '080',
 '081',
 '082',
 '084',
 '085',
 '086',
 '087',
 '088',
 '089',
 '091',
 '092',
 '096',
 '097',
 '098',
 '100',
 '101',
 '102',
 '104',
 '105',
 '106',
 '107',
 '108',
 '110',
 '111',
 '112',
 '114',
 '115',
 '116',
 '117',
 '118',
 '124',
 '125',
 '126',
 '128',
 '129',
 '136',
 '138',
 '139',
 '141',
 '144',
 '147',
 '153',
 '154',
 '161',
 '163',
 '167',
 '170',
 '174',
 '175',
 '179']

For reference, the below is the information included in the user guide that regards the mode transportation labels.
--------------------------
"73 users have labeled their trajectories with transportation mode, such as driving, taking a bus, riding a bike and walking. There
is a label file storing the transportation mode labels in each user’s folder. See section 4.2 for the format of labels.
The total distance and duration of transportation modes are listed in Figure 6. Though this only covers a part of the dataset used
in the following papers, the scale of this released dataset can still support transportation mode learning."

Figure 6 Total distance and duration of transportation modes:
| Transportation Mode | Distance (km) | Duration (hour) |
| :------------------ | ------------: | --------------: |
| Walk                |        10,123 |           5,460 |
| Bike                |         6,495 |           2,410 |
| Bus                 |        20,281 |           1,507 |
| Car & taxi          |        32,866 |           2,384 |
| Train               |        36,253 |             745 |
| Airplane            |        24,789 |              40 |
| Other               |         9,493 |             404 |
| **Total**           |    **140,304**|       **12,953** |

--------------------------

"Possible transportation modes are: walk, bike, bus, car, subway, train, airplane, boat, run and motorcycle. Again, we have
converted the date/time of all labels to GMT, even though most of them were created in China.

Example:
| Start Time          | End Time            | Transportation Mode |
| :------------------ | :------------------ | :------------------ |
| 2008/04/02 11:24:21 | 2008/04/02 11:50:45 | bus                 |
| 2008/04/03 01:07:03 | 2008/04/03 11:31:55 | train               |
| 2008/04/03 11:32:24 | 2008/04/03 11:46:14 | walk                |
| 2008/04/03 11:47:14 | 2008/04/03 11:55:07 | car                 |

First, you can regard the label of both taxi and car as driving although we set them with different labels for future usage. Second, a
user could label the transportation mode of a light rail as train while others may use subway as the label. Actually, no trajectory
can be recorded in an underground subway system since a GPS logger cannot receive any signal there. In Beijing, the light rails
and subway systems are seamlessly connected, e.g., line 13 (a light rail) is connected with line 10 and line 2, which are subway
systems. Sometimes, a line (like line 5) is comprised of partial subways and partial light rails. So, users may have a variety of
understanding in their transportation modes. You can differentiate the real train trajectories (connecting two cities) from the light
rail trajectory (generating in a city) according to their distances. Or, just treat them the same."

----------------------

Now that we have a list of the folders in which the mode transportation labels are present and included, we can iterate through these folders and use the labels to filter for the relevant plt files (the ones labeled 'walk'). 

In [10]:
walk_plt_df = pd.DataFrame(columns = ['user_id', 'file_name'])

# iterate through the folders with labels
for f_id in folders_with_labels:

    f_path = f"../Data/GeoLife_Data/{f_id}"

    # read labels.txt as df
    labels_df = pd.read_csv(f"{f_path}/labels.txt", sep = "\t")

    # iterate through each walk file
    walk_df = labels_df[labels_df['Transportation Mode'] == 'walk']
    for time in walk_df['Start Time']:

        # convert Start Time str to datetime object
        dt_object = datetime.strptime(time, '%Y/%m/%d %H:%M:%S')
        
        # reformat date and time to the format of the plt file names (YYYYMMDDHHMMSS.plt)
        file_name = f"{dt_object.strftime('%Y%m%d%H%M%S')}.plt"

        # construct full file path
        file_path = os.path.join(f_path, "Trajectory", file_name)

        # check if file exists
        if os.path.exists(file_path):
            row_df = pd.DataFrame({'user_id': [f_id], 'file_name': file_name})
            walk_plt_df = pd.concat([walk_plt_df, row_df], ignore_index = True)
            print(f"Found plt file for {file_path}")
        else:
            print(f"File not found: {file_path} - skipping")
            continue
        

File not found: ../Data/GeoLife_Data/010\Trajectory\20080401010022.plt - skipping
File not found: ../Data/GeoLife_Data/010\Trajectory\20080401113037.plt - skipping
Found plt file for ../Data/GeoLife_Data/010\Trajectory\20080402060926.plt
File not found: ../Data/GeoLife_Data/010\Trajectory\20080403113224.plt - skipping
File not found: ../Data/GeoLife_Data/010\Trajectory\20080403115606.plt - skipping
File not found: ../Data/GeoLife_Data/010\Trajectory\20080404042725.plt - skipping
File not found: ../Data/GeoLife_Data/010\Trajectory\20080406012548.plt - skipping
Found plt file for ../Data/GeoLife_Data/010\Trajectory\20080613211847.plt
File not found: ../Data/GeoLife_Data/010\Trajectory\20080613214207.plt - skipping
File not found: ../Data/GeoLife_Data/010\Trajectory\20080613214453.plt - skipping
File not found: ../Data/GeoLife_Data/010\Trajectory\20080616023134.plt - skipping
File not found: ../Data/GeoLife_Data/010\Trajectory\20080616023705.plt - skipping
File not found: ../Data/GeoLife_