## Trajectory Generation

This step formats the trajectory data into a table with the following columns:

`uid`, `date`, `traj_site`, `traj_arr`,

where `traj_site` is the sequence of site (location) codes that a user visits in one day, and `traj_arr` is the same trajectory expressed as area codes, with consecutive repeats of the same area collapsed into one.

In [15]:
from ast import literal_eval
from collections import defaultdict
from datetime import datetime
import os

import pandas as pd

In [16]:
# All paths below are resolved relative to this directory.
PROJECT_ROOT = '..'

# Inputs: the raw trajectory logs and the table used to look up the area of a site.
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')
TRAJECTORY_FILES = [
    os.path.join(DATA_DIR, 'SET2_P01.user3000.csv'),
]
AREA_FILE = os.path.join(DATA_DIR, 'SITE_ARR_LONLAT.CSV')

# Outputs: everything generated by this notebook is written under BUILD_PATH.
BUILD_PATH = os.path.join(PROJECT_ROOT, 'build')
TRAJ_TMP_FILE = os.path.join(BUILD_PATH, 'user3000.Jan.csv')    # cached trajectory table
TRAJ_SEL_FILE = os.path.join(BUILD_PATH, 'selected.Jan.csv')    # trajectories of the selected users
ARR_SEL_FILE = os.path.join(BUILD_PATH, 'selected.Jan.txt')     # id of the selected area

# When True the trajectory table is regenerated even if TRAJ_TMP_FILE already exists.
FORCE_REBUILD = True

In [17]:
def gen_traj(path, func_site2arr):
    """Generate the per-user, per-day trajectory dataframe from a raw data file.

    Parameters
    ----------
    path : str
        CSV file without a header row; its three columns are read as ``uid``,
        a ``%Y-%m-%d %H:%M:%S`` timestamp, and a site id.
    func_site2arr : callable
        Maps a single site id to the id of the area it belongs to.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(uid, date)`` with two columns of tuples:
        ``traj_site`` (site ids in file order, consecutive repeats removed) and
        ``traj_arr`` (the corresponding area ids, consecutive repeats removed).
    """
    df = pd.read_csv(path, names=['uid', 'date', 'traj_site'])
    # Reduce every timestamp to its calendar day so that all records of one
    # day fall into the same group. The column stays datetime64 (midnight),
    # which is also what read_traj() yields through ``parse_dates``.
    # ``pd.datetime`` and ``date_parser=`` are not used because they are
    # deprecated/removed in current pandas releases.
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S').dt.normalize()

    def drop_consecutive_repeats(values):
        # Compare against the last kept element instead of a magic sentinel,
        # so that no id value (e.g. -1) can be mistaken for "no previous item".
        kept = []
        for value in values:
            if not kept or kept[-1] != value:
                kept.append(value)
        # A tuple (not a list) so that groupby treats it as one aggregated value.
        return tuple(kept)

    # One row per user per day holding the visited sites.
    df = df.groupby(['uid', 'date']).aggregate(drop_consecutive_repeats)
    # Translate every site to its area; neighbouring sites may share an area,
    # so the repeats are collapsed once more after the translation.
    df['traj_arr'] = df['traj_site'].apply(
        lambda traj: drop_consecutive_repeats(map(func_site2arr, traj)))

    return df

def read_traj(path):
    """Load a trajectory dataframe previously saved as CSV.

    The first two CSV columns become the index and ``date`` is parsed as a
    date. The ``traj_site`` and ``traj_arr`` cells, stored as the text form
    of tuples, are turned back into Python objects with ``literal_eval``.
    """
    table = pd.read_csv(path, index_col=[0, 1], parse_dates=['date'])
    for column in ('traj_site', 'traj_arr'):
        table[column] = table[column].apply(literal_eval)
    return table

# Build the trajectory table from the raw data when a rebuild is forced or no
# cached copy exists; otherwise load the cached copy.
if FORCE_REBUILD or not os.path.exists(TRAJ_TMP_FILE):
    # The first column of AREA_FILE is the index; `arr_id` is looked up by
    # site id (presumably that first column holds site ids -- verify).
    df_arr = pd.read_csv(AREA_FILE, index_col=[0])
    # Only the first entry of TRAJECTORY_FILES is processed.
    DF_TRAJ = gen_traj(TRAJECTORY_FILES[0], lambda site_id: df_arr.arr_id[site_id])
    # The build directory may not exist yet (e.g. on a fresh checkout), and
    # to_csv does not create missing directories.
    os.makedirs(BUILD_PATH, exist_ok=True)
    DF_TRAJ.to_csv(TRAJ_TMP_FILE)
else:
    DF_TRAJ = read_traj(TRAJ_TMP_FILE)

DF_TRAJ

Unnamed: 0_level_0,Unnamed: 1_level_0,traj_site,traj_arr
uid,date,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2013-01-07,"(461, 454, 327, 323)","(9, 6, 5)"
1,2013-01-08,"(323,)","(5,)"
1,2013-01-09,"(323, 318, 323, 318, 323)","(5,)"
1,2013-01-10,"(323,)","(5,)"
1,2013-01-11,"(323, 330, 318, 323)","(5,)"
1,2013-01-12,"(323,)","(5,)"
1,2013-01-13,"(323,)","(5,)"
1,2013-01-14,"(314, 312, 314, 323)","(5,)"
1,2013-01-15,"(323, 314, 323)","(5,)"
1,2013-01-16,"(323, 314, 312, 314, 323)","(5,)"


## Area Frequency Analysis

In [18]:
# Make sure the build directory exists; ARR_SEL_FILE and TRAJ_SEL_FILE, which
# are written further down, both live inside it.
os.makedirs(BUILD_PATH, exist_ok=True)

In [19]:
# For every area id, collect the set of users that have it in at least one
# daily trajectory.
ARR2UIDS = defaultdict(set)
for index_key, record in DF_TRAJ.iterrows():
    # index_key is the row's index entry; its first element is the uid.
    for area in record['traj_arr']:
        ARR2UIDS[area].add(index_key[0])

# Area ids ordered by their number of distinct users, largest first.
ARR_SORTED = list(ARR2UIDS)
ARR_SORTED.sort(key=lambda area: len(ARR2UIDS[area]), reverse=True)

# Write the id of the top-ranked area to ARR_SEL_FILE.
with open(ARR_SEL_FILE, 'w') as out:
    out.write(str(ARR_SORTED[0]))

#for arr in ARR_SORTED[:3]:
#    print('%3s: %s' % (arr, str(ARR2UIDS[arr])))

## Select the Most Frequent Area

In [20]:
# Collect, one user at a time in ascending uid order, all trajectory rows of
# the users associated with the top-ranked area, then stack them.
uid_level = DF_TRAJ.index.get_level_values('uid')
frames = [
    DF_TRAJ.iloc[uid_level == uid, :]
    for uid in sorted(ARR2UIDS[ARR_SORTED[0]])
]
DF_MOST_FREQ = pd.concat(frames)

DF_MOST_FREQ

Unnamed: 0_level_0,Unnamed: 1_level_0,traj_site,traj_arr
uid,date,Unnamed: 2_level_1,Unnamed: 3_level_1
6,2013-01-09,"(1092, 781, 719, 709)","(70, 73)"
6,2013-01-10,"(362, 369, 256, 141, 174, 256, 174, 187)","(6, 7, 3)"
6,2013-01-11,"(205, 174, 187, 256)","(3,)"
6,2013-01-12,"(434, 419, 423, 419)","(9, 7)"
6,2013-01-13,"(434, 423, 419, 423, 419, 423, 419, 423)","(9, 7)"
6,2013-01-14,"(434, 179, 202, 179, 202, 179)","(9, 4)"
6,2013-01-15,"(179, 202, 228, 202, 179, 202, 219, 228, 202, ...","(4, 3, 6, 10, 19, 11, 20, 19)"
6,2013-01-16,"(1092,)","(70,)"
6,2013-01-17,"(1092,)","(70,)"
6,2013-01-18,"(1092, 1091)","(70,)"


In [21]:
# Save the trajectories of the selected users as CSV to TRAJ_SEL_FILE.
DF_MOST_FREQ.to_csv(TRAJ_SEL_FILE)