In [1]:
import requests
import json
import csv
import os
import time
import sqlalchemy
import pandas
import pickle
import numpy
import numpy as np
from datetime import datetime, timedelta
import geopy
from geopy.distance import vincenty
# from sshtunnel import SSHTunnelForwarder #Run pip install sshtunnel
from sqlalchemy.orm import sessionmaker #Run pip install sqlalchemy
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Logging ver. 2016-07-12
# Configure a module logger that writes DEBUG-and-above records both to a
# rotating file ('log.log', ~1 MB per file, 3 backups) and to the console.
from logging import handlers
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger() hands back the same logger object on every call, so
# re-running this cell in a live kernel would stack a second (third, ...) pair
# of handlers on it and every message would be emitted multiple times.
# Detach and close whatever is already attached before adding fresh handlers.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

fh = logging.handlers.RotatingFileHandler('log.log', maxBytes=1000000, backupCount=3)  # file handler
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

ch = logging.StreamHandler()  # console handler
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)
logger.info('Initializing %s', __name__)

# Project-local settings module; the paths and EXPERIMENT_PARAMETERS used
# further down are read from it.
import settings as s
# Seed NumPy's global random generator with a fixed value.
np.random.seed(7)


# define_temporal_slices
def define_temporal_index(EXPERIMENT_PARAMETERS):
    """Enumerate the fixed-width time slices covering the experiment period.

    Args:
        EXPERIMENT_PARAMETERS: mapping that provides
            "TIMESTART" / "TIMEEND": strings in '%Y-%m-%d %H:%M:%S' format;
            "UNIT_TEMPORAL": slice width in minutes.

    Returns:
        A list with one entry per slice, in chronological order. Each entry is
        ``[temporal_index, iso_year, iso_week_number, iso_weekday, slice_start]``
        where ``temporal_index`` counts up from 0 and ``slice_start`` is a
        ``datetime``. Slices start at TIMESTART and stop before TIMEEND
        (TIMEEND itself is excluded). The list is empty when TIMESTART is not
        earlier than TIMEEND.

    Raises:
        ValueError: if the period is non-empty but UNIT_TEMPORAL is zero or
            negative (the cursor would never reach TIMEEND).
    """
    logger.info("Defining temporal indices")
    time_format = '%Y-%m-%d %H:%M:%S'
    timestart = datetime.strptime(EXPERIMENT_PARAMETERS["TIMESTART"], time_format)
    timeend = datetime.strptime(EXPERIMENT_PARAMETERS["TIMEEND"], time_format)
    unit_temporal = timedelta(minutes=EXPERIMENT_PARAMETERS["UNIT_TEMPORAL"])

    # A non-positive step would keep the cursor at or before TIMESTART forever
    # and grow the list without bound; fail loudly instead of hanging.
    if timestart < timeend and unit_temporal <= timedelta(0):
        raise ValueError(
            "UNIT_TEMPORAL must be a positive number of minutes, got %r"
            % (EXPERIMENT_PARAMETERS["UNIT_TEMPORAL"],))

    temporal_indices = []
    time_cursor = timestart
    while time_cursor < timeend:
        iso_year, iso_week_number, iso_weekday = time_cursor.isocalendar()
        # The running length of the list doubles as the 0-based slice index.
        temporal_indices.append(
            [len(temporal_indices), iso_year, iso_week_number, iso_weekday, time_cursor])
        time_cursor = time_cursor + unit_temporal
    return temporal_indices


def define_spatial_index(EXPERIMENT_PARAMETERS):
    """Work out the grid cell size in degrees and the grid dimensions.

    Reads EXPERIMENT_PARAMETERS["AOI"] as ``[x_min, y_min, x_max, y_max]`` and
    EXPERIMENT_PARAMETERS["UNIT_SPATIAL_METER"] as the desired cell edge in
    meters. Points are handed to geopy as ``(y, x)`` tuples, i.e. y is treated
    as latitude and x as longitude.

    Returns:
        ``[x_unit_degree, y_unit_degree, x_size, y_size]`` where the unit
        values are the cell edge converted to degrees (rounded to 4 decimals)
        and the sizes are the number of cells along each axis.
    """
    # NOTE(review): geopy.distance.vincenty was removed in geopy 2.0 in favour
    # of geopy.distance.geodesic - confirm the pinned geopy version.
    aoi = EXPERIMENT_PARAMETERS["AOI"]
    x_min, y_min, x_max, y_max = aoi[0], aoi[1], aoi[2], aoi[3]

    # Edge lengths are measured from the (y_min, x_min) corner.
    origin = (y_min, x_min)
    along_x = (y_min, x_max)
    along_y = (y_max, x_min)
    x_distance = geopy.distance.vincenty(origin, along_x).meters
    y_distance = geopy.distance.vincenty(origin, along_y).meters
    logger.debug("X distance: %s meters, Y distance: %s meters", x_distance, y_distance)

    # Scale the AOI extent in degrees by (cell edge / AOI edge) in meters.
    unit_meter = EXPERIMENT_PARAMETERS["UNIT_SPATIAL_METER"]
    x_span = x_max - x_min
    y_span = y_max - y_min
    x_unit_degree = round((x_span * unit_meter) / x_distance, 4)
    y_unit_degree = round((y_span * unit_meter) / y_distance, 4)
    logger.debug("X unit in degree: %s degrees, Y unit in degree: %s degrees", x_unit_degree, y_unit_degree)

    # Whole cells that fit in the span, plus one for the remainder.
    x_size = int(x_span // x_unit_degree) + 1
    y_size = int(y_span // y_unit_degree) + 1
    logger.info("X size: %s", x_size)
    logger.info("Y size: %s", y_size)
    logger.info("Size of spatial index: %s", x_size * y_size)

    return [x_unit_degree, y_unit_degree, x_size, y_size]


def load_csv_files_to_dataframe(DATA_DIR, EXPERIMENT_PARAMETERS):
    """Load every CSV under DATA_DIR (recursively) into one DataFrame.

    Directories and files are visited in sorted order; files whose name starts
    with '.' are skipped. Each remaining file is read with
    ``load_csv_to_dataframe`` and the per-file frames are concatenated once at
    the end, keeping each file's own row index.

    Args:
        DATA_DIR: root directory to walk.
        EXPERIMENT_PARAMETERS: kept for interface compatibility; it is not
            read here (``load_csv_to_dataframe`` is called with the path only).

    Returns:
        The concatenated DataFrame, or an empty frame with columns
        uid, time_start, time_end, x, y, mode when no file was loaded.
    """
    frames = []
    num_user_all = 0

    for root, dirs, files in os.walk(DATA_DIR):
        # Sorting dirs in place makes os.walk descend in a deterministic order.
        dirs.sort()
        files.sort()
        for fn in files:
            if fn.startswith('.'):
                continue
            # os.path.join inserts the separator that plain concatenation
            # drops for sub-directories (and for a DATA_DIR without a
            # trailing slash).
            csv_file = os.path.join(root, fn)
            df, num_user = load_csv_to_dataframe(csv_file)
            frames.append(df)
            num_user_all += num_user
            logger.info("Current number of user: %s", num_user_all)

    if not frames:
        return pandas.DataFrame(numpy.empty(0, dtype=[('uid', 'str'), ('time_start', 'str'), ('time_end', 'str'), ('x', 'float64'), ('y', 'float64'), ('mode', 'str')]))
    # One concat instead of DataFrame.append per file: append copies the
    # accumulated frame every time and no longer exists in pandas 2.x.
    return pandas.concat(frames)


def load_csv_to_dataframe(CSV_FILE, time_start=None, time_end=None):
    """Read one header-less trajectory CSV and keep rows overlapping a time window.

    The file is read with the column names uid, time_start, time_end, x, y,
    mode; time_start and time_end are parsed as dates. Malformed lines are
    skipped with a warning. A row is kept when its [time_start, time_end]
    interval overlaps the requested window, i.e. it ends at or after the
    window start and begins at or before the window end.

    Args:
        CSV_FILE: path (or buffer) passed to ``pandas.read_csv``.
        time_start: window start; defaults to the module-level
            ``EXPERIMENT_PARAMETERS["TIMESTART"]`` (the original behaviour).
        time_end: window end; defaults to the module-level
            ``EXPERIMENT_PARAMETERS["TIMEEND"]`` (the original behaviour).

    Returns:
        ``(df_filtered, num_user)``: the filtered DataFrame and the number of
        distinct uid values in it.
    """
    import inspect

    logger.info("Loading CSV %s to dataframe", CSV_FILE)

    # Explicit arguments remove the hidden dependency on a global that is only
    # defined later in the notebook; omitting them keeps the old lookup.
    if time_start is None:
        time_start = EXPERIMENT_PARAMETERS["TIMESTART"]
    if time_end is None:
        time_end = EXPERIMENT_PARAMETERS["TIMEEND"]

    headers = ['uid', 'time_start', 'time_end', 'x', "y", "mode"]
    dtype = {'uid': 'str', 'time_start': 'str', "time_end": "str", "x": "float64", 'y': 'float64', 'mode': 'str'}
    parse_dates = ['time_start', 'time_end']

    # error_bad_lines / warn_bad_lines were replaced by on_bad_lines (pandas
    # 1.3) and removed in pandas 2.0; pick whichever this pandas accepts.
    if "on_bad_lines" in inspect.signature(pandas.read_csv).parameters:
        bad_line_options = {"on_bad_lines": "warn"}
    else:
        bad_line_options = {"error_bad_lines": False, "warn_bad_lines": True}

    df_csv = pandas.read_csv(filepath_or_buffer=CSV_FILE, header=None, names=headers,
                             dtype=dtype, parse_dates=parse_dates, **bad_line_options)

    overlaps_window = (df_csv['time_end'] >= time_start) & (df_csv['time_start'] <= time_end)
    df_filtered = df_csv[overlaps_window]
    num_user = len(df_filtered.groupby('uid'))
    return df_filtered, num_user


# Module-level aliases for the values defined in the settings module.
DATA_DIR = s.DATA_DIR
DATA_DIR_PROCESSED = s.DATA_DIR_PROCESSED
TEST_CSV_FILTERED = s.TEST_CSV_FILTERED
GPS_FILTERED = s.GPS_FILTERED
# Also read as a global by load_csv_to_dataframe (TIMESTART / TIMEEND).
EXPERIMENT_PARAMETERS = s.EXPERIMENT_PARAMETERS
X_COORDINATE_FILE = s.X_COORDINATE_FILE
Y_COORDINATE_FILE = s.Y_COORDINATE_FILE
X_GRID_FILE = s.X_GRID_FILE
Y_GRID_FILE = s.Y_GRID_FILE

# List of [index, iso_year, iso_week, iso_weekday, slice_start] entries.
temporal_index = define_temporal_index(EXPERIMENT_PARAMETERS)
# print(temporal_index)
# [x_unit_degree, y_unit_degree, x_size, y_size]
spatial_index = define_spatial_index(EXPERIMENT_PARAMETERS)
# print(spatial_index)

# Every CSV found under DATA_DIR, time-filtered and combined into one frame.
df_all_users = load_csv_files_to_dataframe(DATA_DIR, EXPERIMENT_PARAMETERS)

2017-07-29 22:59:56,435 - __main__ - INFO - Initializing __main__
2017-07-29 22:59:56,455 - settings - INFO - Initializing settings
2017-07-29 22:59:56,456 - settings - INFO - EXPERIMENT_NAME: Experiment_20120725_20120725_5mins_1000users_2h_2layers
2017-07-29 22:59:56,457 - settings - INFO - EXPERIMENT_DIR: /Users/koitaroh/Documents/Data/Experiments/Experiment_20120725_20120725_5mins_1000users_2h_2layers/
2017-07-29 22:59:56,458 - settings - INFO - EXPERIMENT PARAMETERS: {'EXPERIMENT_NAME': 'Experiment_20120725_20120725_5mins_1000users_2h_2layers', 'TIMESTART': '2012-07-25 00:00:00', 'TIMEEND': '2012-07-25 23:59:59', 'AOI': [138.72, 34.9, 140.87, 36.28], 'UNIT_TEMPORAL': 5, 'UNIT_SPATIAL_METER': 1000, 'MOVILITY_CHECK_WINDOW': 30, 'PREDICTION_INPUT_LENGTH': 12, 'RECALL_LENGTH': 12, 'PREDICTION_OUTPUT_LENGTH': 12, 'SAMPLE_USER_SIZE': 50, 'SAMPLE_SIZE': 100, 'VISUALIZATION_SAMPLE_SIZE': 10}
2017-07-29 22:59:56,461 - __main__ - INFO - Defining temporal indices
2017-07-29 22:59:56,465 - __m

In [7]:
logger.info("Applying slice to users")

# Column layout of the regularized per-user records built by the loop cell.
# The second column holds the integer temporal slice index (t_index[0]), so it
# is named 't_index' to match the empty seed frame below. The previous name
# 'timestamp' did not match the seed frame and produced an extra, all-NaN
# 't_index' column once per-user frames were appended.
# NOTE(review): if any later cell reads a 'timestamp' column, update it too.
headers = ['uid', 't_index', 'x', 'y', "mode", 's_index']

# Empty seed frame the per-user frames are appended to.
df_users_regularized = pandas.DataFrame(numpy.empty(0, dtype=[('uid', 'int32'), ('t_index', 'int32'), ('x', 'float64'), ('y', 'float64'), ('mode', 'str'), ('s_index', 'int32')]))

grouped = df_all_users.groupby('uid')
sample_user_size = EXPERIMENT_PARAMETERS['SAMPLE_USER_SIZE']
# Process at most SAMPLE_USER_SIZE users; fewer if the data has fewer.
num_user = min(len(grouped), sample_user_size)



2017-07-30 13:14:11,667 - __main__ - INFO - Applying slice to users


In [3]:
df_all_users.dtypes

uid                   object
time_start    datetime64[ns]
time_end      datetime64[ns]
x                    float64
y                    float64
mode                  object
dtype: object

In [5]:
len(target_row)

NameError: name 'target_row' is not defined

In [41]:
temporal_index

[[0, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 0)],
 [1, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 5)],
 [2, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 10)],
 [3, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 15)],
 [4, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 20)],
 [5, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 25)],
 [6, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 30)],
 [7, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 35)],
 [8, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 40)],
 [9, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 45)],
 [10, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 50)],
 [11, 2012, 30, 3, datetime.datetime(2012, 7, 25, 0, 55)],
 [12, 2012, 30, 3, datetime.datetime(2012, 7, 25, 1, 0)],
 [13, 2012, 30, 3, datetime.datetime(2012, 7, 25, 1, 5)],
 [14, 2012, 30, 3, datetime.datetime(2012, 7, 25, 1, 10)],
 [15, 2012, 30, 3, datetime.datetime(2012, 7, 25, 1, 15)],
 [16, 2012, 30, 3, datetime.datetime(2012, 7, 25, 1, 20)],
 [17, 2012,

In [9]:
def convert_raw_coordinates_to_spatial_index(EXPERIMENT_PARAMETERS, spatial_index, x, y):
    """Map a raw (x, y) coordinate to the flat index of its grid cell.

    Args:
        EXPERIMENT_PARAMETERS: mapping whose "AOI" entry starts with
            ``[x_min, y_min, ...]``; the grid origin.
        spatial_index: ``[x_unit_degree, y_unit_degree, x_size, y_size]`` as
            returned by ``define_spatial_index``.
        x, y: coordinate in the same units as the AOI.

    Returns:
        ``x_index + x_size * y_index`` (row-major, x varying fastest), where
        the per-axis indices are the floor of the offset from the origin
        divided by the cell size. No bounds check is performed, so a point
        outside the AOI yields an index outside ``[0, x_size * y_size)``.
    """
    x_index = int((x - EXPERIMENT_PARAMETERS["AOI"][0]) // spatial_index[0])
    y_index = int((y - EXPERIMENT_PARAMETERS["AOI"][1]) // spatial_index[1])
    # The row stride must be the number of cells per row (x_size). Using
    # y_size here made distinct cells share an index whenever x_size > y_size.
    x_size = spatial_index[2]
    spatial_index_number = x_index + (x_size * y_index)
    return spatial_index_number

In [20]:
# For each user (up to the remaining num_user budget) and each temporal slice,
# find the record whose [time_start, time_end] interval covers the slice start
# and emit one regularized row: uid, slice index, x, y, mode, spatial index.
user_frames = []
for name, group in grouped:
    traj = []
    if num_user == 0:
        break
    for t_index in temporal_index:
        target_time = t_index[4]
        target_row = group.loc[(group['time_start'] <= target_time) & (group['time_end'] >= target_time)]
        if len(target_row) == 0:
            # No record covers this slice for this user; leave a gap.
            continue
        # target_row is a DataFrame, so `.at['x']` is not a valid scalar
        # lookup (DataFrame.at needs a row and a column label). Read the
        # values from the first matching record instead.
        first_match = target_row.iloc[0]
        x = first_match['x']
        y = first_match['y']
        mode = first_match['mode']
        s_index = convert_raw_coordinates_to_spatial_index(EXPERIMENT_PARAMETERS, spatial_index, x, y)
        traj.append([name, t_index[0], x, y, mode, s_index])
    df_user_regularized = pandas.DataFrame.from_records(traj, columns=headers)
    user_frames.append(df_user_regularized)
    num_user -= 1

# Concatenate once at the end: DataFrame.append copied the accumulated frame
# on every user and was removed in pandas 2.0.
if user_frames:
    df_users_regularized = pandas.concat([df_users_regularized] + user_frames)
# print(df_users_regularized)

array([[  35.83439243,  138.97597443],
       [  35.82211421,  139.0025391 ]])

array([[-0.01227822,  0.02656467]])

In [20]:
# Average `diffs` along axis 0 and display the result.
# NOTE(review): `diffs` is not defined in any visible cell - confirm where it comes from.
means = numpy.mean(diffs, axis=0)
means

array([-0.01227822,  0.02656467])

In [26]:
# From the first X_test sample, take the final step along axis 1 and its last
# two entries along axis 2.
# NOTE(review): the displayed values look like a (lat, lon) pair - confirm the column order.
last = X_test[0, -1, -2:]
last

array([  35.82211421,  139.0025391 ])

In [35]:
numpy.add(last_diff, last)

array([[  35.80983599,  139.02910377]])

In [29]:
y_test[0]

array([  35.80983599,  139.02910377])

In [40]:
# For every sample, keep the final two steps along axis 1 and the last two
# entries along axis 2 of X_test.
x_two_step = X_test[:, -2:, -2:]
x_two_step

array([[[  35.83439243,  138.97597443],
        [  35.82211421,  139.0025391 ]],

       [[  35.64696784,  139.53012587],
        [  35.64826051,  139.53204177]],

       [[  35.56753905,  139.7293991 ],
        [  35.5746431 ,  139.73400672]],

       ..., 
       [[  35.58541198,  139.7480611 ],
        [  35.58525307,  139.74703865]],

       [[  35.62086167,  139.71916685],
        [  35.62211741,  139.7202787 ]],

       [[  35.74634982,  139.72499071],
        [  35.74519885,  139.72670361]]])

In [42]:
# Per-sample change between the two retained steps (later minus earlier)
# along axis 1; that axis has length 1 in the result.
last_diff = numpy.diff(x_two_step, axis=1)
last_diff

array([[[-0.01227822,  0.02656467]],

       [[ 0.00129267,  0.0019159 ]],

       [[ 0.00710406,  0.00460762]],

       ..., 
       [[-0.00015891, -0.00102245]],

       [[ 0.00125574,  0.00111185]],

       [[-0.00115097,  0.00171289]]])

In [48]:
# Baseline prediction: take each sample's final step (axis 1 kept as length 1
# by the -1: slice) and add the most recent step-to-step change to it.
last = X_test[:, -1:, -2:]
# last
prediction = numpy.add(last_diff, last)
prediction.shape

(1205, 1, 2)

In [51]:
# Drop the length-1 middle axis so `prediction` is (n_samples, n_features).
# Indexing the last axis with [-1] instead of [2] keeps this cell re-runnable:
# once `prediction` is already 2-D, [2] raised IndexError, whereas [-1] makes
# the reshape a no-op.
prediction_shape = prediction.shape
prediction = prediction.reshape(prediction_shape[0], prediction_shape[-1])
prediction

array([[  35.80983599,  139.02910377],
       [  35.64955318,  139.53395767],
       [  35.58174716,  139.73861433],
       ..., 
       [  35.58509415,  139.74601619],
       [  35.62337314,  139.72139055],
       [  35.74404788,  139.7284165 ]])

In [47]:
y_test

array([[  35.80983599,  139.02910377],
       [  35.64954332,  139.53396828],
       [  35.58246042,  139.73640138],
       ..., 
       [  35.58496666,  139.74602346],
       [  35.62358498,  139.72103667],
       [  35.74384336,  139.72816329]])