In [8]:
from bjPOI import read_csv
from datetime import datetime
from datetime import date as dt
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import math
import os

MATCHING_DISTANCE = 0.1
MATCHING_TIME_FRAME = 3

def matchTime(t1, t2):
	return abs(datetime.fromtimestamp(t1).hour - datetime.fromtimestamp(t2).hour)<= MATCHING_TIME_FRAME # time difference no greater than 3 hours
	
def match(gps1, gps2):
    lat1, lon1, t1 = gps1
    lat2, lon2, t2 = gps2
    coord1 = (lat1,lon1)
    coord2 = (lat2,lon2)

    r = 6371
    dlat = (lat2-lat1) * (math.pi/180)
    dlon = (lon2-lon1) * (math.pi/180)

    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(lat1 * (math.pi/180)) * math.cos(lat2 * (math.pi/180)) * math.sin(dlon/2) * math.sin(dlon/2)

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = r * c

    return d <= MATCHING_DISTANCE and matchTime(t1, t2)

def trajectory_by_day(fname):
	trajectories = {}
	for i in range(1,8):
		trajectories[i] = []
	trajectory = read_csv(fname)
	print("Read CSV file")
	start = False
	last_date = dt.min
	last_day = 0
	for data_tup in trajectory:
		day = datetime.fromtimestamp(data_tup[2]).isoweekday()
		if day == last_day:
			date = dt.fromtimestamp(data_tup[2])
			if date == last_date:
				trajectories[day][-1].append(data_tup)
			else:
				one_trajectory = [data_tup]
				trajectories[day].append(one_trajectory)
				last_date = date
		else:
			date = dt.fromtimestamp(data_tup[2])
			one_trajectory = [data_tup]
			trajectories[day].append(one_trajectory)
			last_date = date
			last_day = day

	return trajectories

In [2]:
# match +5, mismatch -3, gap -2
def EDR(t1, t2, mt, ms, gap):
	m, n = len(t1), len(t2)
	matcher = [[0 for x in range(n+1)] for y in range(m+1)]
	backtrack = [[(-1,-1) for x in range(n+1)] for y in range(m+1)]
	maxscore = 0
	endposition = (0, 0)
	for i in range(1, m+1):
		for j in range(1, n+1):
			gapAbove = matcher[i-1][j]+gap
			gapLeft = matcher[i][j-1]+gap
			matching = matcher[i-1][j-1] + (mt if match(t1[i-1],t2[j-1]) else ms)
			score = max(0, max(gapAbove, gapLeft, matching)) # if negative, convert to 0
			matcher[i][j] = score

			# set backtrack table
			if score == matching:
				backtrack[i][j] = (i-1, j-1)
			elif score == gapAbove:
				backtrack[i][j] = (i-1, j)
			elif score == gapLeft:
				backtrack[i][j] = (i, j-1)
			else:
				backtrack[i][j] = (-1, -1)

			# update maxscore
			if score > maxscore:
				maxscore = score
				endposition = (i, j)

	tempi, tempj = endposition
	while backtrack[tempi][tempj] != (-1, -1):
		tempi, tempj = backtrack[tempi][tempj]

	# calculate time duration of both subtrajectories
	start1 = datetime.fromtimestamp(t1[tempi][2]).isoformat()
	end1 =datetime.fromtimestamp(t1[endposition[0]-1][2]).isoformat()
	start2 = datetime.fromtimestamp(t2[tempj][2]).isoformat()
	end2 =datetime.fromtimestamp(t2[endposition[1]-1][2]).isoformat()
	timeduration1 = t1[endposition[0]-1][2] - t1[tempi][2] # in seconds
	timeduration2 = t2[endposition[1]-1][2] - t2[tempj][2] # in seconds

	return (maxscore, start1, end1, timeduration1, start2, end2, timeduration2)

In [3]:
# return a list of (min_duration, score) tuples
def EDR_all(traj_dict, day, mt, ms, gap):
    trajectories = traj_dict[day]
    duration_score_set = []
    for i in range(len(trajectories)-1):
        for j in range(i+1, len(trajectories)-1):
            score, start1, end1, duration1, start2, end2, duration2 = EDR(trajectories[i], trajectories[j], mt, ms, gap)
            duration_score_set.append((min(duration1, duration2),score))

    return duration_score_set

In [4]:
# draw scatter plot of one person one day
def scatter(x,y,title,plotout):
    plt.scatter(x,y)
    plt.title(title)
    plt.xlabel('Duration')
    plt.ylabel('Score')
    plt.savefig(plotout)

In [26]:
def calculate_one_regression(data):
    data.sort(key=lambda pair: pair[0])
    cleaned_data = [d for d in data[:-(2* (len(data) // 10))] if d[1] != 0]
    unzipped_data = list(zip(*cleaned_data))
    x,y = np.asarray(unzipped_data)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    plt.plot(x, y, 'o', label='original data')
    plt.plot(x, intercept + slope*x, 'r', label='fitted line')
    plt.legend()
    plt.show()
    print("slope: %f    r_value: %f    p_value: %f" % (slope, r_value, p_value))
    return slope

In [6]:
def determine_slope_one_person(traj_dict):
    result = 0
    for i in range(1, 8):
        result+=calculate_one_regression(EDR_all(traj_dict, i, 5, -3, -2))
    return result/7

In [27]:
def determine_slope():
    directory = "../Data/DataByPerson"
    considered = 0
    slope = 0
    d_slope = float('inf')
    for f in os.listdir(directory):
        if d_slope >= 0.05:
            fname = os.path.join(directory, f)
            traj_dict = trajectory_by_day(fname)
            print(traj_dict[1])
            new_slope = determine_slope_one_person(traj_dict)
            new_average = (slope * considered + new_slope)/(considered+1)
            considered+=1
            d_slope = abs(new_average-slope)
            slope = new_average
            print(new_slope)
            print(f, new_average ,d_slope)
        else:
            break
    return slope      

In [28]:
determine_slope()

done
Read CSV file


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



ValueError: not enough values to unpack (expected 2, got 0)