In [1]:
from bjPOI import read_csv
from datetime import datetime
from datetime import date as dt
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import math
import os

MATCHING_DISTANCE = 0.1
MATCHING_TIME_FRAME = 3

def matchTime(t1, t2):
	return abs(datetime.fromtimestamp(t1).hour - datetime.fromtimestamp(t2).hour)<= MATCHING_TIME_FRAME # time difference no greater than 3 hours
	
def match(gps1, gps2):
    lat1, lon1, t1 = gps1
    lat2, lon2, t2 = gps2
    coord1 = (lat1,lon1)
    coord2 = (lat2,lon2)

    r = 6371
    dlat = (lat2-lat1) * (math.pi/180)
    dlon = (lon2-lon1) * (math.pi/180)

    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(lat1 * (math.pi/180)) * math.cos(lat2 * (math.pi/180)) * math.sin(dlon/2) * math.sin(dlon/2)

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = r * c

    return d <= MATCHING_DISTANCE and matchTime(t1, t2)

def trajectory_by_day(fname):
	trajectories = {}
	for i in range(1,8):
		trajectories[i] = []
	trajectory = read_csv(fname)
	print("Read CSV file")
	start = False
	last_date = dt.min
	last_day = 0
	for data_tup in trajectory:
		day = datetime.fromtimestamp(data_tup[2]).isoweekday()
		if day == last_day:
			date = dt.fromtimestamp(data_tup[2])
			if date == last_date:
				trajectories[day][-1].append(data_tup)
			else:
				one_trajectory = [data_tup]
				trajectories[day].append(one_trajectory)
				last_date = date
		else:
			date = dt.fromtimestamp(data_tup[2])
			one_trajectory = [data_tup]
			trajectories[day].append(one_trajectory)
			last_date = date
			last_day = day

	return trajectories

In [2]:
# match +5, mismatch -3, gap -2
def EDR(t1, t2, mt, ms, gap):
	m, n = len(t1), len(t2)
	matcher = [[0 for x in range(n+1)] for y in range(m+1)]
	backtrack = [[(-1,-1) for x in range(n+1)] for y in range(m+1)]
	maxscore = 0
	endposition = (0, 0)
	for i in range(1, m+1):
		for j in range(1, n+1):
			gapAbove = matcher[i-1][j]+gap
			gapLeft = matcher[i][j-1]+gap
			matching = matcher[i-1][j-1] + (mt if match(t1[i-1],t2[j-1]) else ms)
			score = max(0, max(gapAbove, gapLeft, matching)) # if negative, convert to 0
			matcher[i][j] = score

			# set backtrack table
			if score == matching:
				backtrack[i][j] = (i-1, j-1)
			elif score == gapAbove:
				backtrack[i][j] = (i-1, j)
			elif score == gapLeft:
				backtrack[i][j] = (i, j-1)
			else:
				backtrack[i][j] = (-1, -1)

			# update maxscore
			if score > maxscore:
				maxscore = score
				endposition = (i, j)

	tempi, tempj = endposition
	while backtrack[tempi][tempj] != (-1, -1):
		tempi, tempj = backtrack[tempi][tempj]

	# calculate time duration of both subtrajectories
# 	start1 = datetime.fromtimestamp(t1[tempi][2]).isoformat()
# 	end1 =datetime.fromtimestamp(t1[endposition[0]-1][2]).isoformat()
# 	start2 = datetime.fromtimestamp(t2[tempj][2]).isoformat()
# 	end2 =datetime.fromtimestamp(t2[endposition[1]-1][2]).isoformat()
# 	timeduration1 = t1[endposition[0]-1][2] - t1[tempi][2] # in seconds
# 	timeduration2 = t2[endposition[1]-1][2] - t2[tempj][2] # in seconds
	points1 = endposition[0]-tempi # number of points
	points2 = endposition[1]-tempj # number of points
	expectedScore = min(points1, points2) * mt

	return (maxscore, expectedScore)

In [3]:
# return a list of (min_duration, score) tuples
def EDR_all(traj_dict, day, mt, ms, gap):
    trajectories = traj_dict[day]
    expectedScore_score_set = []
    for i in range(len(trajectories)-1):
        for j in range(i+1, len(trajectories)-1):
            score, expectedScore = EDR(trajectories[i], trajectories[j], mt, ms, gap)
            expectedScore_score_set.append((expectedScore,score))

    return expectedScore_score_set

In [4]:
def exp_score_vs_score(data):
    new_data = [(d[0],d[1]/d[0]) if d[0]!=0 else (d[0], 0) for d in data]
    unzipped_data = list(zip(*new_data))
    x,y = np.asarray(unzipped_data)
#     slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
#     plt.plot(x, y, 'o', label='original data')
#     plt.plot(x, intercept + slope*x, 'r', label='fitted line')
    hist= plt.hist(y,bins=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    plt.legend()
    plt.show()
#     print("hist: %f    bin_edges: %f" % (hist, bin_edge))
    return hist

In [5]:
# traj_dict0 = trajectory_by_day("../Data/DataByPerson/000.csv")
# traj_dict1 = trajectory_by_day("../Data/DataByPerson/001.csv")
# traj_dict2 = trajectory_by_day("../Data/DataByPerson/002.csv")
traj_dict3 = trajectory_by_day("../Data/DataByPerson/003.csv")
# traj_dict4 = trajectory_by_day("../Data/DataByPerson/004.csv")
# traj_dict5 = trajectory_by_day("../Data/DataByPerson/005.csv")

Read CSV file


In [None]:
monday0 = EDR_all(traj_dict0, 1, 5, -3, -2)
tuesday0 = EDR_all(traj_dict0, 2, 5, -3, -2)
wednesday0 = EDR_all(traj_dict0, 3, 5, -3, -2)
thursday0 = EDR_all(traj_dict0, 4, 5, -3, -2)
friday0 = EDR_all(traj_dict0, 5, 5, -3, -2)
saturday0 = EDR_all(traj_dict0, 6, 5, -3, -2)
sunday0 = EDR_all(traj_dict0, 7, 5, -3, -2)

In [None]:
monday1 = EDR_all(traj_dict1, 1, 5, -3, -2)
tuesday1 = EDR_all(traj_dict1, 2, 5, -3, -2)
wednesday1 = EDR_all(traj_dict1, 3, 5, -3, -2)
thursday1 = EDR_all(traj_dict1, 4, 5, -3, -2)
friday1 = EDR_all(traj_dict1, 5, 5, -3, -2)
saturday1 = EDR_all(traj_dict1, 6, 5, -3, -2)
sunday1 = EDR_all(traj_dict1, 7, 5, -3, -2)

In [None]:
monday2 = EDR_all(traj_dict2, 1, 5, -3, -2)
tuesday2 = EDR_all(traj_dict2, 2, 5, -3, -2)
wednesday2 = EDR_all(traj_dict2, 3, 5, -3, -2)
thursday2 = EDR_all(traj_dict2, 4, 5, -3, -2)
friday2 = EDR_all(traj_dict2, 5, 5, -3, -2)
saturday2 = EDR_all(traj_dict2, 6, 5, -3, -2)
sunday2 = EDR_all(traj_dict2, 7, 5, -3, -2)

In [6]:
monday3 = EDR_all(traj_dict3, 1, 5, -3, -2)
tuesday3 = EDR_all(traj_dict3, 2, 5, -3, -2)
wednesday3 = EDR_all(traj_dict3, 3, 5, -3, -2)
thursday3 = EDR_all(traj_dict3, 4, 5, -3, -2)
friday3 = EDR_all(traj_dict3, 5, 5, -3, -2)
saturday3 = EDR_all(traj_dict3, 6, 5, -3, -2)
sunday3 = EDR_all(traj_dict3, 7, 5, -3, -2)

MemoryError: 

In [None]:
monday3 = EDR_all(traj_dict3, 1, 5, -3, -2)
exp_score_vs_score(monday3)

In [None]:
tuesday3 = EDR_all(traj_dict3, 2, 5, -3, -2)
exp_score_vs_score(tuesday3)

In [None]:
wednesday3 = EDR_all(traj_dict3, 3, 5, -3, -2)
exp_score_vs_score(wednesday3)

In [None]:
thursday3 = EDR_all(traj_dict3, 4, 5, -3, -2)
exp_score_vs_score(thursday3)

In [None]:
friday3 = EDR_all(traj_dict3, 5, 5, -3, -2)
exp_score_vs_score(friday3)

In [None]:
saturday3 = EDR_all(traj_dict3, 6, 5, -3, -2)
exp_score_vs_score(saturday3)

In [None]:
sunday3 = EDR_all(traj_dict3, 7, 5, -3, -2)
exp_score_vs_score(sunday3)

In [None]:
monday4 = EDR_all(traj_dict4, 1, 5, -3, -2)
tuesday4 = EDR_all(traj_dict4, 2, 5, -3, -2)
wednesday4 = EDR_all(traj_dict4, 3, 5, -3, -2)
thursday4 = EDR_all(traj_dict4, 4, 5, -3, -2)
friday4 = EDR_all(traj_dict4, 5, 5, -3, -2)
saturday4 = EDR_all(traj_dict4, 6, 5, -3, -2)
sunday4 = EDR_all(traj_dict4, 7, 5, -3, -2)

In [None]:
monday5 = EDR_all(traj_dict5, 1, 5, -3, -2)
tuesday5 = EDR_all(traj_dict5, 2, 5, -3, -2)
wednesday5 = EDR_all(traj_dict5, 3, 5, -3, -2)
thursday5 = EDR_all(traj_dict5, 4, 5, -3, -2)
friday5 = EDR_all(traj_dict5, 5, 5, -3, -2)
saturday5 = EDR_all(traj_dict5, 6, 5, -3, -2)
sunday5 = EDR_all(traj_dict5, 7, 5, -3, -2)

In [None]:
# draw scatter plot of one person one day
def scatter(x,y,title,plotout):
    plt.scatter(x,y)
    plt.title(title)
    plt.xlabel('Duration')
    plt.ylabel('Score')
    plt.savefig(plotout)

In [None]:
def slope_intercept(x1, y1, x2, y2):
    a = (y2 - y1) / (x2 - x1)
    b = y1 - a * x1     
    return a,b

In [None]:
def one_slope_intercept(data):
#     first = (float('inf'),float('inf'))
#     last = (1,0)
#     for duration, score in data:
#         if duration < first[0]:
#             first = duration, score
#         if score/duration > last[1]/last[0]:
#             last = duration, score
#     print(first,last)
#     slope, intercept= slope_intercept(first[0], first[1], last[0], last[1])
    new_data = [(d[0],d[1]/d[0]) for d in data if d[0] > 0 and d[1] > 0]
    unzipped_data = list(zip(*new_data))
    x,y = np.asarray(unzipped_data)
    plt.plot(x, y, 'o', label='original data')
#     plt.plot(x, intercept + slope*x, 'r', label='simulated line')
    plt.legend()
    plt.show()
#     print("slope: %f    intersect: %f" %(slope, intercept))
#     return slope, intercept

In [None]:
one_slope_intercept(monday0)

In [None]:
one_slope_intercept(tuesday0)

In [None]:
one_slope_intercept(wednesday0)

In [None]:
one_slope_intercept(thursday0)

In [None]:
one_slope_intercept(friday0)

In [None]:
one_slope_intercept(saturday0)

In [None]:
one_slope_intercept(sunday0)

In [None]:
def calculate_one_slope(data):
#     data.sort(key=lambda pair: pair[0])
    cleaned_data = [d for d in data[:-(5* (len(data) // 10))] if d[0] != 0 and d[1] != 0]
    unzipped_data = list(zip(*cleaned_data))
    x,y = np.asarray(unzipped_data)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    plt.plot(x, y, 'o', label='original data')
    plt.plot(x, intercept + slope*x, 'r', label='fitted line')
    plt.legend()
    plt.show()
    print("slope: %f    intercept: %f    r_value: %f" % (slope, intercept, r_value))
    return slope,intercept

In [None]:
calculate_one_slope(monday0)

In [None]:
def determine_slope_one_person(traj_dict):
    result = 0
    for i in range(1, 8):
        result+=calculate_one_slope(EDR_all(traj_dict, i, 5, -3, -2))
    return result/7

In [None]:
def determine_slope():
    directory = "../Data/DataByPerson"
    considered = 0
    slope = 0
    d_slope = float('inf')
    for f in os.listdir(directory):
        if d_slope >= 0.05:
            fname = directory + "/" + f
            traj_dict = trajectory_by_day(fname)
            new_slope = determine_slope_one_person(traj_dict)
            new_average = (slope * considered + new_slope)/(considered+1)
            considered+=1
            d_slope = abs(new_average-slope)
            slope = new_average
            print(new_slope)
            print(f, new_average ,d_slope)
        else:
            break
    return slope      

In [None]:
determine_slope()