In [None]:
import pickle 
import numpy as np
import os
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
from os.path import join as pjoin
import geopandas as gpd
from cartoframes.viz import *
from shapely.geometry import MultiPoint, Point
import datetime
import osmnx as ox
from core.urban_osm import UrbanOSM
from gensim.models import Word2Vec

In [None]:
from utils import *

In [None]:
# --- Run configuration -------------------------------------------------------
# POSTFIX is appended to the filename of every artefact this notebook saves.
POSTFIX = 'ua'
# Root folder handed to load_dataset, and the dataset to process.
DATASET_ROOT = '../dataset/'
DATASET = 'metr-la'
# Everything produced for DATASET is written below RESULT_ROOT.
RESULT_ROOT = pjoin('../results/', DATASET)
OSM_FILE_PATH = pjoin(pjoin(RESULT_ROOT, 'osm_graph'), 'drive.graphml')

# Fail early on a dataset name this notebook does not list.
assert DATASET in ('metr-la', 'pems-bay', 'pemsd7')

In [None]:
# Create the output folders. 'osm_graph' lives under RESULT_ROOT, so creating
# the nested path also creates RESULT_ROOT. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(pjoin(RESULT_ROOT, 'osm_graph'), exist_ok=True)

In [None]:
# Load the traffic dataset: readings frame, sensor ids, sensor table,
# id -> matrix-index map and the dataset's own adjacency matrix.
(data_df, sensor_ids, sensor_df,
 sensor_id_to_ind, adj_mx) = load_dataset(DATASET_ROOT, DATASET)

num_sensors = len(sensor_ids)
# Reverse lookup: matrix index -> sensor id.
ind_to_sensor_id = {idx: sid for sid, idx in sensor_id_to_ind.items()}

print('Dataset Loaded')

In [None]:
# Persist the dataset's original adjacency matrix next to the generated ones.
# The printed number is the count of non-zero entries in adj_mx.
org_fname = pjoin(RESULT_ROOT, 'original_adj_mx.pkl')
print('Save original Adjacency Matrix: ', np.count_nonzero(adj_mx))

# An existing file is left untouched (never overwritten).
already_saved = os.path.isfile(org_fname)
if not already_saved:
    with open(org_fname, 'wb') as f:
        pickle.dump([sensor_ids, sensor_id_to_ind, adj_mx], f, protocol=2)

## Setup

### Initialize OSM with dataset
This might take a few minutes.

In [None]:
# Build the UrbanOSM helper from the sensor table and the GraphML path under
# RESULT_ROOT/osm_graph.
# NOTE(review): presumably downloads or loads the OSM drive network here — confirm in core.urban_osm.
urbanosm = UrbanOSM(sensor_df, OSM_FILE_PATH)

In [None]:
# NOTE(review): presumably associates each sensor with the OSM network — confirm in core.urban_osm.
urbanosm.match_sensors()

In [None]:
# NOTE(review): presumably prepares the graph used for path generation — confirm in core.urban_osm.
urbanosm.setup_graph()

In [None]:
# Show the example view with default arguments (no basemap or grid options).
urbanosm.navigate_example()

In [None]:
# NOTE(review): presumably builds the spatial grid shown by navigate_example(show_grid=True) — confirm.
urbanosm.setup_grid()

In [None]:
# Mapbox basemap settings for the example map.
# The access token is taken from the MAPBOX_ACCESS_TOKEN environment variable
# when it is set, so a private token never has to be written into the notebook.
# The literal is kept only as a backward-compatible fallback.
# NOTE(review): a token committed to a notebook must be treated as public —
# rotate it and drop the fallback once every user sets the environment variable.
basemap = {
    'style': 'mapbox://styles/mapbox/streets-v9',
    'token': os.environ.get(
        'MAPBOX_ACCESS_TOKEN',
        'pk.eyJ1IjoiaHNtNjkxMSIsImEiOiJjazl0and6aDUwOWF2M2RvemdrYjllczV3In0.qGmaAF6v-1LAF9C-dnMLBg',
    ),
}
# Same example view as before, now drawn on the basemap with the grid overlay.
urbanosm.navigate_example(basemap=basemap, show_grid=True)

In [None]:
# Generate paths; the argument is a text file under RESULT_ROOT whose name carries POSTFIX.
# NOTE(review): presumably the paths are written to (or cached in) that file — confirm in core.urban_osm.
urbanosm.generate_paths(pjoin(RESULT_ROOT, f'generated_paths_{POSTFIX}.txt'))

In [None]:
# Prepare the text representation used by the similarity cells below:
# later cells read urbanosm.path_sentences as whitespace-separated tokens in
# which sensor tokens start with 'S'.
# NOTE(review): presumably setup_path_sentences() is what fills path_sentences — confirm.
urbanosm.setup_sensor_words()
urbanosm.setup_path_sentences()

### Save as geojson

In [None]:
# Export GeoJSON output, passing the result folder itself as the target.
urbanosm.save_geojson(RESULT_ROOT)

## Adjacency Matrices

### Node2Vec Similarity

In [None]:
# Sensor-to-sensor similarity from Word2Vec embeddings of the path sentences.
# The result is cached in n2v_fname; delete that file to force a recomputation.
n2v_fname = pjoin(RESULT_ROOT, f'n2v_sim_{POSTFIX}.pkl')
if os.path.isfile(n2v_fname):
    # NOTE(review): pickle.load can execute arbitrary code — only load cache
    # files that this notebook wrote itself.
    with open(n2v_fname, 'rb') as f:
        _, _, sim_array = pickle.load(f)
    print('Loaded from '+n2v_fname)
else:
    vector_size = 64
    sentences = [sent.split() for sent in urbanosm.path_sentences]
    # NOTE(review): no seed is passed, so embeddings presumably differ between
    # runs; the cache above pins one result.
    model = Word2Vec(sentences, window=7, min_count=1, workers=4, vector_size=vector_size)

    # One embedding row per column of data_df; sensors whose 'S<id>' token never
    # occurs in a sentence get an all-zero placeholder vector.
    # NOTE(review): assumes data_df.columns follows the same order as the
    # matrix indices used elsewhere — confirm against load_dataset.
    wv_array = []
    for sid in data_df.columns:
        q = f'S{sid}'
        if q in model.wv:
            wv_array.append(model.wv[q])
        else:
            wv_array.append(np.zeros(vector_size))

    wv_array = np.array(wv_array)

    def cosine_similarity(vector1, vector2):
        """Return the cosine similarity of two vectors.

        Returns -1 when either vector has zero norm (the placeholder used for
        sensors without an embedding), which also avoids dividing by zero.
        Non-zero orthogonal vectors yield 0, the mathematically correct value.
        """
        magnitude1 = np.linalg.norm(vector1)
        magnitude2 = np.linalg.norm(vector2)
        if magnitude1 == 0 or magnitude2 == 0:
            return -1
        return np.dot(vector1, vector2) / (magnitude1 * magnitude2)

    # Symmetric matrix with ones on the diagonal; only the upper triangle is
    # computed and then mirrored.
    sim_array = np.eye(num_sensors)
    for i in range(wv_array.shape[0]):
        for j in range(i+1, wv_array.shape[0]):
            sim_array[j, i] = sim_array[i, j] = cosine_similarity(wv_array[i], wv_array[j])

    with open(n2v_fname, 'wb') as f:
        pickle.dump([sensor_ids, sensor_id_to_ind, sim_array], f, protocol=2)

### Co-occurrence Matrix

In [None]:
# Normalised sensor co-occurrence over the generated path sentences.
# The result is cached in cooccur_fname; delete that file to force a recomputation.
cooccur_fname = pjoin(RESULT_ROOT, f'cooccur_sim_{POSTFIX}.pkl')
if os.path.isfile(cooccur_fname):
    # NOTE(review): pickle.load can execute arbitrary code — only load cache
    # files that this notebook wrote itself.
    with open(cooccur_fname, 'rb') as file:
        _, _, cooccur_matrix = pickle.load(file)
    print('Loaded from '+cooccur_fname)
else:
    sentences = urbanosm.path_sentences

    # Raw counts, labelled by 'S<sensor id>' on both axes.
    # Diagonal: number of occurrences of a sensor token.
    # Off-diagonal: number of times two different sensors share a sentence.
    co_occurrence_vectors = pd.DataFrame(
        np.zeros([len(sensor_ids), len(sensor_ids)]),
        index = ['S'+s for s in sensor_ids],
        columns = ['S'+s for s in sensor_ids]
    )

    for sent in tqdm.tqdm(sentences):
        # Keep only sensor tokens (those starting with 'S').
        ext_sent = [w for w in sent.split() if w[0] == 'S']
        for i, w in enumerate(ext_sent):
            co_occurrence_vectors.at[w, w] += 1
            # The pair loop must run for every token. When it sat outside the
            # enumerate loop it only saw the empty slice after the last token,
            # so no off-diagonal count was ever recorded.
            for w2 in ext_sent[i+1:]:
                if w != w2:
                    co_occurrence_vectors.at[w, w2] += 1
                    co_occurrence_vectors.at[w2, w] += 1

    # Normalise each pair count by the geometric mean of the two occurrence
    # counts. The +1 keeps the denominator non-zero for sensors that never occur.
    # The np.eye diagonal is overwritten as well, because j starts at i.
    cooccur_matrix = np.eye(num_sensors)
    for i in range(num_sensors):
        w = 'S'+ind_to_sensor_id[i]
        wc1 = co_occurrence_vectors.at[w, w]
        for j in range(i, num_sensors):
            w2 = 'S'+ind_to_sensor_id[j]
            wc2 = co_occurrence_vectors.at[w2, w2]
            cooccur_matrix[j, i] = cooccur_matrix[i, j] = co_occurrence_vectors.at[w2, w]/((wc1*wc2)**.5+1)

    # Write to the same path the cache check above reads from.
    with open(cooccur_fname, 'wb') as f:
        pickle.dump([sensor_ids, sensor_id_to_ind, cooccur_matrix], f, protocol=2)

### Reachable Distance Matrix

In [None]:
# Along-path distance between sensors that appear in the same path sentence.
# The result is cached in dist_fname; delete that file to force a recomputation.
dist_fname = pjoin(RESULT_ROOT, f'dist_meters_{POSTFIX}.pkl')
if not os.path.isfile(dist_fname):
    # sid_dist_dict[a][b]: distance recorded from sensor token a to a sensor
    # token b that follows it in some sentence (first recorded value wins).
    sid_dist_dict = {}

    for path_sentence in tqdm.tqdm(urbanosm.path_sentences):
        co_sensors = [tok for tok in path_sentence.split() if tok[0] == 'S']

        # Pass 1: distance for each pair of neighbouring sensors in the sentence.
        for src, dst in zip(co_sensors, co_sensors[1:]):
            if dst in sid_dist_dict.get(src, {}):
                continue
            track_paths = urbanosm.track_path(path_sentence, src, dst)
            hop_dist = urbanosm.sid_dist(src, dst, track_paths)
            sid_dist_dict.setdefault(src, {})[dst] = hop_dist

        # Pass 2: accumulate hop distances to reach every later sensor.
        for start, origin in enumerate(co_sensors[:-1]):
            running = 0
            prev = origin
            for reached in co_sensors[start+1:]:
                running += sid_dist_dict[prev][reached]
                prev = reached
                if reached not in sid_dist_dict[origin]:
                    sid_dist_dict[origin][reached] = running

    # Unreached pairs stay at +inf; a sensor is at distance 0 from itself.
    dist_mat = np.full((len(sensor_ids), len(sensor_ids)), np.inf)
    np.fill_diagonal(dist_mat, 0)
    for src, reachable in sid_dist_dict.items():
        for dst, dist_val in reachable.items():
            # Keys carry the 'S' prefix; strip it to look up the matrix index.
            # Stored as [destination, source].
            # NOTE(review): confirm this orientation is the intended one.
            dist_mat[sensor_id_to_ind[dst[1:]], sensor_id_to_ind[src[1:]]] = dist_val

    with open(dist_fname, 'wb') as f:
        pickle.dump([sensor_ids, sensor_id_to_ind, dist_mat], f, protocol=2)
else:
    # NOTE(review): pickle.load can execute arbitrary code — only load cache
    # files that this notebook wrote itself.
    with open(dist_fname, 'rb') as file:
        _, _, dist_mat = pickle.load(file)
    print('Loaded from '+dist_fname)

### Finalize

In [None]:
# Treat pairs farther apart than 80 miles as unreachable. This edits dist_mat
# in place, so the summary cells below see the pruned matrix.
too_far = dist_mat > MILE_TO_METER*80
dist_mat[too_far] = np.inf
dist_vals_meters = dist_mat[~np.isinf(dist_mat)].flatten()

# Gaussian kernel with a 5-mile sigma: unreachable (inf) pairs map to 0.
dist_sigma = 5*MILE_TO_METER
scaled_dist = dist_mat / dist_sigma
dist_normed = np.exp(-np.square(scaled_dist))
# Final adjacency: element-wise product of distance kernel and co-occurrence.
final_adj_mx = dist_normed*cooccur_matrix

normed_fname = pjoin(RESULT_ROOT, f'new_dist_sim_{POSTFIX}.pkl')
final_fname = pjoin(RESULT_ROOT, f'urban_activity_sim_{POSTFIX}.pkl')

# Existing files are never overwritten; remove them to save fresh results.
for out_fname, out_matrix in ((normed_fname, dist_normed), (final_fname, final_adj_mx)):
    if os.path.isfile(out_fname):
        continue
    with open(out_fname, 'wb') as f:
        pickle.dump([sensor_ids, sensor_id_to_ind, out_matrix], f, protocol=2)

### Summary

In [None]:
# Collect every matrix with a display title; the plotting cell reuses both lists.
matrices = [
    adj_mx, sim_array, cooccur_matrix,
    dist_mat, dist_normed, final_adj_mx,
]
titles = [
    'Original', 'Node2Vec', 'Co-occurrence',
    'Distance', 'Normalized', 'Final',
]
# Print the number of non-zero entries per matrix (inf entries count as non-zero).
# NOTE(review): the 'distance_graph_loaded' label looks like a leftover — confirm.
for title, matrix in zip(titles, matrices):
    print(f'{title}\t distance_graph_loaded', np.count_nonzero(matrix))

In [None]:
# Draw the six matrices as images on a 2x3 grid, one titled panel each.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for panel, (matrix, title) in zip(axes.flat, zip(matrices, titles)):
    panel.matshow(matrix)
    panel.set_title(title)

plt.tight_layout()
plt.show()