# Libraries

In [3]:
import csv
import glob
import uuid
import json
from infostop import Infostop
from pymongo import MongoClient
import pandas as pd
import h3
#import googlemaps
from geopy.geocoders import Nominatim

import geopandas as gpd
import shapely
from multiprocessing import Pool
import time
import os
import random
import logging

In [33]:
# Reverse-geocode one sample point to inspect the structure of a Nominatim response.
geolocator = Nominatim(user_agent="myapp")
sample_lat, sample_lon = -11.984677099999999, -77.09265620000002
coordinates = f"{sample_lat}, {sample_lon}"

info = geolocator.reverse(coordinates)

info.raw

{'place_id': 117738090,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 121371938,
 'lat': '-11.984585359454531',
 'lon': '-77.0926334681423',
 'display_name': 'Los Pinos Del Norte, Urb. Virgen del Rosario ll Etapa, San Martín de Porres, Lima, 15109, Perú',
 'address': {'road': 'Los Pinos Del Norte',
  'neighbourhood': 'Urb. Virgen del Rosario ll Etapa',
  'suburb': 'San Martín de Porres',
  'city': 'San Martín de Porres',
  'region': 'Lima',
  'state': 'Lima',
  'postcode': '15109',
  'country': 'Perú',
  'country_code': 'pe'},
 'boundingbox': ['-11.9847309', '-11.9843834', '-77.0932621', '-77.0920461']}

In [30]:
# City component of the reverse-geocoded address; raises KeyError when the result has no 'city' entry.
# NOTE(review): the output recorded below ('Comas') differs from the 'city' value shown in the
# info.raw output above, and the execution counts (In [30] vs In [33]) are out of order —
# re-run both cells top to bottom so the recorded outputs agree.
info.raw['address']['city']

'Comas'

# Save to mongodb

- H3: https://towardsdatascience.com/uber-h3-for-data-analysis-with-python-1e54acdcc908
- OpenStreetMap Nominatim (reverse geocoding API): https://nominatim.org/release-docs/develop/api/Reverse/

In [50]:
# client = MongoClient("mongodb://mongo.as-dell.copernic.local:27016")
# db = client["covid19"]

# Logger

In [5]:
# File logger shared by every processing function in this notebook.
logger = logging.getLogger('mongodb_log')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object on every call, so re-running this cell
# would stack another FileHandler on it and write each message several times.
# Detach and close whatever a previous run attached before adding a fresh handler.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(process)d | %(message)s')
fh = logging.FileHandler('mongodb.log', mode='w')  # mode='w' truncates the log on each setup
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

# Save to JSON

In [1]:
def parse_traj(file, r1=50, r2=50, min_staying_time=200):
    """Load one trajectory CSV and label every point with Infostop.

    Parameters
    ----------
    file : str
        Path of a comma-separated file. Its first row is read as a header and
        the columns are then renamed, in order, to ``user_id``, ``date``,
        ``lon`` and ``lat``.
    r1, r2, min_staying_time
        Forwarded unchanged to ``Infostop``; the defaults are the values this
        notebook previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``date`` with two extra columns: ``ts`` (whole
        seconds since 1970-01-01) and ``label`` (the output of
        ``Infostop.fit_predict``).

    Raises
    ------
    ValueError
        Raised by pandas when the file does not have exactly four columns.
    """
    df = pd.read_csv(file, sep=',')
    df.columns = ["user_id", "date", "lon", "lat"]
    df["date"] = pd.to_datetime(df.date)
    # Sort into a new frame rather than in place; the original index labels are kept.
    df = df.sort_values(by="date")
    df["ts"] = (df["date"] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    # The model receives (lat, lon, ts) triples in chronological order.
    traj = df[["lat", "lon", "ts"]].values
    model = Infostop(r1=r1, r2=r2, min_staying_time=min_staying_time)
    df["label"] = model.fit_predict(traj)
    logger.info("Parse trajectorie OK | {}".format(file))
    return df

def create_record(df, use_geocoding=False):
    """Build the per-user document (a plain dict) from a labelled trajectory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``user_id``, ``date``, ``lat``, ``lon`` and
        ``label`` (the frame returned by ``parse_traj`` has them).
    use_geocoding : bool, default False
        When True, each point is reverse-geocoded with Nominatim and the
        ``road``, ``city``, ``region`` and ``shop`` address fields are copied
        into the point when present. This issues one network request per
        change of coordinates, so it can be slow.

    Returns
    -------
    dict
        ``{"_id": <first user_id>, "location": [<one dict per row>, ...]}``.
        Each location holds ``index``, ``date`` (as a string), ``lat``,
        ``lon``, ``label`` and the H3 cell at resolutions 10 down to 6.

    Raises
    ------
    IndexError
        If ``df`` has no rows (there is no user id to read).
    """
    # Only build the geocoder when it is actually going to be used.
    geolocator = Nominatim(user_agent="myapp") if use_geocoding else None

    record = {"_id": df["user_id"].iloc[0]}
    location = []
    prev_coordinates = None
    prev_info = None
    for idx, (_, row) in enumerate(df.iterrows()):
        lat, lon = row["lat"], row["lon"]
        loc = {
            "index": idx,
            "date": str(row["date"]),
            "lat": lat,
            "lon": lon,
            "label": row["label"],
        }
        for resolution in (10, 9, 8, 7, 6):
            loc["h3_{}".format(resolution)] = h3.geo_to_h3(lat, lon, resolution)

        if use_geocoding:
            coordinates = "{}, {}".format(lat, lon)
            # Consecutive identical points reuse the previous lookup.
            if coordinates != prev_coordinates:
                prev_info = geolocator.reverse(coordinates)
                prev_coordinates = coordinates

            # reverse() returns None when nothing is found; treat that as "no address".
            address = prev_info.raw.get("address", {}) if prev_info is not None else {}
            for field in ("road", "city", "region", "shop"):
                if field in address:
                    loc[field] = address[field]

        location.append(loc)
    record["location"] = location
    return record

def preProcessFile(file):
    """Clean a raw CSV in place.

    Rows are ordered by ``datetime``, rows containing any missing value are
    dropped, and only rows with ``|x| <= 180`` and ``|y| <= 90`` are kept.
    The result overwrites ``file`` (written without the index column).
    """
    frame = pd.read_csv(file)
    frame = frame.sort_values('datetime')
    frame = frame.dropna()
    within_bounds = frame.x.abs().le(180) & frame.y.abs().le(90)
    frame.loc[within_bounds].to_csv(file, index=False)

def processJson(file, saveFile=False, filesPath='Data/devices_list', jsonPath='Data/devices_json'):
    """Parse one device CSV and return its record, or write it to disk as JSON.

    Parameters
    ----------
    file : str
        Bare file name of the CSV inside ``filesPath``.
    saveFile : bool, default False
        When True the record is written to ``jsonPath`` under the same name
        with a ``.json`` extension, and the path of that file is returned.
        When False the record dict itself is returned.
    filesPath, jsonPath : str
        Input and output directories; the defaults are the locations this
        notebook has always used.

    Returns
    -------
    dict, str or None
        The record, the path of the written JSON file, or ``None`` when
        anything failed. Failures are logged and never raised, so one bad
        file does not abort a batch.
    """
    filepath = os.path.join(filesPath, file)
    try:
#         preProcessFile(filepath) #pre process to quit Nan records and validate coordinates
        df = parse_traj(filepath)
        record = create_record(df, use_geocoding=False)
        if not saveFile:
            return record

        # Create the output directory on first use instead of failing on open().
        os.makedirs(jsonPath, exist_ok=True)
        # Swap only the final extension; str.replace would also rewrite a
        # '.csv' occurring in the middle of the name.
        fileJson = os.path.splitext(file)[0] + '.json'
        fileJsonPath = os.path.join(jsonPath, fileJson)
        with open(fileJsonPath, 'w') as fp:
            json.dump(record, fp, indent=3)
        logger.info("File OK | {}".format(file))
        return fileJsonPath
    except Exception as e:
        # exc_info adds the traceback to the log so failures can be diagnosed later.
        logger.error("Error in file: | {} | {}".format(file, e), exc_info=True)
        return None

In [261]:
# Every CSV name available as input.
files = os.listdir("Data/devices_list")

# JSON outputs already written, mapped back to the CSV names they came from.
jsons = os.listdir("Data/devices_json")
jsons = [x.replace('.json', '.csv') for x in jsons]

# Inputs with no JSON yet. Sorted because a bare set difference has arbitrary
# order, which made the preview below and the processing order change between runs.
toProcess = sorted(set(files) - set(jsons))
print(len(toProcess))
toProcess[:10]

In [260]:
##Main execution using multiprocessing
if __name__ == '__main__':
    from functools import partial

    ts = time.time()

    files = toProcess #list of file to be processed

    logger.info("Total files: {}".format(len(files)))

    # saveFile=True so each worker writes its JSON file. With the default
    # (False) every record was sent back to this process and then discarded,
    # so nothing was saved and the same files stayed pending on the next run.
    # The with-block releases the worker processes once the map has finished.
    with Pool(os.cpu_count()) as pool:                  # Create a multiprocessing Pool
        pool.map(partial(processJson, saveFile=True), files)
    logger.info('Process finished in {} seconds'.format(time.time() - ts))

# Other tests

In [8]:
# Parse a single trajectory with the shared helper instead of repeating its body here
file = 'Data/devices_list/FILEe98001c8-56a0-4b71-9aa3-196132244373.csv'
df = parse_traj(file)

df.sort_values('date')

Unnamed: 0,user_id,date,lon,lat,ts,label
0,e98001c8-56a0-4b71-9aa3-196132244373,2020-04-04 18:02:24,-73.256790,-3.730481,1586023344,-1
1,e98001c8-56a0-4b71-9aa3-196132244373,2020-04-06 14:01:04,-73.256836,-3.730478,1586181664,1
2,e98001c8-56a0-4b71-9aa3-196132244373,2020-04-06 14:03:53,-73.256807,-3.730466,1586181833,1
3,e98001c8-56a0-4b71-9aa3-196132244373,2020-04-06 14:04:02,-73.256807,-3.730466,1586181842,1
4,e98001c8-56a0-4b71-9aa3-196132244373,2020-04-06 14:18:32,-73.256811,-3.730456,1586182712,1
...,...,...,...,...,...,...
947,e98001c8-56a0-4b71-9aa3-196132244373,2020-10-15 04:43:04,-73.256761,-3.730484,1602736984,-1
948,e98001c8-56a0-4b71-9aa3-196132244373,2020-10-15 11:47:52,-73.253811,-3.724790,1602762472,-1
949,e98001c8-56a0-4b71-9aa3-196132244373,2020-10-15 11:48:42,-73.246045,-3.723175,1602762522,-1
950,e98001c8-56a0-4b71-9aa3-196132244373,2020-10-15 11:51:56,-73.245912,-3.713858,1602762716,-1


In [31]:
%%time
rec = []
for file in files[:10]:
    # `files` holds bare names (from os.listdir), so anchor each one to the data directory
    df = parse_traj(os.path.join("Data/devices_list", file))
    # geocoding could be time consuming
    record = create_record(df, use_geocoding=False)
    #db.user_locations.insert_one(record)
    rec.append(record)

CPU times: user 9.28 s, sys: 403 ms, total: 9.69 s
Wall time: 11.7 s
