In [2]:
import numpy as np 
import pandas as pd 
import math 
from sklearn import preprocessing
import gmplot
import matplotlib.pyplot as plt
from time import time
import operator
from datetime import datetime

In [3]:
from geopy.geocoders import Nominatim
import overpass
import folium
from folium import plugins
from folium.plugins import HeatMap

In [4]:
def device_ids_encoder(d):
    """Replace the 'Device ID' column with compact integer labels.

    The ids are cast to ``str`` and encoded with scikit-learn's
    ``LabelEncoder``. The frame is modified in place and the same object
    is returned.
    """
    raw_ids = d['Device ID'].astype(str)
    d['Device ID'] = preprocessing.LabelEncoder().fit_transform(raw_ids)
    return d

def avg_location(d, user):
    """Return the mean position of one user's location fixes.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with 'Device ID', 'Latitude' and 'Longitude' columns.
    user
        Value compared against the 'Device ID' column.

    Returns
    -------
    list
        ``[mean_latitude, mean_longitude]`` as Python floats.

    Raises
    ------
    ValueError
        If ``d`` contains no row for ``user`` (there is nothing to average).
    """
    # One vectorised selection instead of a Python-level loop over every row.
    rows = d.loc[d['Device ID'] == user, ['Latitude', 'Longitude']]
    if rows.empty:
        raise ValueError(f"no locations found for user {user!r}")

    means = rows.astype(float).mean()
    return [float(means['Latitude']), float(means['Longitude'])]

def get_info_by_user(d):
    """Split the locations frame into one time-ordered frame per user.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with at least 'Device ID' and 'Time Stamp' columns.

    Returns
    -------
    dict
        Maps each 'Device ID' value (in order of first appearance) to a
        DataFrame holding that user's rows, sorted by 'Time Stamp' and
        re-indexed 0..n-1. ``d`` itself is left untouched.
    """
    users = {}
    # A single groupby pass replaces re-filtering the whole frame per user;
    # sort=False keeps the users in order of first appearance.
    for device_id, locations in d.groupby('Device ID', sort=False):
        # sort_values/reset_index return new frames, so the result has to be
        # stored; mergesort is stable, keeping the input order on equal stamps.
        users[device_id] = (
            locations
            .sort_values(by=['Time Stamp'], kind='mergesort')
            .reset_index(drop=True)
        )
    return users

def top_n_users(users, n):
    """Keep the ``n`` users that have the most locations.

    ``users`` maps a user id to a sized collection of locations. The result
    is a new dict ordered from most to fewest locations; users with the same
    count keep their relative order from ``users``.
    """
    ranked_ids = sorted(users, key=lambda user_id: len(users[user_id]), reverse=True)

    selected = {}
    for position, user_id in enumerate(ranked_ids):
        if position == n:
            break
        selected[user_id] = users[user_id]
    return selected

### Getting data from csv

In [5]:
# Load the raw dataset (first CSV column becomes the index) and report
# how long the read took.
start_time = time()
df_raw = pd.read_csv('datajulio.csv', sep=',', header=0, index_col=0)
total_time = time() - start_time
print(f"{total_time} segundos")

  mask |= (ar1 == a)


106.06460785865784 segundos


In [6]:
len(df_raw)

27226706

In [7]:
# `df` is an alias of `df_raw` (no copy), so the in-place operations below
# also modify `df_raw`.
df=df_raw
# NOTE(review): keep=False removes EVERY row that has a duplicate, including
# its first occurrence; use keep='first' if one copy should survive - confirm.
df.drop_duplicates(keep=False,inplace=True) # drop repeated rows (they add no information)
# Normalise the column name to match the capitalisation of the other columns.
df.rename(columns={'offset':'Offset'}, inplace=True)
len(df)

22178388

In [8]:
print("\nDataset original:")
df.head()


Dataset original:


Unnamed: 0,Time Stamp,Device ID,OS,Latitude,Longitude,Accuracy,Offset
947210,1562020763,709c8e936b8bb1345acd529a2e905e49f86813b2a9c8c4...,0,42.249754,-8.610742,1700,7200.0
947211,1561982150,10024d1a3e5aaa32cfd9b3079e5bad8bb6aa3f15c0cf23...,0,40.44595,-3.691597,37,7200.0
947212,1561985717,e2c3c8c9456fb44ad02d5a628186049b0f5d3b60c2845e...,0,42.702406,-8.664966,15,7200.0
947213,1561998458,43617b99f4b9814252e93e97f1d155dfbaf01af8987f65...,0,39.991468,-6.538007,15,7200.0
947214,1561972468,d89369386ef6f9f5187b2469a71ec3e997e89e40e3b534...,0,39.540992,2.742054,800,7200.0


In [9]:
# encoding ids
df=device_ids_encoder(df)
df.head()

Unnamed: 0,Time Stamp,Device ID,OS,Latitude,Longitude,Accuracy,Offset
947210,1562020763,66262,0,42.249754,-8.610742,1700,7200.0
947211,1561982150,9284,0,40.44595,-3.691597,37,7200.0
947212,1561985717,133732,0,42.702406,-8.664966,15,7200.0
947213,1561998458,39698,0,39.991468,-6.538007,15,7200.0
947214,1561972468,127746,0,39.540992,2.742054,800,7200.0


In [10]:
df.head(2)

Unnamed: 0,Time Stamp,Device ID,OS,Latitude,Longitude,Accuracy,Offset
947210,1562020763,66262,0,42.249754,-8.610742,1700,7200.0
947211,1561982150,9284,0,40.44595,-3.691597,37,7200.0


## Filtrado de fechas [2019-07-01 11:00:58, 2019-08-01 23:59:59]

In [None]:
# Show the newest and the oldest 'Time Stamp' as readable dates.
# datetime.fromtimestamp() without a tz argument converts to the local
# timezone of the machine running the notebook.
print(datetime.fromtimestamp(df['Time Stamp'].max()))
print(datetime.fromtimestamp(df['Time Stamp'].min()))
# Debug helper: list the 100 rows with the largest 'Time Stamp'.
#df.nlargest(100, ['Time Stamp'])

In [12]:
# Keep only the rows whose 'Time Stamp' lies between 2019-07-01 00:00:00 and
# 2019-08-01 00:00:00, both bounds included. The date strings are parsed as
# naive datetimes, so .timestamp() interprets them in the local timezone.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
timestamp_floor = datetime.strptime('2019-07-01 00:00:00', DATE_FORMAT).timestamp()
timestamp_roof = datetime.strptime('2019-08-01 00:00:00', DATE_FORMAT).timestamp()

in_range = df['Time Stamp'].between(timestamp_floor, timestamp_roof)
df = df[in_range]
len(df)

22074533

In [14]:
len(df['Device ID'].unique())

149270

## Filtrado de usuarios por región

In [15]:
geolocator = Nominatim(user_agent="TFG")

In [43]:
region = 'Islas Baleares'
# Ask Nominatim for the limits of the selected region; the filter below reads
# the box as [lat_min, lat_max, lon_min, lon_max], each given as a string.
bbox = geolocator.geocode(region, timeout=30).raw['boundingbox']
# Optional manual override of the bounding box (disabled):
#bbox[:2] = ['40.3119774', '40.6437293']
#bbox[2:] = ['-3.8889539', '-3.5179163']
lat_min, lat_max = float(bbox[0]), float(bbox[1])
lon_min, lon_max = float(bbox[2]), float(bbox[3])

inside_lat = df['Latitude'].between(lat_min, lat_max)
inside_lon = df['Longitude'].between(lon_min, lon_max)
df_region = df[inside_lat & inside_lon]
len(df_region)

1276393

Vemos cuántos usuarios tienen más de 900 puntos recogidos en el mes de julio

In [45]:
# Users of the region with strictly more than MIN_POINTS recorded locations.
MIN_POINTS = 900

# Count the rows of every user once, instead of re-filtering the whole
# region frame for each user id.
points_per_user = df_region['Device ID'].value_counts()

users = list(df_region['Device ID'].unique())
users_top = [userID for userID in users if points_per_user.get(userID, 0) > MIN_POINTS]
len(users_top)

87

In [46]:
df_region.to_csv('balearesjulio.csv', sep=',', index=False)

### Usuarios por region (Madrid, Barcelona...)

In [None]:
def users_per_region(users, region):
    """Keep the users whose mean location falls inside a region's bounding box.

    Parameters
    ----------
    users : dict
        Maps a user id to a DataFrame with 'Latitude' and 'Longitude' columns.
    region : str
        Place name geocoded with the module-level ``geolocator``.

    Returns
    -------
    dict
        New dict with the entries of ``users`` (same order, same frames)
        whose mean latitude/longitude lie inside the bounding box, bounds
        included. ``users`` is not modified. Users with no locations are
        excluded.

    Raises
    ------
    ValueError
        If the geocoder returns no result for ``region``.
    """
    # Fetch the limits of the selected region via Nominatim.
    place = geolocator.geocode(region, timeout=30)
    if place is None:
        raise ValueError(f"region {region!r} could not be geocoded")
    bbox = place.raw['boundingbox']
    lat_min, lat_max = float(bbox[0]), float(bbox[1])
    lon_min, lon_max = float(bbox[2]), float(bbox[3])

    users_in_region = {}
    for user_id, locations in users.items():
        # Vectorised mean instead of accumulating row by row.
        mean_lat = locations['Latitude'].astype(float).mean()
        mean_lon = locations['Longitude'].astype(float).mean()
        # An empty frame yields NaN means; every comparison with NaN is
        # False, so such users are simply left out.
        if lat_min <= mean_lat <= lat_max and lon_min <= mean_lon <= lon_max:
            users_in_region[user_id] = locations
    return users_in_region

In [None]:
# NOTE(review): users_per_region() iterates `users.items()`, but the only
# visible assignment of `users` in this notebook is a list of device ids -
# presumably this should receive the dict built by get_info_by_user(); confirm.
users_Madrid = users_per_region(users, 'Madrid')
len(users_Madrid)

### Mapeo de usuarios

In [None]:
def get_avgcoords_users(users):
    """Compute the mean location of every user.

    Parameters
    ----------
    users : dict
        Maps a user id to a DataFrame with 'Latitude' and 'Longitude' columns.

    Returns
    -------
    dict
        Maps each user id to ``[mean_latitude, mean_longitude]`` (floats),
        in the iteration order of ``users``.

    Raises
    ------
    ValueError
        If a user has no locations, since no mean can be computed.
    """
    coords_by_user = {}
    for user_id, locations in users.items():
        if len(locations) == 0:
            raise ValueError(f"user {user_id!r} has no locations to average")
        # Vectorised mean instead of accumulating row by row.
        coords_by_user[user_id] = [
            float(locations['Latitude'].astype(float).mean()),
            float(locations['Longitude'].astype(float).mean()),
        ]
    return coords_by_user

In [None]:
# NOTE(review): `users_top_Madrid` is not assigned in any visible cell (a later
# cell recorded a NameError for it) - presumably it should come from
# top_n_users(users_Madrid, n); confirm and add the missing cell.
coords_by_user = get_avgcoords_users(users_top_Madrid)

In [19]:
#styles = ["Stamen Terrain", "Stamen Toner", "Mapbox Bright"]
# Map centred on Madrid with one marker per user, placed at the user's mean
# location and labelled with the user id.
location = geolocator.geocode("Madrid")
map_center = (location.raw['lat'], location.raw['lon'])
m = folium.Map(location=map_center, zoom_start=8)
for user_id, mean_coords in coords_by_user.items():
    marker = folium.Marker(location=mean_coords, popup=f'ID = {user_id}')
    marker.add_to(m)

NameError: name 'coords_by_user' is not defined

In [None]:
m

In [45]:
def speed_color(speed):
    """Pick the colour used to draw a path segment from its speed.

    Returns 'red' for speeds in [0, 10), 'yellow' for [10, 60) and 'green'
    for anything else that is not negative. Raises ``ValueError`` for a
    negative speed.
    """
    if speed < 0:
        raise ValueError
    if speed < 10:
        return 'red'
    if speed < 60:
        return 'yellow'
    return 'green'

In [46]:
userID = 4283

#styles = ["Stamen Terrain", "Stamen Toner", "Mapbox Bright"]
points = users_top_Madrid[userID]
# Only this user's track is drawn, so build the paths and the map centre for
# this user alone instead of for every user in the dict.
ways = puntos_a_caminos(points)
center = get_avgcoords_users({userID: points})[userID]
m = folium.Map(location=[center[0], center[1]], zoom_start=12)

last_segment = len(points) - 2
for i in range(len(points) - 1):
    # Convert the unix time of the segment's first point into a readable date.
    dt_object = datetime.fromtimestamp(points.iloc[i, 0])
    # Two consecutive locations (columns 3 and 4 are read as the coordinates)
    # define the line segment to draw.
    p1 = [points.iloc[i, 3], points.iloc[i, 4]]
    p2 = [points.iloc[i + 1, 3], points.iloc[i + 1, 4]]
    speed = round(ways['Speed (km/h)'][i], 2)  # two decimals for the tooltip
    folium.PolyLine(locations=[p1, p2], color=speed_color(speed),
                    tooltip=str(speed) + ' km/h').add_to(m)

    if i == 0:
        folium.Marker(location=p1, popup='Punto de inicio: ' + str(dt_object),
                      icon=folium.Icon(color='green')).add_to(m)
    else:
        folium.Circle(radius=20, location=p1, popup=str(dt_object),
                      color='orange').add_to(m)

    # Checked independently of the branch above so that a track with a single
    # segment still gets its end marker.
    if i == last_segment:
        # The end marker sits on p2, so it must show the time of point i+1.
        end_time = datetime.fromtimestamp(points.iloc[i + 1, 0])
        folium.Marker(location=p2, popup='Punto final: ' + str(end_time),
                      icon=folium.Icon(color='red')).add_to(m)

NameError: name 'users_top_Madrid' is not defined

In [None]:
m

### Graficas usuarios Madrid

In [None]:
from math import radians, degrees, sin, cos, asin, acos, sqrt
def great_circle(lon1, lat1, lon2, lat2):
    """Great-circle distance between two points, via the spherical law of cosines.

    Parameters
    ----------
    lon1, lat1, lon2, lat2 : float
        Longitude and latitude of both points, in degrees.

    Returns
    -------
    float
        Central angle between the points multiplied by a sphere radius of
        6371 (kilometres, for the Earth's mean radius).
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    cos_angle = sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * cos(lon1 - lon2)
    # Rounding can push the cosine slightly outside [-1, 1] (identical or
    # antipodal points), which would make acos() raise a math domain error.
    # Explicit comparisons are used so that a NaN input still yields NaN.
    if cos_angle > 1:
        cos_angle = 1.0
    elif cos_angle < -1:
        cos_angle = -1.0
    return 6371 * acos(cos_angle)

In [None]:
def puntos_a_caminos(d):
    """Turn a user's consecutive locations into path segments.

    Parameters
    ----------
    d : pandas.DataFrame
        Locations in visiting order. Columns are read by position:
        0 = time stamp (seconds), 3 = latitude, 4 = longitude,
        5 = accuracy, 6 = offset. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        One row per pair of consecutive locations with the columns
        'Distance', 'Time (s)', 'Speed (km/h)', 'Accuracy' and 'Offset'
        (accuracy and offset are the mean of both end points). The frame is
        empty when ``d`` has fewer than two rows.
    """
    columns = ['Distance', 'Time (s)', 'Speed (km/h)', 'Accuracy', 'Offset']
    if len(d) < 2:
        return pd.DataFrame([], columns=columns)

    # Pull the columns out once and walk them by position, so the result does
    # not depend on the frame's index labels being 0..n-1.
    timestamps = d.iloc[:, 0].to_numpy()
    lats = d.iloc[:, 3].to_numpy()
    lons = d.iloc[:, 4].to_numpy()
    accuracies = d.iloc[:, 5].to_numpy()
    offsets = d.iloc[:, 6].to_numpy()

    caminos = []
    # A segment with a non-positive time reuses the previous speed; start at
    # 0.0 so that the very first segment has a value to fall back on.
    speed = 0.0
    for pos in range(1, len(d)):
        dist = great_circle(lons[pos], lats[pos], lons[pos - 1], lats[pos - 1])
        elapsed = timestamps[pos] - timestamps[pos - 1]
        if elapsed > 0:
            speed = dist / elapsed * 3600  # per second -> per hour
        accur = (accuracies[pos] + accuracies[pos - 1]) / 2
        offset = (offsets[pos] + offsets[pos - 1]) / 2
        caminos.append([dist, elapsed, speed, accur, offset])

    return pd.DataFrame(caminos, columns=columns)

In [None]:
def caminos_by_user(users):
    """Build the path-segment frame of every user.

    Returns a dict with the same keys as ``users`` whose values are the
    result of ``puntos_a_caminos`` applied to each user's locations frame.
    """
    return {user_id: puntos_a_caminos(locations) for user_id, locations in users.items()}

In [None]:
users_top_Madrid.keys()

In [None]:
caminos_by_user(users_top_Madrid)[56957]