<a href="https://colab.research.google.com/github/ioadeer/bi-bycing-ba-2021/blob/main/bicing_2020_2021_clima_rutas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movilidad Sustentable
# Ciencia de Datos e Inteligencia de Negocios


## Configuración

In [None]:
"""
Bicicletas: Recorridos
Clima: PRCP TAVG TEMP[C] HUM[%] PNM[hPa] DD[gr] FF[km/hr]                                                 
2020-2021
Data acquisition & wrangling
"""
import os
import sys
import urllib.request

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import datetime
# from datetime import date, timedelta
import re


datadir = 'datasets/'

In [None]:
from google.colab import drive
drive.mount(
  '/content/drive/', 
  # force_remount = True
)
if drive:
  datadir = '/content/drive/MyDrive/Academicos/UTN/BI/datasets/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Adquisición


In [None]:
insumos = {
  "recorridos-2020" : {
    "remote" : 'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte/bicicletas-publicas/recorridos-realizados-2020.zip',
    "local" : datadir + 'recorridos-realizados-2020.zip',
    "df" : pd.DataFrame(),
  },
  "recorridos-2021" : {
    "remote" : 'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte/bicicletas-publicas/recorridos-realizados-2021.zip',   
    "local" : datadir + 'recorridos-realizados-2021.zip',
    "df" : pd.DataFrame(),
  },
  "clima" : {
    "remote" : '',   
    "local" : datadir + 'weather-data-ba.csv',
    "df" : pd.DataFrame(),
  },
  "estaciones" : {
    "remote" : 'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte/estaciones-bicicletas-publicas/nuevas-estaciones-bicicletas-publicas.xlsx',   
    "local" : datadir + 'nuevas_estaciones.xlsx',
    "df" : pd.DataFrame(),
  },
  # "rutas" : {
  #   "remote" : '',   
  #   "local" : datadir + 'rutas.csv',
  #   "df" : pd.DataFrame(),
  # },
}

In [None]:
"""
Cargar insumos
"""
def cargar_insumos( insumos ):
    for i in insumos:
        if not os.path.exists( insumos[i]["local"] ):
            print("Descargando:", insumos[i]["remote"])
            urllib.request.urlretrieve(
              insumos[i]["remote"],
              insumos[i]["local"]
            )
        if "zip" in insumos[i]["local"]:
            insumos[i]["df"] = pd.read_csv(
                insumos[i]["local"],
                compression = 'zip',
                encoding = 'utf-8', 
            )
        if "csv" in insumos[i]["local"]:
            insumos[i]["df"] = pd.read_csv(
                insumos[i]["local"],
            )
        if "xlsx" in insumos[i]["local"]:
            insumos[i]["df"] = pd.read_excel(
                insumos[i]["local"],
            )
        print( 'Cargado: ', insumos[i]["local"] )
    return insumos

In [None]:
insumos = cargar_insumos( insumos )

Cargado:  /content/drive/MyDrive/Academicos/UTN/BI/datasets/recorridos-realizados-2020.zip
Cargado:  /content/drive/MyDrive/Academicos/UTN/BI/datasets/recorridos-realizados-2021.zip
Cargado:  /content/drive/MyDrive/Academicos/UTN/BI/datasets/weather-data-ba.csv
Cargado:  /content/drive/MyDrive/Academicos/UTN/BI/datasets/nuevas_estaciones.xlsx


# Subproductos

In [None]:
derivados = {
  "recorridos" : {
    "df"     : pd.DataFrame(),
    "local"  : datadir + 'recorridos.csv',
    "remote" : '',   
  },
  "vueltas" : {
    "df"     : pd.DataFrame(),
    "local"  : datadir + 'vueltas.csv',
    "remote" : '',   
  },
  "estaciones" : {
    "df"     : pd.DataFrame(),
    "local"  : datadir + 'estaciones.csv',
    "remote" : '',   
  },
  "distancias" : {
    "df"     : pd.DataFrame(),
    "local"  : datadir + 'distancias_estaciones.csv',
    "remote" : '',   
  },
  # "rutas" : {
  #   "df"     : pd.DataFrame(),
  #   "local"  : datadir + 'rutas_estaciones.csv',
  #   "remote" : '',   
  # },
  "demanda" : {
    "df"     : pd.DataFrame(),
    "local"  : datadir + 'demanda.csv',
    "remote" : '',   
  },
  # "salidas" : {
  #   "df"     : pd.DataFrame(),
  #   "local"  : datadir + '',
  #   "remote" : '',   
  # },
  # "arribos" : {
  #   "df"     : pd.DataFrame(),
  #   "local"  : datadir + '',
  #   "remote" : '',   
  # },
  "clima" : {
    "df"     : pd.DataFrame(),
    "local"  : datadir + 'clima_datohorario_2020_2021.csv', 
    "remote" : '',
  },
  "pronostico" : {
    "df"     : pd.DataFrame(),
    "local"  : datadir + 'pronostico_2020_2021.csv', 
    "remote" : '',
  },
}

 # Normalizacion


In [None]:
# renombrar y filtrar columnas  2021
encabezados2021 = {
    'ID'                                  : 'id',
    'Duración'                            : 'duracion_recorrido',
    'Estado cerrado'                      : 'estado_cerrado',
    'ID de ciclista'                      : 'id_usuario',
    'Tipo de ciclista'                    : 'cilicsta_tipo',
    'Modelo de bicicleta'                 : 'bicicleta_modelo',   
    'Id de estación de inicio'            : 'id_estacion_origen', 
    'Nombre de estación de inicio'        : 'nombre_estacion_origen',
    'Fecha de inicio'                     : 'fecha_origen_recorrido',     
    'Origen de viaje'                     : 'direccion_estacion_origen',  
#     'lon_estacion_origen'                : 'lon_estacion_origen',        
#     'lat_estacion_origen'                 : 'lat_estacion_origen',
    'Id de estación de fin de viaje'      : 'id_estacion_destino',     
    'Nombre de estación de fin de viaje'  : 'nombre_estacion_destino',
    'Fecha de fin'                        : 'fecha_destino_recorrido',          
#     'direccion_estacion_destino'          : 'direccion_estacion_origen',           
#     'lon_estacion_destino'               : 'lon_estacion_destino',       
#     'lat_estacion_destino'                : 'lat_estacion_destino',
#     'periodo'                             : 'periodo',  
}
insumos["recorridos-2021"]["df"] = insumos["recorridos-2021"]["df"].rename( 
    columns = encabezados2021
)
insumos["recorridos-2021"]["df"]["id"] = insumos["recorridos-2021"]["df"]['id'].astype('Int64')

In [None]:
# Concatenar recorridos 2020:2021
recorridos = [
    insumos["recorridos-2020"]["df"],
    insumos["recorridos-2021"]["df"]
]
derivados["recorridos"]["df"] = pd.concat( recorridos )
del recorridos  

In [None]:
encabezados = {
    'id'                         : 'id',
    'duracion_recorrido'         : 'duracion',
    'estado_cerrado'             : 'estado_cerrado',
    'id_usuario'                 : 'usuario_id',
#     'cilicsta_tipo'            : 'cilicsta_tipo',
#     'bicicleta_modelo'         : 'bicicleta_modelo',   
    'id_estacion_origen'         : 'origen_id', 
    'nombre_estacion_origen'     : 'origen_nombre',
    'fecha_origen_recorrido'     : 'origen_fecha',     
    'direccion_estacion_origen'  : 'origen_direccion',
    'lat_estacion_origen'        : 'origen_lat',  
    'long_estacion_origen'       : 'origen_lon',        
    'id_estacion_destino'        : 'destino_id',     
    'nombre_estacion_destino'    : 'destino_nombre',
    'fecha_destino_recorrido'    : 'destino_fecha',          
    'direccion_estacion_destino' : 'destino_direccion', 
    'lat_estacion_destino'       : 'destino_lat',          
    'long_estacion_destino'      : 'destino_lon',       
#     'periodo'                    : 'periodo',  
}
derivados["recorridos"]["df"] = derivados["recorridos"]["df"].rename( columns = encabezados )
derivados["recorridos"]["df"] = derivados["recorridos"]["df"][ list( encabezados.values() ) ]

In [None]:
derivados["recorridos"]["df"]['destino_nombre'] = derivados["recorridos"]["df"]['destino_nombre'].fillna(
    '999 - Vacio'
)
derivados["recorridos"]["df"]['destino_nombre'] = derivados["recorridos"]["df"]['destino_nombre'].str.replace(
    'Balboa Definitivo', 
    '666 - Balboa Definitivo', 
    regex = False
)

In [None]:
"""
Separar las columnas nombre de estacion de origen y nombre de estacion de destino 
Tienen la info correcta del id de las estaciones.
Divido la columna Nombre de estacion de fin de viaje porque contiene dos campos 
el Id y el nombre de la estacion de inicio de viaje.
"""
derivados["recorridos"]["df"][
    ['origen_id','origen_nombre']
] = derivados["recorridos"]["df"]['origen_nombre'].str.split('-', expand = True )
derivados["recorridos"]["df"]['origen_id'] = derivados["recorridos"]["df"]['origen_id'].str.strip(' ')
derivados["recorridos"]["df"]['origen_nombre'] = derivados["recorridos"]["df"]['origen_nombre'].str.strip(' ')

derivados["recorridos"]["df"][
    ['destino_id','destino_nombre']
] = derivados["recorridos"]["df"]['destino_nombre'].str.split('-', expand = True )
derivados["recorridos"]["df"]['destino_id'] = derivados["recorridos"]["df"]['destino_id'].str.strip(' ')
derivados["recorridos"]["df"]['destino_nombre'] = derivados["recorridos"]["df"]['destino_nombre'].str.strip(' ')

In [None]:
derivados["recorridos"]["df"]['origen_id'] = derivados["recorridos"]["df"]["origen_id"].str.lstrip('0').astype(int)
derivados["recorridos"]["df"]['destino_id'] = derivados["recorridos"]["df"]["destino_id"].str.lstrip('0').astype(int)

derivados["recorridos"]["df"]["origen_nombre"]  = derivados["recorridos"]["df"]["origen_nombre"].str.upper()
derivados["recorridos"]["df"]["destino_nombre"]  = derivados["recorridos"]["df"]["destino_nombre"].str.upper()

derivados["recorridos"]["df"]["origen_direccion"]  = derivados["recorridos"]["df"]["origen_direccion"].str.upper()
derivados["recorridos"]["df"]["destino_direccion"]  = derivados["recorridos"]["df"]["destino_direccion"].str.upper()

## RESAMPLING (REMUESTREO)

### Downsampling frequencies 

Sample size = minutes

In [None]:
formato = '%Y%m%d %H:%M:%S'
derivados["recorridos"]["df"]['origen_datetime'] = pd.to_datetime(
    derivados["recorridos"]["df"]['origen_fecha'], 
    format = formato
)
derivados["recorridos"]["df"]['fecha'] = derivados["recorridos"]["df"]['origen_datetime'].dt.strftime('%Y-%m-%d')
derivados["recorridos"]["df"]['wdia'] = derivados["recorridos"]["df"]['origen_datetime'].dt.strftime('%w').astype(str).astype(int)
derivados["recorridos"]["df"].eval( 'dhabil = wdia > 4', inplace = True )
derivados["recorridos"]["df"]['hora'] = derivados["recorridos"]["df"]['origen_datetime'].dt.strftime('%H').astype(str).astype(int)

# derivados["recorridos"]["df"]['nmes'] = derivados["recorridos"]["df"]['origen_datetime'].dt.strftime('%m').astype(str).astype(int)
# derivados["recorridos"]["df"]['mes']  = bicicletas["recorridos_df"]['start_datetime'].dt.strftime('%b').str.lower()
# derivados["recorridos"]["df"]['ndia'] = derivados["recorridos"]["df"]['origen_datetime'].dt.strftime('%j').astype(str).astype(int)
# derivados["recorridos"]["df"]['hora'] = derivados["recorridos"]["df"]['hora'].astype(str).astype(int)
# derivados["recorridos"]["df"]['mins'] = derivados["recorridos"]["df"]['origen_datetime'].dt.strftime('%M')

# SUBPRODUCTOS / CLUSTERING

# GRUPING & RESHAIPING
## DEMANDA HORARIA

In [None]:
# https://pandas.pydata.org/docs/user_guide/reshaping.html
# demanda_horaria.pivot( index = "origen_id", columns="hora", values="salidas")
# demanda_horaria.stack()
# https://stackoverflow.com/questions/28651079/pandas-unstack-problems-valueerror-index-contains-duplicate-entries-cannot-re
# https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/

demanda_horaria = derivados["recorridos"]["df"]
# Fitro x dia
# demanda_horaria = demanda_horaria[demanda_horaria["fecha"] == '2020-01-01' ]
demanda_horaria = demanda_horaria.groupby(
    ['origen_nombre', 'origen_id','hora',]
).agg(
  salidas = ( 'hora',  'count'),
)
# .reset_index()
# df.set_index(['id', 'date', 'location'], append=True)
demanda_horaria = demanda_horaria.pivot_table(
    index = ['origen_nombre',  'origen_id'],
    columns ="hora",
    values = "salidas",
    # aggfunc ='sum',
).sort_values('origen_id')
demanda_horaria

Unnamed: 0_level_0,hora,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
origen_nombre,origen_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
FACULTAD DE DERECHO,1,395.0,252.0,148.0,102.0,76.0,151.0,440.0,598.0,938.0,904.0,1056.0,1221.0,1536.0,1589.0,1697.0,2303.0,3026.0,3491.0,3795.0,3724.0,2632.0,1564.0,802.0,519.0
RETIRO I,2,72.0,30.0,12.0,6.0,5.0,59.0,120.0,255.0,336.0,440.0,330.0,397.0,465.0,550.0,563.0,596.0,654.0,780.0,843.0,397.0,322.0,271.0,153.0,82.0
ADUANA,3,99.0,84.0,44.0,20.0,25.0,167.0,204.0,509.0,614.0,601.0,588.0,599.0,759.0,899.0,848.0,907.0,844.0,685.0,530.0,367.0,260.0,197.0,141.0,147.0
PLAZA ROMA,4,79.0,41.0,29.0,21.0,13.0,67.0,160.0,238.0,363.0,433.0,251.0,298.0,373.0,397.0,407.0,528.0,625.0,703.0,511.0,350.0,201.0,131.0,125.0,93.0
PLAZA ITALIA,5,655.0,533.0,408.0,231.0,129.0,337.0,376.0,807.0,1197.0,1044.0,1044.0,1477.0,1599.0,1757.0,1917.0,2299.0,2695.0,3017.0,3124.0,2598.0,2226.0,1316.0,932.0,767.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BARRIO 31,393,93.0,63.0,27.0,14.0,37.0,466.0,460.0,257.0,494.0,246.0,294.0,397.0,354.0,552.0,342.0,449.0,515.0,647.0,532.0,308.0,238.0,190.0,161.0,111.0
AV. LA PLATA,394,1.0,2.0,,1.0,,6.0,13.0,41.0,22.0,12.0,51.0,44.0,54.0,49.0,37.0,65.0,77.0,95.0,71.0,43.0,40.0,19.0,11.0,13.0
AGRONOMIA II,395,7.0,5.0,3.0,3.0,,10.0,,7.0,14.0,43.0,12.0,45.0,20.0,19.0,28.0,59.0,74.0,63.0,64.0,49.0,22.0,29.0,10.0,6.0
PLAZA ALBERTI,396,384.0,323.0,165.0,91.0,50.0,84.0,250.0,384.0,685.0,838.0,878.0,1059.0,1145.0,1319.0,1168.0,1435.0,1527.0,1777.0,1940.0,1623.0,1380.0,970.0,638.0,560.0


### SUMINISTRO HORARIO




In [None]:
suministro_horario = derivados["recorridos"]["df"].groupby(
    [ 'destino_nombre',  'destino_id', 'hora' ]
).agg(
  salidas = ( 'hora',  'count'),
)
suministro_horario = suministro_horario.pivot_table(
    index = ['destino_nombre',  'destino_id'],
    columns = "hora",
    values = "salidas",
    # aggfunc = 'sum',
).sort_values('destino_id')
suministro_horario

Unnamed: 0_level_0,hora,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
destino_nombre,destino_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
FACULTAD DE DERECHO,1,336.0,197.0,130.0,46.0,48.0,377.0,372.0,744.0,1081.0,1040.0,1159.0,1406.0,1616.0,1582.0,2040.0,2702.0,3410.0,3580.0,3904.0,2938.0,2112.0,1193.0,745.0,533.0
RETIRO I,2,28.0,27.0,12.0,3.0,10.0,182.0,204.0,327.0,491.0,424.0,389.0,480.0,497.0,657.0,552.0,643.0,675.0,745.0,507.0,341.0,249.0,240.0,101.0,51.0
ADUANA,3,48.0,43.0,24.0,17.0,49.0,251.0,393.0,1064.0,852.0,950.0,730.0,748.0,726.0,865.0,725.0,577.0,571.0,479.0,438.0,346.0,228.0,183.0,180.0,80.0
PLAZA ROMA,4,41.0,27.0,21.0,14.0,29.0,21.0,181.0,379.0,578.0,617.0,473.0,424.0,432.0,497.0,505.0,409.0,478.0,452.0,473.0,228.0,191.0,141.0,169.0,100.0
PLAZA ITALIA,5,517.0,475.0,258.0,122.0,155.0,204.0,486.0,698.0,771.0,825.0,1177.0,1463.0,1673.0,1742.0,1835.0,2266.0,2857.0,3155.0,2707.0,2334.0,1688.0,1174.0,744.0,642.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PLAZA ALBERTI,396,504.0,324.0,206.0,100.0,58.0,121.0,83.0,287.0,515.0,662.0,734.0,972.0,1095.0,1115.0,1068.0,1400.0,1559.0,1865.0,1872.0,2025.0,1794.0,1167.0,757.0,682.0
PLAZA MATHEU,397,,,,,,,,,,,,1.0,1.0,,,,,,,,,,,
GARCIA DEL RIO,399,527.0,322.0,199.0,101.0,52.0,40.0,176.0,497.0,969.0,821.0,1037.0,1229.0,1323.0,1487.0,1565.0,2262.0,2594.0,2802.0,3014.0,2710.0,2144.0,1287.0,711.0,666.0
BALBOA DEFINITIVO,666,1.0,,,1.0,1.0,,,,2.0,,1.0,5.0,9.0,,,2.0,,2.0,3.0,1.0,2.0,4.0,1.0,1.0


In [None]:
demanda_diaria = derivados["recorridos"]["df"].groupby(
    ['origen_nombre', 'origen_id','wdia']
).agg(
  salidas = ( 'wdia',  'count'),
)
demanda_diaria = demanda_diaria.pivot_table(
    index   = ['origen_nombre',  'origen_id'],
    columns = "wdia",
    values  = "salidas",
    # aggfunc = 'count',
).sort_values('origen_id')
demanda_diaria 

Unnamed: 0_level_0,wdia,0,1,2,3,4,5,6
origen_nombre,origen_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
FACULTAD DE DERECHO,1,5806,4792,4357,4371,4298,4175,5160
RETIRO I,2,880,1234,1179,1219,1156,1182,888
ADUANA,3,858,1627,1712,1754,1773,1725,689
PLAZA ROMA,4,616,1052,1016,1072,1086,1058,537
PLAZA ITALIA,5,4906,4599,4493,4617,4697,4544,4629
...,...,...,...,...,...,...,...,...
BARRIO 31,393,493,1133,1230,1222,1266,1154,749
AV. LA PLATA,394,55,124,120,156,136,127,49
AGRONOMIA II,395,44,101,111,115,91,94,36
PLAZA ALBERTI,396,2705,2944,2990,3052,3083,3129,2770


## SALIDAS Y ARRIBOS

In [None]:
salidas = derivados["recorridos"]["df"].groupby(
    ['origen_id', 'origen_direccion','origen_nombre','origen_lat','origen_lon' ]
).agg(
  salidas = pd.NamedAgg( column = "origen_id", aggfunc = "count" ),
).reset_index()

# arribos = pd.DataFrame()
arribos = derivados["recorridos"]["df"].groupby(
    ['destino_id', 'destino_direccion','destino_nombre','destino_lat','destino_lon']
).agg(
  arribos = pd.NamedAgg( column = "destino_id", aggfunc = "count" ),
).reset_index()

# del derivados["estaciones"]["df"] 
derivados["estaciones"]["df"] = pd.merge(
    left     = salidas,
    right    = arribos,
    how      = 'left',
    left_on  = ['origen_id','origen_nombre'],
    right_on = ['destino_id','destino_nombre']
)
del [ salidas , arribos ]
orden_filtro = {
    'origen_id'        :'id',
    'origen_direccion' :'direccion',
    'origen_nombre'    :'nombre',
    'origen_lon'       :'lon',
    'origen_lat'       :'lat',
    'salidas'          :'salidas',
    'arribos'          :'arribos',
#     'destino_id',
#     'destino_direccion',
#     'destino_nombre',
#     'destino_lon',
#     'destino_lat',
}
derivados["estaciones"]["df"] = derivados["estaciones"]["df"].rename( columns = orden_filtro )
derivados["estaciones"]["df"] = derivados["estaciones"]["df"][ list( orden_filtro.values() ) ]
derivados["estaciones"]["df"] = derivados["estaciones"]["df"].sort_values(
  by = ['salidas'], ascending = [False]
)

In [None]:
derivados["estaciones"]["df"]

Unnamed: 0,id,direccion,nombre,lon,lat,salidas,arribos
115,160,GODOY CRUZ 3279 Y LIBERTADOR,GODOY CRUZ Y LIBERTADOR,-58.420626,-34.572583,30295,30386
11,14,"SANTA FE AV. & BULLRICH, INT. AV.",PACIFICO,-58.426385,-34.577424,29080,27442
179,292,OLLEROS AV. & DEL LIBERTADOR AV.,PLAZA BOLIVIA,-58.436113,-34.563539,28774,29526
23,29,AV. PATRICIAS ARGENTINAS & ESTIVAO,PARQUE CENTENARIO,-58.433556,-34.607942,28637,27642
162,255,"SUCRE, ANTONIO JOSE DE, MCAL. Y VERTIZ VIRREY AV.",BARRANCAS DE BELGRANO,-58.448313,-34.559801,25619,25623
...,...,...,...,...,...,...,...
6,7,CARLOS PELEGRINI 215,OBELISCO,-58.381094,-34.606499,1803,2211
192,373,COCHABAMBA 1755,JORGELINA DE SIMONE,-58.390478,-34.623924,1434,1548
77,108,"CAFFARENA, AGUSTIN R. & DON PEDRO DE MENDOZA AV.",USINA DEL ARTE,-58.356254,-34.628759,1391,1460
106,150,AV. ESPAÑA 2200,RODRIGO BUENO,-58.355744,-34.618841,1324,1350


In [None]:
# GRABAR CSV ESTACIONES
# derivados["estaciones"]["df"].to_csv(
#   derivados["estaciones"]["local"],
#   index = False,
# )

## FEATURE DISCOVERY

### duracion_cat (catergories)

In [None]:
# DURACION EN MINUTOS 
derivados["recorridos"]["df"].eval('duracion_mins = duracion / 60', inplace = True)
derivados["recorridos"]["df"]["duracion_mins"] = derivados["recorridos"]["df"]["duracion_mins"].astype(int)
# MINUTOS TO CATEGORIES
# labels = [ "{0}-{1}".format(i, i + 15) for i in range(0, 265, 15) ]
dur_cortes    = [ 0,     5,   15,  30,    45,  60,   90,  120,  240, 320, 60000 ]
dur_etiquetas = [ 'XXS', 'XS', 'S', 'SM', 'M', 'ML', 'L', 'XL', 'XXL', 'OS' ]
derivados["recorridos"]["df"]["duracion_cat"] = pd.cut( 
    derivados["recorridos"]["df"].duracion_mins,
    dur_cortes,
    include_lowest = True,
    labels = dur_etiquetas
   # right = False,
)
derivados["recorridos"]["df"]['duracion_cat'].unique()

['XS', 'S', 'ML', 'SM', 'M', ..., 'XL', 'XXS', 'L', 'OS', NaN]
Length: 11
Categories (10, object): ['XXS' < 'XS' < 'S' < 'SM' ... 'L' < 'XL' < 'XXL' < 'OS']

In [None]:
# derivados["recorridos"]["df"][derivados["recorridos"]["df"]['duracion_cat'] == 'M' ]

### TIPO DE RECORRIDO: VUELTA / IDA

In [None]:
derivados["recorridos"]["df"]['vuelta'] = np.where( 
  derivados["recorridos"]["df"]['origen_id'] == derivados["recorridos"]["df"]['destino_id'], 1, 0
)

In [None]:
# FILTRAR VUELTAs y duracion_cat
vueltas = derivados["recorridos"]["df"][ derivados["recorridos"]["df"]['vuelta'] == 1]
vueltas[ vueltas['duracion_cat'] == 'OS'  ].sort_values("duracion_mins", ascending = False)


Unnamed: 0,id,duracion,estado_cerrado,usuario_id,origen_id,origen_nombre,origen_fecha,origen_direccion,origen_lat,origen_lon,destino_id,destino_nombre,destino_fecha,destino_direccion,destino_lat,destino_lon,origen_datetime,fecha,wdia,dhabil,hora,duracion_mins,duracion_cat,vuelta
1475286,,3190094,,697270,393,BARRIO 31,2020-12-06 22:05:39,CARLOS H. PERETTE 11,-34.582422,-58.378858,393,BARRIO 31,2021-01-12 20:13:53,CARLOS H. PERETTE 11,-34.582422,-58.378858,2020-12-06 22:05:39,2020-12-06,0,False,22,53168,OS,1
1709305,,1481265,,621844,104,FEDERICO LACROZE,2020-01-27 11:08:25,AV. GUZMAN 1100 & AV. CORRIENTES,-34.587617,-58.455211,104,FEDERICO LACROZE,2020-02-13 14:36:10,AV. GUZMAN 1100 & AV. CORRIENTES,-34.587617,-58.455211,2020-01-27 11:08:25,2020-01-27,1,False,11,24687,OS,1
1998307,,1150420,,654493,108,USINA DEL ARTE,2020-02-15 11:53:58,"CAFFARENA, AGUSTIN R. & DON PEDRO DE MENDOZA AV.",-34.628759,-58.356254,108,USINA DEL ARTE,2020-02-28 19:27:38,"CAFFARENA, AGUSTIN R. & DON PEDRO DE MENDOZA AV.",-34.628759,-58.356254,2020-02-15 11:53:58,2020-02-15,6,True,11,19173,OS,1
1935050,,702324,,662154,365,VENANCIO FLORES,2020-03-07 16:33:55,"ARGERICH & FLORES, VENANCIO, GRAL.",-34.630109,-58.473844,365,VENANCIO FLORES,2020-03-15 19:39:19,"ARGERICH & FLORES, VENANCIO, GRAL.",-34.630109,-58.473844,2020-03-07 16:33:55,2020-03-07,6,True,16,11705,OS,1
1934905,,689224,,445415,365,VENANCIO FLORES,2020-08-12 14:37:57,"ARGERICH & FLORES, VENANCIO, GRAL.",-34.630109,-58.473844,365,VENANCIO FLORES,2020-08-20 14:05:01,"ARGERICH & FLORES, VENANCIO, GRAL.",-34.630109,-58.473844,2020-08-12 14:37:57,2020-08-12,3,False,14,11487,OS,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976755,,19290,,40927,75,PLAZA PRIMERO DE MAYO,2020-02-06 12:44:37,238 PICHINCHA,-34.612299,-58.398984,75,PLAZA PRIMERO DE MAYO,2020-02-06 18:06:07,238 PICHINCHA,-34.612299,-58.398984,2020-02-06 12:44:37,2020-02-06,4,False,12,321,OS,1
1935164,,19311,,238619,365,VENANCIO FLORES,2020-01-19 16:30:34,"ARGERICH & FLORES, VENANCIO, GRAL.",-34.630109,-58.473844,365,VENANCIO FLORES,2020-01-19 21:52:25,"ARGERICH & FLORES, VENANCIO, GRAL.",-34.630109,-58.473844,2020-01-19 16:30:34,2020-01-19,0,False,16,321,OS,1
1361526,,19266,,164800,352,SAN JOSE DE FLORES,2020-01-17 20:46:50,AVENIDA RIVADAVIA Y FRAY CAYETANO,-34.629084,-58.463251,352,SAN JOSE DE FLORES,2020-01-18 02:07:56,AVENIDA RIVADAVIA Y FRAY CAYETANO,-34.629084,-58.463251,2020-01-17 20:46:50,2020-01-17,5,True,20,321,OS,1
1520749,,19289,,27010,281,VILLA URQUIZA,2020-02-09 00:51:03,TRIUNVIRATO AV. & ROOSEVELT FRANKLIN D. AV.,-34.573795,-58.487074,281,VILLA URQUIZA,2020-02-09 06:12:32,TRIUNVIRATO AV. & ROOSEVELT FRANKLIN D. AV.,-34.573795,-58.487074,2020-02-09 00:51:03,2020-02-09,0,False,0,321,OS,1


### DISTANCIAS


In [186]:
!pip install openrouteservice



In [187]:
estaciones = [ x for x in derivados["estaciones"]["df"].set_index('id').sort_values('id').iterrows() ]
rutas = {}
nid = 0
for o in estaciones:
  origen_id = int(o[0])
  # https://ask.openrouteservice.org/t/could-not-find-point-within-a-radius-of-350-0-meters/2668/16
  origen_coords = ( o[1]['lon'], o[1]['lat'] ) # ORS takes lon,lat 
  for d in estaciones:
    destino_id = int(d[0])
    destino_coords = ( d[1]['lon'], d[1]['lat'] ) # ibidem
    ruta_id = str( origen_id ) + 'to' + str( destino_id )
    vuelta_id = str( destino_id) + 'to' + str( origen_id )
    ruta = {
      "id" : ruta_id,
      "nid" : nid,
      "origen_id": origen_id,
      "destino_id": destino_id,
      "coords" : ( origen_coords, destino_coords ),
      "distancia" : 0,
      "duracion_estimada" : 0,
      "consultar" : ( origen_id != destino_id ),
      "complemetaria" : vuelta_id,
      "preexiste" : (ruta_id in rutas or vuelta_id in rutas),
    }
    nid+=1
    rutas[ruta_id] = ruta
print( 'rutas:', len( rutas ) )

# del rutas_calcular
rutas_calcular = {}
chunk = 0
cuenta_rutas = 0
for r in rutas:
  ruta = rutas[r]
  if ruta['consultar'] and not ruta["preexiste"]:
    if( (cuenta_rutas % 2000) == 0 ): # due to ors dayly cuota
      chunk = chunk + 1  
    ruta["chunk"] = chunk
    rutas_calcular[r] = ruta
    cuenta_rutas = cuenta_rutas + 1
print( 'rutas a calular:', len( rutas_calcular ) )



rutas: 40000
rutas a calular: 19701


In [189]:
from time import sleep
import openrouteservice as ors

ors_client = ors.Client(
    key = '5b3ce3597851110001cf624827a63bd0239f438caa1f7504352a9fc8'
) 
# del rutas_calculadas
rutas_calculadas = pd.DataFrame()
cuenta_calculadas = 1
for rc in rutas_calcular:
  ruta = rutas[rc]
  if ruta['chunk'] == 1: # CAMBIAR MAÑANA
    if ( ( cuenta_calculadas % 40) == 0): # due to ors minute cuota
      print( 'durmiendo', cuenta_calculadas )
      print( 'rutas calculadas:', rutas_calculadas.size )
      rutas_calculadas
      sleep(60)
    ruta['directions'] = ors_client.directions(
      ruta['coords'],
      profile = 'cycling-regular',
    )
    cuenta_calculadas = cuenta_calculadas + 1
    rutas_calculadas = rutas_calculadas.append(
       ruta,
       ignore_index = True
    )
print( 'rutas calculadas:', rutas_calculadas.size )
# rutas_df = pd.concat(  [ insumos["rutas"]["df"] , rutas_df ] )

rutas_calculadas.to_csv(
  datadir + 'rutas.csv',
  index = False,
)
# rutas_df.reset_index()

ApiError: ignored

In [None]:
rutas_calculadas

In [None]:
rutas_df = pd.read_csv(
  datadir + 'rutas.csv',
)
rutas_df

In [None]:
# GEOPY

# import geopy

# # DF DISTANCIAS
# distancias = pd.DataFrame()
# estaciones = [ x for x in derivados["estaciones"]["df"].set_index('id').sort_values('id').iterrows() ]
# for o in estaciones:
#   print(cuenta_calls)
#   origen_id = int( o[0] )
#   origen_geocoords = ( o[1]['lat'], o[1]['lon'] )
#   # distancias[origen_id] = {}
#   for d in estaciones:
#     destino_id = int( d[0] )
#     destino_geocoords = ( d[1]['lat'], d[1]['lon'] ) 
#     fila = {
#       "id"  : str(origen_id) + 'to' + str(destino_id),
#       "origen_id": origen_id,
#       "destino_id": destino_id,
#       "distancia" : 0,
#     }
#     if (
#         destino_id != origen_id
#     ): 
#       geocoords = ( origen_coords, destino_coords )
#       distancia = round( 
#         geopy.distance.vincenty( origen_geocoords, destino_geocoords ).km, 
#         2
#       ) 
#       fila["distancia"] = distancia
#     distancias = distancias.append( fila, ignore_index = True )
# distancias['origen_id'] = distancias['origen_id'].astype(int)
# distancias
# distancias.to_csv(
#   datadir + 'distancias.csv',
#   index = False,
# )

In [None]:
distancias

Unnamed: 0,destino_id,distancia,duracion_estimada,origen_id
0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0
2,2.0,2001.4,476.2,1.0
3,1.0,2001.4,476.2,2.0
4,3.0,4601.2,996.7,1.0
...,...,...,...,...
3935,12.0,9931.1,1997.7,249.0
3936,250.0,8618.7,1724.3,12.0
3937,12.0,8618.7,1724.3,250.0
3938,251.0,6782.1,1356.4,12.0


### data aligment / Pairing

In [None]:
# derivados["recorridos"]["df"]["distancia"] = 0
derivados["recorridos"]["df"] = pd.merge(
    left     = derivados["recorridos"]["df"],
    right    = distancias,
    how      = 'left', 
    left_on  = ['origen_id','destino_id'], 
    right_on = ['origen_id','destino_id']
)

In [None]:
derivados["recorridos"]["df"]["distancia"].max()

14.46

In [None]:
# derivados["recorridos"]["df"]["distnacia"].unique()
dis_cortes    = [ 0,    2,   4,    6,   8,    10,   12,  14 ]
dis_etiquetas = [ 'XS', 'S', 'SM', 'M', 'ML', 'L', 'XL']
derivados["recorridos"]["df"]["distancia_cat"] = pd.cut( 
    derivados["recorridos"]["df"].distancia,
    dis_cortes,
    # include_lowest = True,
    labels = dis_etiquetas,
    ordered = True,
   # right = False,
)
derivados["recorridos"]["df"]['distancia_cat'].unique()

['S', 'SM', 'XS', 'M', 'ML', NaN, 'L', 'XL']
Categories (7, object): ['XS' < 'S' < 'SM' < 'M' < 'ML' < 'L' < 'XL']

In [None]:
derivados["recorridos"]["df"][derivados["recorridos"]["df"]["vuelta"] == 0 ]

Unnamed: 0,usuario_id,origen_id,origen_nombre,destino_id,destino_nombre,vuelta,duracion,duracion_mins,duracion_cat,distancia,distancia_cat,velocidad_kmm,velocidad_cat,origen_datetime,fecha,wdia,hora,dhabil,DD,FF,HUM,PNM,TEMP,PRCP,TAVG
0,666202,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,882,14,XS,2.43,S,0.173571,nrapido,2020-12-31 23:54:22,2020-12-31,4,23,False,320,17,45.0,1013.2,22.7,,22.3
1,332689,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,894,14,XS,2.43,S,0.173571,nrapido,2020-12-29 12:39:18,2020-12-29,2,12,False,50,11,70.0,1008.6,26.4,0.0,26.1
2,57949,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,1361,22,S,2.43,S,0.110455,normal,2020-12-21 21:21:38,2020-12-21,1,21,False,90,13,71.0,1016.3,21.1,,19.6
3,454846,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,1010,16,S,2.43,S,0.151875,nrapido,2020-12-20 21:58:19,2020-12-20,0,21,False,200,7,38.0,1014.0,20.5,,18.6
4,690975,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,1490,24,S,2.43,S,0.101250,normal,2020-12-20 13:32:53,2020-12-20,0,13,False,230,20,39.0,1012.1,20.6,,18.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3362562,62246,292,PLAZA BOLIVIA,44,ECOPARQUE,0,690,11,XS,2.43,S,0.220909,rapido,2021-01-01 00:09:43,2021-01-01,5,0,True,320,15,44.0,1013.5,22.0,,25.3
3362563,445201,79,AZUCENA VILLAFLOR,168,ESTADOS UNIDOS,0,1360,22,S,1.75,XS,0.079545,lento,2021-01-01 00:08:40,2021-01-01,5,0,True,320,15,44.0,1013.5,22.0,,25.3
3362564,554162,79,AZUCENA VILLAFLOR,8,CONGRESO,0,1169,19,S,2.35,S,0.123684,normal,2021-01-01 00:08:37,2021-01-01,5,0,True,320,15,44.0,1013.5,22.0,,25.3
3362565,51005,79,AZUCENA VILLAFLOR,75,PLAZA PRIMERO DE MAYO,0,1031,17,S,3.22,S,0.189412,nrapido,2021-01-01 00:06:50,2021-01-01,5,0,True,320,15,44.0,1013.5,22.0,,25.3


### VELOCIDAD

In [None]:
derivados["recorridos"]["df"].eval( 'velocidad_kmm = distancia / duracion_mins', inplace = True )
derivados["recorridos"]["df"][ derivados["recorridos"]["df"]["distancia_cat"] == "XL" ]

Unnamed: 0,id,duracion,estado_cerrado,usuario_id,origen_id,origen_nombre,origen_fecha,origen_direccion,origen_lat,origen_lon,destino_id,destino_nombre,destino_fecha,destino_direccion,destino_lat,destino_lon,origen_datetime,fecha,wdia,dhabil,hora,duracion_mins,duracion_cat,vuelta,distancia,distancia_cat,velocidad_kmm
551928,,4555,,62168,399,GARCIA DEL RIO,2020-02-15 14:15:10,GARCIA DEL RIO & PINTO,-34.550014,-58.476521,117,HUMBERTO 1°,2020-02-15 15:31:05,PERU 1016,-34.620102,-58.374172,2020-02-15 14:15:10,2020-02-15,6,True,14,75,ML,0,12.19,XL,0.162533
805074,,3411,,704610,153,JUAN MANUEL DE BLANES,2020-11-29 17:32:36,"383 BLANES, JUAN MANUEL",-34.630778,-58.362066,346,PLAZA ZAPIOLA,2020-11-29 18:29:27,2001 DONADO,-34.573522,-58.474635,2020-11-29 17:32:36,2020-11-29,0,False,17,56,M,0,12.12,XL,0.216429
814437,,5872,,620511,179,CASA SAN,2020-01-19 16:41:12,OLAVARRÍA 811,-34.638480,-58.364284,346,PLAZA ZAPIOLA,2020-01-19 18:19:04,2001 DONADO,-34.573522,-58.474635,2020-01-19 16:41:12,2020-01-19,0,False,16,97,L,0,12.42,XL,0.128041
815372,,3444,,704610,108,USINA DEL ARTE,2020-11-08 16:04:40,"CAFFARENA, AGUSTIN R. & DON PEDRO DE MENDOZA AV.",-34.628759,-58.356254,346,PLAZA ZAPIOLA,2020-11-08 17:02:04,2001 DONADO,-34.573522,-58.474635,2020-11-08 16:04:40,2020-11-08,0,False,16,57,M,0,12.47,XL,0.218772
960570,,3681,,170841,346,PLAZA ZAPIOLA,2020-11-30 19:31:05,2001 DONADO,-34.573522,-58.474635,126,MINISTERIO DE JUSTICIA Y SEGURIDAD,2020-11-30 20:32:26,REGIMIENTO DE PATRICIOS AV. & ARAOZ DE LAMADRI...,-34.640269,-58.369220,2020-11-30 19:31:05,2020-11-30,1,False,19,61,ML,0,12.18,XL,0.199672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3032282,10028182,3525,NORMAL,587398,79,AZUCENA VILLAFLOR,2021-02-05 21:12:26,CODE,,,281,VILLA URQUIZA,2021-02-05 22:11:11,,,,2021-02-05 21:12:26,2021-02-05,5,True,21,58,M,0,12.06,XL,0.207931
3044507,10014553,3276,NORMAL,587398,151,AIME PAINÉ,2021-02-04 21:08:37,CODE,,,281,VILLA URQUIZA,2021-02-04 22:03:13,,,,2021-02-04 21:08:37,2021-02-04,4,False,21,54,M,0,12.29,XL,0.227593
3217282,9814784,3771,NORMAL,732306,399,GARCIA DEL RIO,2021-01-17 17:30:34,CODE,,,79,AZUCENA VILLAFLOR,2021-01-17 18:33:25,,,,2021-01-17 17:30:34,2021-01-17,0,False,17,62,ML,0,12.40,XL,0.200000
3277870,9743882,4364,NORMAL,653517,196,HOSPITAL ARGERICH,2021-01-10 20:05:15,CODE,,,399,GARCIA DEL RIO,2021-01-10 21:17:59,,,,2021-01-10 20:05:15,2021-01-10,0,False,20,72,ML,0,13.32,XL,0.185000


In [None]:
column = derivados["recorridos"]["df"][ derivados["recorridos"]["df"]['velocidad_kmm'] != np.inf ]["velocidad_kmm"].dropna()
max_value = column.max()
mean_value = column.mean()
print(max_value)
print(mean_value)
derivados["recorridos"]["df"]["velocidad_kmm"].mean()

1.505
0.1250904710390281


inf

In [None]:
vel_cortes    = [  0,      0.05,     0.1,    0.15,    0.2,     0.25,     0.3,         0.5,        0.7,      1 ]
vel_etiquetas = [ 'slento','lento','normal','nrapido','rapido','xrapido','xxrapido', 'xxxrapido', 'mrapido'   ]
derivados["recorridos"]["df"]["velocidad_cat"] = pd.cut( 
    derivados["recorridos"]["df"].velocidad_kmm,
    vel_cortes,
    include_lowest = True,
    labels = vel_etiquetas,
    ordered = True,
    # right = False,
)
derivados["recorridos"]["df"]['velocidad_cat'].unique()

['nrapido', 'normal', 'lento', 'rapido', 'slento', NaN, 'xrapido', 'xxrapido', 'xxxrapido', 'mrapido']
Categories (9, object): ['slento' < 'lento' < 'normal' < 'nrapido' ... 'xrapido' < 'xxrapido' <
                         'xxxrapido' < 'mrapido']

In [None]:
derivados["recorridos"]["df"][derivados["recorridos"]["df"]['velocidad_cat'] == 'slento']

Unnamed: 0,id,duracion,estado_cerrado,usuario_id,origen_id,origen_nombre,origen_fecha,origen_direccion,origen_lat,origen_lon,destino_id,destino_nombre,destino_fecha,destino_direccion,destino_lat,destino_lon,origen_datetime,fecha,wdia,dhabil,hora,duracion_mins,duracion_cat,vuelta,distancia,distancia_cat,velocidad_kmm,velocidad_cat
19,,4203,,695345,116,HOSPITAL ALEMÁN,2020-11-08 16:15:49,BERUTI & ECUADOR,-34.592171,-58.402586,142,ARMENIA Y GORRITI,2020-11-08 17:25:52,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-11-08 16:15:49,2020-11-08,0,False,16,70,ML,0,2.43,S,0.034714,slento
35,,3274,,199293,116,HOSPITAL ALEMÁN,2020-10-12 18:34:11,BERUTI & ECUADOR,-34.592171,-58.402586,142,ARMENIA Y GORRITI,2020-10-12 19:28:45,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-10-12 18:34:11,2020-10-12,1,False,18,54,M,0,2.43,S,0.045000,slento
66,,3167,,648203,116,HOSPITAL ALEMÁN,2020-02-04 13:30:38,BERUTI & ECUADOR,-34.592171,-58.402586,142,ARMENIA Y GORRITI,2020-02-04 14:23:25,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-02-04 13:30:38,2020-02-04,2,False,13,52,M,0,2.43,S,0.046731,slento
307,,3429,,706730,59,CORONEL DIAZ,2020-10-25 15:18:01,GUEMES 3330 & CORONEL DIAZ,-34.589968,-58.411490,142,ARMENIA Y GORRITI,2020-10-25 16:15:10,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-10-25 15:18:01,2020-10-25,0,False,15,57,M,0,1.61,XS,0.028246,slento
345,,2625,,340008,59,CORONEL DIAZ,2020-08-05 18:55:09,GUEMES 3330 & CORONEL DIAZ,-34.589968,-58.411490,142,ARMENIA Y GORRITI,2020-08-05 19:38:54,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-08-05 18:55:09,2020-08-05,3,False,18,43,SM,0,1.61,XS,0.037442,slento
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3362522,9646439,337,NORMAL,447308,8,CONGRESO,2021-01-01 00:36:14,CODE,,,8,CONGRESO,2021-01-01 00:41:51,,,,2021-01-01 00:36:14,2021-01-01,5,True,0,5,XXS,1,0.00,,0.000000,slento
3362536,9646425,2769,NORMAL,574671,350,PLAZA IRLANDA,2021-01-01 00:30:14,CODE,,,384,PLAZA BENITO NAZAR,2021-01-01 01:16:23,,,,2021-01-01 00:30:14,2021-01-01,5,True,0,46,M,0,1.49,XS,0.032391,slento
3362543,9646416,2191,NORMAL,47133,25,PLAZA GUEMES,2021-01-01 00:25:05,CODE,,,25,PLAZA GUEMES,2021-01-01 01:01:36,,,,2021-01-01 00:25:05,2021-01-01,5,True,0,36,SM,1,0.00,,0.000000,slento
3362549,9646409,77,NORMAL,42585,272,PLAZA BRUNO GIORDANO,2021-01-01 00:22:42,CODE,,,272,PLAZA BRUNO GIORDANO,2021-01-01 00:23:59,,,,2021-01-01 00:22:42,2021-01-01,5,True,0,1,XXS,1,0.00,,0.000000,slento



### CLIMA
https://www.smn.gob.ar/descarga-de-datos
https://www.smn.gob.ar/datos-abiertos-smn


### Regular sequences: pd.date_range()

Regular date sequences 

In [None]:
# Lista de dias a obtener
sdate = date(2020,1,1)  # start date
today = datetime.date.today()
yesterday = today - datetime.timedelta( days = 1 )
dias = pd.date_range(
  sdate, 
  yesterday-timedelta( days  1 ),
  freq='d'
)

### Observaciones Horarias
```
$ curl -v --silent https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=observaciones/datohorario20210518.txt 2>&1 | grep AERO
!curl --silent https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=observaciones/datohorario20210518.txt | head -n 1
!curl --silent https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=observaciones/datohorario20210518.txt | grep AEROPARQUE

```

In [None]:
!curl --silent https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=observaciones/datohorario20210518.txt | grep AEROPARQUE

FECHA     HORA  TEMP   HUM   PNM    DD    FF     NOMBRE                                             
         [HOA]  [�C]   [%]  [hPa]  [gr] [km/hr]                                                     


#### Pronostico
```
curl --silent https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=pron5d/pron20210101.txt | head -n 52
curl --silent https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=pron5d/pron20210101.txt | head -n 18
curl --silent https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=pron5d/pron20210101.txt | head -n 18 | tail -n 8 | awk '/1/ -F ":" {print $1,$2,$7}'
```

In [None]:
!curl --silent https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=pron5d/pron20210101.txt | head -n 18 | tail -n 8 | awk '/1/ -F ":" {print $1,$2,$7}'

01/ENE/2021 00Hs. 0.0
01/ENE/2021 03Hs. 0.0
01/ENE/2021 06Hs. 0.0
01/ENE/2021 09Hs. 0.0
01/ENE/2021 12Hs. 0.0
01/ENE/2021 15Hs. 0.0
01/ENE/2021 18Hs. 0.0
01/ENE/2021 21Hs. 0.0


In [None]:
""" Obtener datos horarios para cada dia  """
dh_formato = '%d%m%Y%H'
prefijo = 'https://ssl.smn.gob.ar/dpd/descarga_opendata.php?file=observaciones/datohorario'
sufijo = '.txt'
for i, d in enumerate(dias):
  dstr = d.strftime('%Y%m%d') 
  dato_horario = prefijo + dstr + sufijo
  req = urllib.request.Request( dato_horario )
  response = urllib.request.urlopen( req )
  lines = response.readlines()
  fecha = d.strftime('%Y-%m-%d')
  print(i, 'procesando', dato_horario)
  # encabezados = lines[0].decode("utf-8").split()  
  # ['FECHA', 'HORA', 'TEMP', 'HUM', 'PNM', 'DD', 'FF', 'NOMBRE']
  # [b'[HOA]', b'[\xbaC]', b'[%]', b'[hPa]', b'[gr]', b'[km/hr]']
  # fstl = lines[2].decode("utf-8").split()  
  # print( encabezados )
  # print( fstl )
  
  for l in lines:
    if b'AEROPARQUE' in l:
      row =  l.decode("utf-8").split()
      # dh = str(row[0]) + str(row[1])
      # dh_dt = pd.to_datetime(
      #   dh, 
      #   format = dh_formato
      # ) 
      dh_dic = {
        # 'DH_DT'   : dh_dt,
        'FECHA'   : fecha,
        'HORA'    : row[1],
        'TEMP'    : row[2], 
        'HUM'     : row[3], 
        'PNM'     : row[4], 
        'DD'      : row[5], 
        'FF'      : row[6], 
        # 'NOMBRE'  : row[7]
      }
      derivados["clima"]["df"] = derivados["clima"]["df"].append(
        dh_dic,
        ignore_index=True
      )

In [None]:
# CARGAR CLIMA
derivados["clima"]["df"] = pd.read_csv(
  derivados["clima"]["local"],
)
derivados["clima"]["df"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13082 entries, 0 to 13081
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   DD      13082 non-null  object 
 1   FECHA   13082 non-null  object 
 2   FF      13082 non-null  object 
 3   HORA    13082 non-null  int64  
 4   HUM     13082 non-null  float64
 5   PNM     13082 non-null  float64
 6   TEMP    13082 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 715.5+ KB


In [None]:
# formato = '%Y-%m-%d'
# derivados["clima"]["df"]['DATE'] = pd.to_datetime(
#   derivados["clima"]["df"]['DATE'], 
#   format = formato
# )
# # derivados["clima"]["df"]['TAVG'].unique()
# derivados["clima"]["df"]['TEMP'].unique()

In [None]:
derivados["recorridos"]["df"] = pd.merge(
    left     = derivados["recorridos"]["df"],
    right    = derivados["clima"]["df"],
    how      = 'left', 
    left_on  = ['fecha','hora'], 
    right_on = ['FECHA','HORA']
)
derivados["recorridos"]["df"]

Unnamed: 0,id,duracion,estado_cerrado,usuario_id,origen_id,origen_nombre,origen_fecha,origen_direccion,origen_lat,origen_lon,destino_id,destino_nombre,destino_fecha,destino_direccion,destino_lat,destino_lon,origen_datetime,fecha,wdia,dhabil,hora,duracion_mins,duracion_cat,vuelta,distancia,distancia_cat,velocidad_kmm,velocidad_cat,DD,FECHA,FF,HORA,HUM,PNM,TEMP
0,,882,,666202,116,HOSPITAL ALEMÁN,2020-12-31 23:54:22,BERUTI & ECUADOR,-34.592171,-58.402586,142,ARMENIA Y GORRITI,2021-01-01 00:09:04,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-12-31 23:54:22,2020-12-31,4,False,23,14,XS,0,2.43,S,0.173571,nrapido,320,2020-12-31,17,23.0,45.0,1013.2,22.7
1,,894,,332689,116,HOSPITAL ALEMÁN,2020-12-29 12:39:18,BERUTI & ECUADOR,-34.592171,-58.402586,142,ARMENIA Y GORRITI,2020-12-29 12:54:12,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-12-29 12:39:18,2020-12-29,2,False,12,14,XS,0,2.43,S,0.173571,nrapido,50,2020-12-29,11,12.0,70.0,1008.6,26.4
2,,1361,,57949,116,HOSPITAL ALEMÁN,2020-12-21 21:21:38,BERUTI & ECUADOR,-34.592171,-58.402586,142,ARMENIA Y GORRITI,2020-12-21 21:44:19,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-12-21 21:21:38,2020-12-21,1,False,21,22,S,0,2.43,S,0.110455,normal,90,2020-12-21,13,21.0,71.0,1016.3,21.1
3,,1010,,454846,116,HOSPITAL ALEMÁN,2020-12-20 21:58:19,BERUTI & ECUADOR,-34.592171,-58.402586,142,ARMENIA Y GORRITI,2020-12-20 22:15:09,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-12-20 21:58:19,2020-12-20,0,False,21,16,S,0,2.43,S,0.151875,nrapido,200,2020-12-20,7,21.0,38.0,1014.0,20.5
4,,1490,,690975,116,HOSPITAL ALEMÁN,2020-12-20 13:32:53,BERUTI & ECUADOR,-34.592171,-58.402586,142,ARMENIA Y GORRITI,2020-12-20 13:57:43,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-12-20 13:32:53,2020-12-20,0,False,13,24,S,0,2.43,S,0.101250,normal,230,2020-12-20,20,13.0,39.0,1012.1,20.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3362562,9646396,690,NORMAL,62246,292,PLAZA BOLIVIA,2021-01-01 00:09:43,CODE,,,44,ECOPARQUE,2021-01-01 00:21:13,,,,2021-01-01 00:09:43,2021-01-01,5,True,0,11,XS,0,2.43,S,0.220909,rapido,320,2021-01-01,15,0.0,44.0,1013.5,22.0
3362563,9646395,1360,NORMAL,445201,79,AZUCENA VILLAFLOR,2021-01-01 00:08:40,CODE,,,168,ESTADOS UNIDOS,2021-01-01 00:31:20,,,,2021-01-01 00:08:40,2021-01-01,5,True,0,22,S,0,1.75,XS,0.079545,lento,320,2021-01-01,15,0.0,44.0,1013.5,22.0
3362564,9646394,1169,NORMAL,554162,79,AZUCENA VILLAFLOR,2021-01-01 00:08:37,CODE,,,8,CONGRESO,2021-01-01 00:28:06,,,,2021-01-01 00:08:37,2021-01-01,5,True,0,19,S,0,2.35,S,0.123684,normal,320,2021-01-01,15,0.0,44.0,1013.5,22.0
3362565,9646393,1031,NORMAL,51005,79,AZUCENA VILLAFLOR,2021-01-01 00:06:50,CODE,,,75,PLAZA PRIMERO DE MAYO,2021-01-01 00:24:01,,,,2021-01-01 00:06:50,2021-01-01,5,True,0,17,S,0,3.22,S,0.189412,nrapido,320,2021-01-01,15,0.0,44.0,1013.5,22.0


In [None]:
derivados["recorridos"]["df"] = pd.merge(
    left     = derivados["recorridos"]["df"],
    right    = insumos["clima"]["df"],
    how      = 'left', 
    left_on  = 'fecha', 
    right_on = 'DATE'
)

In [None]:
orden = [
#  'id',
#  'estado_cerrado',
  'usuario_id',
  'origen_id',
  'origen_nombre',
  # 'origen_fecha',
  # 'origen_direccion',
  # 'origen_lon',
  # 'origen_lat',
  'destino_id',
  'destino_nombre',
#  'destino_fecha',
#  'destino_direccion',
#  'destino_lon',
#  'destino_lat',
  'vuelta', 
  'duracion',
  'duracion_mins',
  'duracion_cat',
  'distancia',
  'distancia_cat',
  'velocidad_kmm',
  'velocidad_cat',
  'origen_datetime',
  'fecha',
  'wdia',
  'hora',
  'dhabil',
  'DD',
  'FF',
  'HUM',
  'PNM',
  'TEMP',
  'PRCP',
  'TAVG',
#  'FECHA',
#  'HORA',
#  STATION	NAME	LATITUDE	lonITUDE	ELEVATION	DATE
#  PRCP_ATTRIBUTES
#  TAVG_ATTRIBUTES	TMAX	TMAX_ATTRIBUTES	TMIN	TMIN_ATTRIBUTES                                                
]
derivados["recorridos"]["df"] = derivados["recorridos"]["df"][ orden ]
derivados["recorridos"]["df"]

Unnamed: 0,usuario_id,origen_id,origen_nombre,destino_id,destino_nombre,vuelta,duracion,duracion_mins,duracion_cat,distancia,distancia_cat,velocidad_kmm,velocidad_cat,origen_datetime,fecha,wdia,hora,dhabil,DD,FF,HUM,PNM,TEMP,PRCP,TAVG
0,666202,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,882,14,XS,2.43,S,0.173571,nrapido,2020-12-31 23:54:22,2020-12-31,4,23,False,320,17,45.0,1013.2,22.7,,22.3
1,332689,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,894,14,XS,2.43,S,0.173571,nrapido,2020-12-29 12:39:18,2020-12-29,2,12,False,50,11,70.0,1008.6,26.4,0.0,26.1
2,57949,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,1361,22,S,2.43,S,0.110455,normal,2020-12-21 21:21:38,2020-12-21,1,21,False,90,13,71.0,1016.3,21.1,,19.6
3,454846,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,1010,16,S,2.43,S,0.151875,nrapido,2020-12-20 21:58:19,2020-12-20,0,21,False,200,7,38.0,1014.0,20.5,,18.6
4,690975,116,HOSPITAL ALEMÁN,142,ARMENIA Y GORRITI,0,1490,24,S,2.43,S,0.101250,normal,2020-12-20 13:32:53,2020-12-20,0,13,False,230,20,39.0,1012.1,20.6,,18.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3362562,62246,292,PLAZA BOLIVIA,44,ECOPARQUE,0,690,11,XS,2.43,S,0.220909,rapido,2021-01-01 00:09:43,2021-01-01,5,0,True,320,15,44.0,1013.5,22.0,,25.3
3362563,445201,79,AZUCENA VILLAFLOR,168,ESTADOS UNIDOS,0,1360,22,S,1.75,XS,0.079545,lento,2021-01-01 00:08:40,2021-01-01,5,0,True,320,15,44.0,1013.5,22.0,,25.3
3362564,554162,79,AZUCENA VILLAFLOR,8,CONGRESO,0,1169,19,S,2.35,S,0.123684,normal,2021-01-01 00:08:37,2021-01-01,5,0,True,320,15,44.0,1013.5,22.0,,25.3
3362565,51005,79,AZUCENA VILLAFLOR,75,PLAZA PRIMERO DE MAYO,0,1031,17,S,3.22,S,0.189412,nrapido,2021-01-01 00:06:50,2021-01-01,5,0,True,320,15,44.0,1013.5,22.0,,25.3


In [None]:
# recorridos_clima 
derivados["recorridos"]["df"].to_csv(
  derivados["recorridos"]["local"],
  index = False,
)

In [None]:
derivados["recorridos"]["df"].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3362567 entries, 0 to 3362566
Data columns (total 25 columns):
 #   Column           Dtype         
---  ------           -----         
 0   usuario_id       int64         
 1   origen_id        int64         
 2   origen_nombre    object        
 3   destino_id       int64         
 4   destino_nombre   object        
 5   vuelta           int64         
 6   duracion         int64         
 7   duracion_mins    int64         
 8   duracion_cat     category      
 9   distancia        float64       
 10  distancia_cat    category      
 11  velocidad_kmm    float64       
 12  velocidad_cat    category      
 13  origen_datetime  datetime64[ns]
 14  fecha            object        
 15  wdia             int64         
 16  hora             int64         
 17  dhabil           bool          
 18  DD               object        
 19  FF               object        
 20  HUM              float64       
 21  PNM              float64       

In [None]:
# GRUPING USERS
usurios = derivados["recorridos"]["df"].groupby(
    ['usuario_id']
).agg(
  salidas = ( 'destino_id',  'count'),
  vueltas = ( 'vuelta',  'sum'),
  velocidad = ( 'velocidad_kmm',  'max'),
  distancia = ( 'distancia',  'max')
)
# usurios = usurios.pivot_table(
#     index   = [ 'usuario_id'],
#     columns = "salidas",
#     values  = "salidas",
#     # aggfunc = 'count',
# ).sort_values('origen_id')
# demanda_diaria # comprobar

In [None]:
usurios = usurios[ ~usurios.isin([np.nan, np.inf, -np.inf]).any(1) ]
# usurios
usurios.sort_values( 'velocidad' , ascending = False).head(30)

Unnamed: 0_level_0,salidas,vueltas,velocidad,distancia
usuario_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
759740,5,0,1.505,6.02
768677,5,4,1.505,6.02
689454,172,5,1.03,5.03
666201,160,9,1.03,4.99
69597,13,0,1.03,5.76
635848,31,4,1.03,2.46
322243,28,4,1.03,1.14
1974,36,2,1.03,4.59
543803,91,12,1.03,6.25
480099,7,0,1.03,1.03


# Analisis Exploratorio de Datos Espaciales



In [None]:
demanda_viz = derivados["recorridos"]["df"]
# Fitro x dia

demanda_viz = demanda_viz[ demanda_viz["fecha"] == '2020-01-01' ]
demanda_viz

Unnamed: 0,id,duracion,estado_cerrado,usuario_id,origen_id,origen_nombre,origen_fecha,origen_direccion,origen_lat,origen_lon,destino_id,destino_nombre,destino_fecha,destino_direccion,destino_lat,destino_lon,origen_datetime,fecha,wdia,dhabil,hora,duracion_mins,duracion_cat,vuelta
1428,,1484,,625808,292,PLAZA BOLIVIA,2020-01-01 20:34:11,OLLEROS AV. & DEL LIBERTADOR AV.,-34.563539,-58.436113,142,ARMENIA Y GORRITI,2020-01-01 20:58:55,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-01-01 20:34:11,2020-01-01,3,False,20,24,S,0
1429,,5352,,269625,292,PLAZA BOLIVIA,2020-01-01 18:14:09,OLLEROS AV. & DEL LIBERTADOR AV.,-34.563539,-58.436113,142,ARMENIA Y GORRITI,2020-01-01 19:43:21,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-01-01 18:14:09,2020-01-01,3,False,18,89,ML,0
1744,,838,,379084,136,ACEVEDO,2020-01-01 01:59:47,ACEVEDO 220 Y PADILLA,-34.599659,-58.442684,142,ARMENIA Y GORRITI,2020-01-01 02:13:45,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-01-01 01:59:47,2020-01-01,3,False,1,13,XS,0
2103,,1111,,154973,159,FITZ ROY & GORRITI,2020-01-01 00:55:13,FITZ ROY & GORRITI,-34.584950,-58.437337,142,ARMENIA Y GORRITI,2020-01-01 01:13:44,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-01-01 00:55:13,2020-01-01,3,False,0,18,S,0
2565,,851,,227446,134,DON BOSCO,2020-01-01 19:07:08,MAZA Y DON BOSCO,-34.612231,-58.416777,142,ARMENIA Y GORRITI,2020-01-01 19:21:19,ARMENIA 1525 Y GORRITI,-34.590541,-58.428972,2020-01-01 19:07:08,2020-01-01,3,False,19,14,XS,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002258,,4148,,831,179,CASA SAN,2020-01-01 11:57:08,OLAVARRÍA 811,-34.638480,-58.364284,179,CASA SAN,2020-01-01 13:06:16,OLAVARRÍA 811,-34.638480,-58.364284,2020-01-01 11:57:08,2020-01-01,3,False,11,69,ML,1
2002259,,4897,,483248,179,CASA SAN,2020-01-01 02:52:31,OLAVARRÍA 811,-34.638480,-58.364284,179,CASA SAN,2020-01-01 04:14:08,OLAVARRÍA 811,-34.638480,-58.364284,2020-01-01 02:52:31,2020-01-01,3,False,2,81,ML,1
2002260,,2060,,578343,179,CASA SAN,2020-01-01 01:53:51,OLAVARRÍA 811,-34.638480,-58.364284,179,CASA SAN,2020-01-01 02:28:11,OLAVARRÍA 811,-34.638480,-58.364284,2020-01-01 01:53:51,2020-01-01,3,False,1,34,SM,1
2002767,,507,,578133,108,USINA DEL ARTE,2020-01-01 11:16:18,"CAFFARENA, AGUSTIN R. & DON PEDRO DE MENDOZA AV.",-34.628759,-58.356254,179,CASA SAN,2020-01-01 11:24:45,OLAVARRÍA 811,-34.638480,-58.364284,2020-01-01 11:16:18,2020-01-01,3,False,11,8,XS,0


In [None]:
demanda_viz = derivados["recorridos"]["df"]
# Fitro x dia
fecha_viz = '2020-03-01'
demanda_viz = demanda_viz[ demanda_viz["fecha"] == fecha_viz ]
demanda_viz = demanda_viz.groupby(
    ['origen_nombre', 'origen_id', 'hora', 'origen_lon','origen_lat']
).agg(
  salidas = ( 'hora',  'count' ),
  nombre = ( 'origen_nombre',  'first' ),
  id = ( 'origen_id',  'first' ),
  lon = ( 'origen_lon',  'first' ),
  lat= ( 'origen_lat',  'first' ),
)
# ).reset_index()
# df.set_index(['id', 'date', 'location'], append=True)
demanda_viz = demanda_viz.pivot_table(
    index = ['nombre',  'id', 'lon','lat'],
    columns = "hora",
    values = "salidas",
    fill_value = 0,
    # aggfunc = 'sum',
).sort_values('id')
demanda_viz

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hora,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
nombre,id,lon,lat,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
FACULTAD DE DERECHO,1,-58.390598,-34.583749,0,2,2,1,0,0,0,0,0,0,0,1,1,0,3,5,3,3,0,3,3,0,3,1
RETIRO I,2,-58.374711,-34.592424,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0
ADUANA,3,-58.368256,-34.611033,0,0,0,0,0,0,2,3,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0
PLAZA ROMA,4,-58.368777,-34.601823,0,0,0,0,5,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,1,1,0
PLAZA ITALIA,5,-58.420952,-34.580550,3,0,0,1,1,0,1,2,0,1,0,0,5,5,1,1,1,1,0,7,4,5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LUNA,387,-58.399647,-34.638586,0,2,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
PLAZA REPÚBLICA DE ECUADOR,391,-58.420705,-34.565203,0,1,0,0,0,0,0,0,0,1,1,1,1,0,0,6,1,3,5,1,0,2,0,0
BARRIO 31,393,-58.378858,-34.582422,1,0,0,0,0,0,0,1,1,0,1,1,0,1,1,0,0,0,0,2,4,1,0,0
PLAZA ALBERTI,396,-58.458537,-34.555080,3,0,1,0,0,0,1,0,1,1,1,5,5,0,1,1,2,4,0,0,5,3,0,1


## Exploratory Spatial Data Analysis (ESDA)

https://alcidanalytics.com/p/geographic-heatmap-in-python
https://alysivji.github.io/getting-started-with-folium.html
https://medium.com/analytics-vidhya/measure-driving-distance-time-and-plot-routes-between-two-geographical-locations-using-python-39995dfea7e

In [None]:
import folium
from folium import plugins
# import pandas as pd
# import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [183]:
#... where m is my map object. And 5 is the time (seconds) to render the map.
# https://stackoverflow.com/questions/53565979/export-a-folium-map-as-a-png
# https://blog.ouseful.info/2015/12/15/grabbing-screenshots-of-folium-produced-choropleth-leaflet-maps-using-selenium/

import io
# !pip install selenium
# from PIL import Image

import os
import time
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
binary = FirefoxBinary('path/to/installed firefox binary')
browser = webdriver.Firefox(firefox_binary=binary)

# del m
m = folium.Map(
  [ -34.606499,-58.381094 ],
  zoom_start = 14
)
# del m
delay=5
 
# Save the map as an HTML file
fn='testmap.html'
tmpurl='file://{path}/{mapfile}'.format(path=os.getcwd(),mapfile=fn)
m.save(fn)
 
#Open a browser window...
browser = webdriver.Firefox()
#..that displays the map...
browser.get(tmpurl)
#Give the map tiles some time to load
time.sleep(delay)
#Grab the screenshot
browser.save_screenshot('map.png')
#Close the browser
browser.quit()

    # img_data = m._to_png(5)
    # img = Image.open(io.BytesIO(img_data))
    # img.save(fecha_viz+_h_+'map.png'


WebDriverException: ignored

In [182]:
for h in demanda_viz.columns.to_list():
  #  del m
  m = folium.Map(
    [ -34.606499,-58.381094 ],
    zoom_start = 14
  )
  demandas = demanda_viz[h]
  estaciones = demanda_viz[h].reset_index()
  for e in estaciones.iterrows():
    estacion = e[1]
    # print(
    #     'hora',h,
    #     estacion['id'],
    #     estacion['nombre'],
    #     estacion['lat'],
    #     estacion['lon'],
    #     estacion[h]
    # )
    folium.CircleMarker(
        [ estacion['lat'], estacion['lon'] ],
        radius= estacion[h] * 1, 
        popup = "{0}\:{1}\demanda:{2}".format(
            estacion['id'],
            estacion['nombre'],
            str(estacion[h]),
        ),
        fill_color = "#{0:02x}{1:02x}{2:02x}".format(
            mapear( int(estacion[h]) ),
            255 - mapear( int(estacion[h]) ) ,
            255 - mapear( int(estacion[h]) )
        )
    ).add_to(m)
    # img_data = m._to_png(5)
    # img = Image.open(io.BytesIO(img_data))
    # img.save(fecha_viz + '_' + h + '_' + 'map.png')

WebDriverException: ignored

In [185]:
import math
def mapear(x): 
  return int( (x / 30295 ) * 255) 
m = folium.Map(
  [ -34.606499,-58.381094 ],
  zoom_start = 14
)
#  mark each station as a point
for e in derivados["estaciones"]["df"].iterrows():
  folium.CircleMarker(
      [ estacion['lat'], estacion['lon'] ],
      radius= estacion['salidas'] * 0.002, 
      popup = "{0}\nsalidas:{1}\narribos:{2}".format(
          estacion['nombre'],
          str(estacion['salidas']),
          str(estacion['arribos'])
      ),
      fill_color = "#{0:02x}{1:02x}{2:02x}".format(
          mapear( int(estacion['salidas']) ),
          255 - mapear( int(estacion['salidas']) ) ,
          255 - mapear( int(estacion['salidas']) )
      )
  ).add_to(m)
m

TypeError: ignored

## reserva

In [None]:
# # convert to (n, 2) nd-array format for heatmap
# # stationArr = derivados["estaciones"]["df"][['lat', 'lon']].as_matrix()
# # https://stackoverflow.com/a/63293803
# stationArr = derivados["estaciones"]["df"][['salidas']].values

# # plot heatmap
# m.add_children(plugins.HeatMap(stationArr, radius=15))
# m

In [None]:
"""
We will create a helper function with default keyword arguments to abstact away
Folium's complexity. This leaves us with a simple API we can use going forward.
https://alysivji.github.io/getting-started-with-folium.html
"""
def map_points(
    df,
    lat_col = 'lat',
    lon_col = 'lon',
    zoom_start = 11, 
    plot_points = False,
    pt_radius = 15, 
    draw_heatmap = False,
    heat_map_weights_col = None, 
    heat_map_weights_normalize = False,
    heat_map_radius = 15,
    popup_name = 'nombre'
):
  """
  Creates a map given a dataframe of points. 
  Can also produce a heatmap overlay

  Arg:
    df: dataframe containing points to maps
    lat_col: Column containing latitude (string)
    lon_col: Column containing lonitude (string)
    zoom_start: Integer representing the initial zoom of the map
    plot_points: Add points to map (boolean)
    pt_radius: Size of each point
    draw_heatmap: Add heatmap to map (boolean)
    heat_map_weights_col: Column containing heatmap weights
    heat_map_weights_normalize: Normalize heatmap weights (boolean)
    heat_map_radius: Size of heatmap point
    popup_name: name of the popup

  Returns:
      folium map object
  """

  ## center map in the middle of points center in
  middle_lat = df[lat_col].median()
  middle_lon = df[lon_col].median()

  curr_map = folium.Map(
      location=[middle_lat, middle_lon],
      zoom_start=zoom_start
  )
  cols_to_pull = [ lat_col, lon_col, heat_map_weights_col ]
  # add points to map
  if plot_points:
    for _, row in df.iterrows():
      folium.CircleMarker(
        [ row[lat_col], row[lon_col] ],
        radius = pt_radius,
        popup = row[ popup_name ],
        fill_color = "#3db7e4", # divvy color
      ).add_to( curr_map )
  

  # add heatmap
  if draw_heatmap:
    # convert to (n, 2) or (n, 3) matrix format
    if heat_map_weights_col is None:
      cols_to_pull = [lat_col, lon_col]
    else:
      # if we have to normalize
      if heat_map_weights_normalize:
        df[ heat_map_weights_col] = df[heat_map_weights_col] / df[heat_map_weights_col].sum()
        cols_to_pull = [ lat_col, lon_col, heat_map_weights_col ]
      stations = df[cols_to_pull].values
      curr_map.add_children(
          plugins.HeatMap(
              stations,
              radius = heat_map_radius
          )
      )
  return curr_map

In [None]:
# del m
m = map_points(
    derivados["estaciones"]["df"],
    plot_points = False,
    draw_heatmap = True,
    heat_map_weights_col = 'salidas',
    heat_map_weights_normalize = True,
)
m

In [None]:
derivados["recorridos"]["df"].info()

In [None]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
from google.colab import drive
drive.mount('/drive')

In [None]:
# Fetch weather from drive
id = '14BpBfjQ3IrfeK8CajCWrEdfPg7S0eqc8'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('weather-data-ba.csv') 
weather_data_df = pd.read_csv('weather-data-ba.csv')
weather_data_df.head()
weather_data_df = weather_data_df.filter(
    ['DATE','PRCP','TAVG']
)
weather_data_df.isnull().sum( axis = 0 )
weather_data_df['PRCP'] = weather_data_df['PRCP'].replace( np.nan, 0 )
weather_data_df.head()

In [None]:
bicicletas["recorridos_clima_df"].to_csv(
    '/drive/My Drive/Datasets-bicing/recorridos-clima-2020-2021.csv'
)

In [None]:
bicicletas["recorridos_clima_df"].head()

## Bajar el ultimo
Bajar el dataset limpio desde drive

In [None]:
from google.colab import files
files.download('recorridos_2020-2021.zip')

In [None]:
bicicletas["recorridos_df"].to_csv(
    '/drive/My Drive/Datasets-bicing/recorridos_2020-2021.csv'
)

In [None]:
id = '17liAPD1lyj2wT-Hcbh4ed9-Cn4peoyIq'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('recorridos_2020-2021.csv') 
bicicletas["recorridos_df"] = pd.read_csv('recorridos_2020-2021.csv')
bicicletas["recorridos_df"].head()