# Métodos de Representación y Visualización Espacial HHDD

Curso: 2021-22

Profesor: Manuél Díaz Ordoñez

## Objetivos

Lista de objetivos y funcionalidades

Estudiaremos el _**ámbar**_

### Ejecutar el código
* datos de entrada
* datos de salida

## Código fuente

### Dependencias
* python

Librerías Python:
* notebook
* pandas
* geopy


### Librerías y funciones auxiliares

In [221]:
import json
import re
import pandas as pd

from datetime import datetime
from fractions import Fraction
from geopy import geocoders  


def clean_port(port, dct):
    """
    Check a given port against the dictionary with name rules.
    Returns a corrected name if any rule applies.

    Rules are tried in dictionary order and matched with ``re.search``,
    so a pattern may match anywhere inside the port name. The key of the
    first matching rule is returned.

    :param port: port name, e.g. Ambsterdam. Non-string values (for
                 instance a NaN standing for a missing value) are
                 returned unchanged.
    :param dct: rule dictionary mapping corrected name -> regular expression
    :returns: corrected port name based on rule dictionary, or ``port``
              itself when no rule matches
    """
    # a missing value cannot be matched against a regex: pass it through,
    # the same way fraction_to_float passes non-string input through
    if not isinstance(port, str):
        return port

    for key, regex in dct.items():
        if re.search(regex, port):
            return key
    return port


def filter_amber(x):
    """
    Check if string x matches the word "amber" in any of the
    supported languages (Danish, Dutch and English).

    Matching is case-insensitive and the patterns may match anywhere
    inside the string.

    :param x: value to check. Anything that is not a string (for
              instance a NaN standing for a missing value) does not match.
    :returns: True if it matches "amber" in Danish, Dutch or English,
              False otherwise
    """
    # non-string values have no .lower(); treat them as "not amber"
    if not isinstance(x, str):
        return False

    text = x.lower()
    # rule for English
    re_amber = r'ambe'
    # rule for Danish and Dutch
    re_bersteen = r'b.rnste.*n'
    return bool(re.search(re_amber, text) or re.search(re_bersteen, text))


def geolocate_ports(ports):
    """
    Calculate geographical coordinates for a list of cities (strings).

    Each distinct city is looked up once through the Nominatim geocoder.
    Requests are spaced at least one second apart; geocoder errors that
    persist after the rate limiter's retries are raised to the caller.

    :param ports: list of strings
    :returns: dictionary where each city (key) has associated its coordinates
              as a tuple (latitude, longitude). If a city is not found,
              coordinates are (None, None)

    :example:
        ports = ['Amsterdam', 'Rotterdam']
        coor = geolocate_ports(ports)
        print(coor)
        > {
        >   'Amsterdam': (x1, y1),
        >   'Rotterdam': (x2, y2)
        > }
    """
    # local import: geopy is already a dependency of this file
    from geopy.extra.rate_limiter import RateLimiter

    # NOTE(review): 'fakeusername' looks like a placeholder user agent;
    # confirm whether a real application identifier should be used.
    gn = geocoders.Nominatim(user_agent='fakeusername')
    # throttle the calls instead of sending them back-to-back;
    # swallow_exceptions=False keeps failures visible to the caller
    geocode = RateLimiter(
        gn.geocode,
        min_delay_seconds=1,
        swallow_exceptions=False,
    )
    coordinates = {}

    for port in ports:
        # a repeated name would give the same answer: skip the extra request
        if port in coordinates:
            continue
        coor = geocode(port)
        if coor is not None:
            coordinates[port] = (coor.latitude, coor.longitude)
        else:
            coordinates[port] = (None, None)

    return coordinates


def fraction_to_float(text):
    """
    Checks if a variable is a string containing a fraction expression
    (e.g. "1 1/2"). If so it calculates the equivalent float value.

    The string is split on any run of whitespace and every token is
    parsed with ``Fraction``; the tokens are added together, so
    "1 1/2" gives 1.5. Extra spaces between or around the tokens are
    ignored.

    :param text: value to check
    :returns: float value for a string input; any non-string input is
              returned unchanged
    :raises ValueError: if the string is empty/blank or a token is not a
                        valid number or fraction
    """
    if not isinstance(text, str):
        return text

    # split() (no argument) drops empty tokens produced by repeated,
    # leading or trailing spaces, which Fraction('') would reject
    parts = text.split()
    if not parts:
        raise ValueError('Invalid literal for Fraction: %r' % text)
    return float(sum(Fraction(part) for part in parts))
    

### Archivos CSV y campos asociados

In [191]:
# CSV files to be processed
file_cargo = 'data/cargoes_Advanced_search_results_values_B%rnste%n__1634_1857_.csv'
file_passage = 'data/passages_Advanced_search_results_values_B%rnste%n__1634_1857_.csv'

# translation NL-EN for cargo headers
fields_cargo = {
    'id_doorvaart': 'id',
    'van': 'from',
    'naar': 'to',
    'maat': 'unit',
    'aantal': 'amount',
    'soort': 'type',
}
# list of NL cargo headers only (a real list, in declaration order)
fields_cargo_nl = list(fields_cargo)

# translation NL-EN for passage headers
fields_passage = {
    'id_doorvaart': 'id',
    'dag': 'day',
    'maand': 'month',
    'jaar': 'year',
}
# list of NL passage headers only; built the same way as fields_cargo_nl
# instead of aliasing the translation dictionary itself
fields_passage_nl = list(fields_passage)

### Nombres de puertos
* puertos a ignorar: no se corresponden con localizaciones concretas, otros motivos
* reglas para procesar y uniformar nombres de puertos (expresiones regulares)

In [None]:
# port names to be removed
ports_remove = ['-', 'Østersøen', 'Franckeriige', 'Habel de Graas', 'Westerwig']

# naming rules: corrected port name -> regular expression for its variants.
# clean_port applies them in this order with re.search, i.e. unanchored.
# NOTE(review): because of that, short patterns such as r'Hul?' or r'Dan.+'
# match any name that merely contains 'Hu' / 'Dan…' — confirm this is intended.
ports_regex = {
    'Amsterdam': r'Amb?s.*',
    'Konigsberg': r'Cønn?i.+|K.nn?i?s?.+',
    'Danzig': r'Dan.+',
    'Dunkirk': r'D.nk.rchen',
    'Emden': r'Emb?den',
    'Gottenborg': r'Got.enborg',
    'Habel de Graas': r'Habel de Gra.s',
    'Helsingør': r'Helsingø.?r',
    'Hull': r'Hul?',
    'Copenhagen': r'K.øbenha..',
    'Landskrona': r'Landscrona',
    'Leeuwarden': r'L.ur?w.+en',
    # character class lists the alternatives directly: inside [...] a '|'
    # is a literal character, not an "or"
    'London': r'L[ou]nn?d',
    'Montrose': r'Montrosse?',
    'Newcastle': r'Nycast.*',
    'Petersburg': r'Petersb.rg',
    'Rugenwalde': r'R.genwalde',
    'Stockholm': r'Stock.*',
    'Vedbæk': r'Wedbek',
    'Dundee': r'Dundie',
    'Trondheim': r'Tronhiem',
    'Ramsgate': r'Romansgate',
}

# True: read port coordinates from the cached JSON file;
# False: query the geocoding service again
use_coordinates_cache = True

### Tabla de conversión de unidades locales de medida a Kilogramos

In [None]:
# Conversion factors from local units of measure to kilograms.
# Sources:
#   Pund      https://www.sizes.com/units/pund.htm
#   Skippund  https://www.sizes.com/units/skippund.htm
#   Lispund   https://sv.wikipedia.org/wiki/Lispund
#             https://www.sizes.com/units/lispund.htm
#   Centner   https://www.sizes.com/units/centner.htm
# NOTE(review): with Pund = 0.5 kg, a centner of 100 pund would be 50 kg;
# confirm that 100 kg is the intended value for Centner.
unit_to_kg = {
    'Pund': 0.5,
    'Skippund': 160.076,
    'Lispund': 8.003,
    'Centner': 100,
}

### Cargar datos desde archivos CSV

In [240]:
# cargoes: read only the Dutch-named columns of interest from the
# ';'-separated file, then switch the headers to their English names
cargoes_raw = pd.read_csv(
    file_cargo,
    sep=';',
    header=0,
    usecols=fields_cargo_nl,
).rename(columns=fields_cargo)

# passages: same procedure with the passage file and its own header mapping
passages_raw = pd.read_csv(
    file_passage,
    sep=';',
    header=0,
    usecols=fields_passage_nl,
).rename(columns=fields_passage)

### Construir campo fecha

Unimos los campos `year`, `month` y `day` en un único campo `date` (Fecha)

In [263]:
# create new column date combining the year, month and day columns.
# Fields are read by label: positional access such as x[0] on a
# label-indexed row is deprecated in recent pandas versions.
# NOTE(review): plain datetime objects are kept on purpose here; the file
# names suggest data starting in 1634, which would not fit pandas'
# nanosecond timestamps — confirm before switching to pd.to_datetime.
passages_raw['date'] = passages_raw[['year', 'month', 'day']].apply(
    lambda row: datetime(row['year'], row['month'], row['day']),
    axis=1
)
# remove columns day, month and year, now merged into date
passages = passages_raw.drop(columns=['day', 'month', 'year'])

### Seleccionar carga de tipo "ámbar"

Filtramos el dataset `cargoes_raw` seleccionando únicamente las filas del tipo _ámbar_. La función `filter_amber` permite determinar si una cadena de texto se corresponde con la mercancía _ámbar_ en alguno de los idiomas soportados (danés, inglés y neerlandés).

In [264]:
# logic vector, indicates if row is "amber" type or not
filter_vector = cargoes_raw['type'].apply(filter_amber)
# keep only rows of "amber" type; take an explicit copy so that the new
# columns added to cargoes_raw afterwards are written to an independent
# frame rather than to a slice of the original one
cargoes_raw = cargoes_raw[filter_vector].copy()

### Procesar nombres de puertos

In [265]:
# normalise departure and destination names with the port naming rules.
# The column is processed directly as a Series, so each value reaches
# clean_port as-is and no positional row indexing (x[0]) is needed.
cargoes_raw['port_from'] = cargoes_raw['from'].apply(
    lambda name: clean_port(name, ports_regex)
)
cargoes_raw['port_to'] = cargoes_raw['to'].apply(
    lambda name: clean_port(name, ports_regex)
)
# remove original columns, not needed any more
cargoes = cargoes_raw.drop(['from', 'to'], axis=1)

### Filtrar unidades de medida soportadas y puertos ignorados

In [268]:
# keep rows whose unit has a known conversion factor and whose ports are
# not in the list of names to ignore
supported_unit = cargoes['unit'].isin(list(unit_to_kg))
ignored_port = (
    cargoes['port_from'].isin(ports_remove)
    | cargoes['port_to'].isin(ports_remove)
)
# explicit copy: columns are assigned to this frame later on, which
# should not happen on a slice of another frame
cargoes = cargoes[supported_unit & ~ignored_port].copy()

### Procesar fracciones
Algunas cantidades están expresadas como fracciones (p. ej. '1 3/4'). Convertimos esta representación a formato decimal.

In [269]:
# from fractions to float
cargoes['amount'] = cargoes['amount'].apply(fraction_to_float)
# amount in kilograms: look up the conversion factor of each row's unit
# and multiply column-wise (no positional row indexing needed)
cargoes['kg'] = cargoes['amount'] * cargoes['unit'].map(unit_to_kg)

### Agrupar datos por viaje
Algunos viajes presentan varias entradas para la mercancía _ámbar_ en distintas unidades de medida. Una vez que todas las unidades de medida han sido convertidas a kilogramos, es posible sumar estas cantidades, obteniendo el total de kilogramos transportados en un determinado viaje.

In [271]:
# one row per (id, port_from, port_to) with the total kg of that group;
# the original unit and amount columns do not survive the aggregation
group_keys = ['id', 'port_from', 'port_to']
kg_per_trip = cargoes.groupby(group_keys)['kg'].sum()
# turn the group keys from index levels back into ordinary columns
cargoes_summary = kg_per_trip.reset_index()

### Calcular / cargar coordenadas

El cálculo de coordenadas puede ser lento. Proporcionamos un diccionario precalculado.

Recalcular las coordenadas sólo requiere asignar el valor `False` a la variable `use_coordinates_cache`.

In [273]:
# set of all the processed port names in cargoes dataset
ports_filtered = set(cargoes['port_from'].unique()) | set(cargoes['port_to'].unique())

if use_coordinates_cache:
    # load coordinates cache; the encoding is given explicitly so the
    # file is decoded the same way on every platform
    with open('./coordinates.json', 'r', encoding='utf-8') as file_json:
        coordinates = json.load(file_json)
else:
    coordinates = geolocate_ports(list(ports_filtered))

# add latitude / longitude columns for both ends of each trip.
# Every port in cargoes_summary must be a key of `coordinates`,
# otherwise the lookup raises KeyError.
for side in ('from', 'to'):
    port_column = cargoes_summary['port_' + side]
    cargoes_summary['lat_' + side] = port_column.apply(lambda x: coordinates[x][0])
    cargoes_summary['lon_' + side] = port_column.apply(lambda x: coordinates[x][1])

### Generar dataset final

In [274]:
# attach the passage date to every trip summary through the shared id
table = cargoes_summary.merge(passages, on='id')

In [277]:
# write the final dataset to disk (row index included, pandas default)
table.to_csv(path_or_buf='amber.csv')

In [275]:
# bare expression: shows the merged table as the cell output
table

Unnamed: 0,id,port_from,port_to,kg,lat_from,lon_from,lat_to,lon_to,date
0,134935,Malmøe,Gottenborg,2.000,55.605293,13.000157,60.902828,11.806039,1782-05-29 00:00:00
1,221359,Liebau,Hamborg,160.076,49.876699,17.082672,53.550341,10.000654,1772-05-06 00:00:00
2,222683,Stettin,Amsterdam,24.000,53.429681,14.592913,52.372760,4.893604,1776-12-13 00:00:00
3,231440,Malmøe,Gottenborg,2.500,55.605293,13.000157,60.902828,11.806039,1779-07-14 00:00:00
4,238566,Rugenwalde,Amsterdam,150.000,54.429150,16.403846,52.372760,4.893604,1775-07-17 00:00:00
...,...,...,...,...,...,...,...,...,...
971,1752582,Danzig,Amsterdam,20.000,54.361193,18.628609,52.372760,4.893604,1700-11-08 00:00:00
972,5087883,Danzig,Amsterdam,16.500,54.361193,18.628609,52.372760,4.893604,1733-09-23 00:00:00
973,10000021,Danzig,London,159.000,54.361193,18.628609,51.507322,-0.127647,1751-05-23 00:00:00
974,10000315,Danzig,Amsterdam,90.000,54.361193,18.628609,52.372760,4.893604,1757-10-12 00:00:00
