In [155]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm

In [2]:
# Load the processed dataset (tab-separated, UTF-8); its `placename` column is
# what the rest of this notebook works on.
df = pd.read_csv(
    '../data/processed_data.tsv',
    sep='\t',
    encoding='utf8',
)

In [73]:
# Gazetteer table. Columns are picked by position and named here; because
# `names` is passed without `header`, pandas reads the first line as data.
geo_columns = {
    0: 'id',
    1: 'name',
    3: 'altname',
    4: 'latitude',
    5: 'longitude',
    14: 'population',
}
geo = pd.read_csv(
    '../data/cities1000.tsv',
    sep='\t',
    encoding='utf8',
    usecols=list(geo_columns.keys()),
    names=list(geo_columns.values()),
).convert_dtypes()

In [156]:
# Read the placename replacement mapping that is applied to `df.placename`
# further down.
with open('../data/placename_replacement_dict.json', mode='r', encoding='utf8') as replacement_file:
    placename_repalcement_dict = json.load(replacement_file)

In [169]:
# Assign the result back instead of calling `replace(..., inplace=True)` on the
# attribute-accessed column: that form operates on an intermediate Series, emits
# a chained-assignment warning in pandas 2.x, and leaves `df` unchanged when
# Copy-on-Write is enabled.
df['placename'] = df['placename'].replace(placename_repalcement_dict)

In [170]:
# Keep only placenames that occur at least 30 times, most frequent first
# (value_counts sorts by descending count).
placename_counts = df.placename.value_counts()
places = placename_counts[placename_counts >= 30].index.tolist()

In [99]:
def link_placename(placename, gazetteer=None):
    """Look up latitude/longitude for a placename in a gazetteer table.

    Matching strategy:

    1. Rows whose ``name`` equals ``placename`` exactly.
    2. If there are none, rows whose comma-separated ``altname`` list contains
       ``placename`` as a complete entry.

    When several rows match, the one with the largest ``population`` wins.

    Parameters
    ----------
    placename : str
        Name to resolve.
    gazetteer : pandas.DataFrame, optional
        Table with ``name``, ``altname``, ``latitude``, ``longitude`` and
        ``population`` columns. Defaults to the notebook-level ``geo`` frame.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(2,)`` holding ``[latitude, longitude]``; both values
        are NaN (and a "not found" line is printed) when nothing matches.

    Notes
    -----
    An exact ``name`` hit always takes priority over alternate names, even if
    a larger place carries the name only as an alternate spelling.
    """
    if gazetteer is None:
        gazetteer = geo  # gazetteer loaded at the top of the notebook

    matches = gazetteer[gazetteer['name'] == placename]

    if len(matches) == 0:
        # Wrap the alt-name list in commas so that its first and last entries
        # can be matched as whole entries too, and search literally
        # (regex=False) so characters such as '.' or '(' in the placename are
        # not interpreted as regex syntax.
        padded_altnames = ',' + gazetteer['altname'].astype('string').fillna('') + ','
        is_alt_match = padded_altnames.str.contains(',' + placename + ',', regex=False)
        matches = gazetteer[is_alt_match]

    if len(matches) == 0:
        print(f'{placename} not found')
        return np.full(2, np.nan)

    if len(matches) > 1:
        # Disambiguate by population; rows without a population rank last.
        most_populous = matches['population'].fillna(-1).idxmax()
        matches = matches.loc[[most_populous]]

    return matches[['latitude', 'longitude']].to_numpy(dtype=float, na_value=np.nan)[0]

In [171]:
# Resolve every frequent placename against the gazetteer; unresolved names
# come back as a NaN pair and are reported by link_placename itself.
coords = [link_placename(name) for name in tqdm(places)]

  0%|          | 0/313 [00:00<?, ?it/s]

Domesnees not found
Main not found
Italien not found
Türkische Grenze not found
Alexandrien not found
Schweiz not found
Italienische Grenze not found
Niederelbe not found
Donau not found
Serbische Grenze not found
Spanische Grenze not found
Pera not found
Rhein not found
Sachsen not found
Kurland not found
Elberfeld not found
Sanssouci not found
Tuckum not found
Niederlanden not found
Helvoet not found
Cettinje not found
Leith not found
Livland not found
Ofen not found
Oesterreich not found
Gravenhaag not found
Kissingen not found
Zarskoje-Sselo not found
Brandenburgischen not found
Texel not found
Tirnowo not found
Ssaratow not found
Polnische Grenze not found
Etrl not found
Vlie not found
Preußen not found
Caprera not found
Cattaro not found
Sünderland not found
Frankreich not found
Möttau not found
Niederrhein not found
Nieder-Elbe not found
Oger not found


In [172]:
# One row per frequent placename, in the same order as `places`.
df_coords = pd.DataFrame(
    data=coords,
    index=places,
    columns=['latitude', 'longitude'],
)

In [173]:
# Inspect the automatic matches.
# NOTE(review): in the saved output "St. Petersburg" resolves to
# (27.77086, -82.67927), which looks like a different city of the same name
# than the one intended — verify the exact-name matches by hand.
df_coords

Unnamed: 0,latitude,longitude
Paris,48.85341,2.34880
London,51.50853,-0.12574
Berlin,52.52437,13.41053
St. Petersburg,27.77086,-82.67927
Wien,48.20849,16.37208
...,...,...
Taganrog,47.23627,38.90530
Melbourne,-37.81400,144.96332
Paderborn,51.71905,8.75439
Karlsbad,33.15809,-117.35059


In [134]:
def insert_coordinates(placename):
    """Prompt for the coordinates of ``placename`` and return them.

    The placename is printed, then one line is read from standard input in
    the form ``latitude, longitude``. Whitespace around the comma and around
    the whole line is ignored, so ``"41.0,28.9"`` and ``"41.0, 28.9"`` are
    both accepted.

    Parameters
    ----------
    placename : str
        Name shown to the user before the prompt.

    Returns
    -------
    numpy.ndarray
        Array ``[latitude, longitude]`` of floats.

    Raises
    ------
    ValueError
        If the line does not consist of exactly two comma-separated numbers.
    """
    print(placename)
    answer = input('Insert coordinates: ')
    parts = [part.strip() for part in answer.split(',')]
    if len(parts) != 2:
        raise ValueError(
            f'expected two comma-separated numbers "latitude, longitude", got {answer!r}'
        )
    latitude, longitude = (float(part) for part in parts)
    return np.array((latitude, longitude))

In [175]:
# Hand-entered coordinates, keyed by placename, each written as a
# "latitude, longitude" string (parsed into float arrays in the next cell).
# NOTE(review): the "... Grenze" entries are "0, 0" — presumably a placeholder
# for places without a single location; confirm that downstream code treats
# (0, 0) as "no coordinates" rather than as a real point.
custom_coordinates = {
    "Domesnees": "57.75895, 22.60487",
    "Pest": "47.5, 19.08333",
    "Main":"49.7938, 9.62448",
    "Italien": "42.83333, 12.83333",
    "Türkische Grenze": "0, 0",
    "Alexandrien": "31.20176, 29.91582",
    "Schweiz": "47.00016, 8.01427",
    "Italienische Grenze": "0, 0",
    "Niederelbe": "53.78893, 9.42576",
    "Donau": "44.217771, 22.672659"
}

In [176]:
# Turn each "latitude, longitude" string into a float array. Values that are
# already numeric (for example when this cell is executed a second time) are
# passed through, so re-running the cell no longer fails on `.split`.
custom_coordinates = {
    place: (
        np.array([float(part) for part in value.split(',')])
        if isinstance(value, str)
        else np.asarray(value, dtype=float)
    )
    for place, value in custom_coordinates.items()
}

In [177]:
# Places the gazetteer lookup could not resolve (latitude still NaN).
missing_places = df_coords.index[df_coords.latitude.isna()]

for place in missing_places:
    # Ask interactively only for places without a stored manual entry, and
    # remember the answer so it can be saved with the other custom coordinates.
    if place not in custom_coordinates:
        custom_coordinates[place] = insert_coordinates(place)
    df_coords.loc[place, ['latitude', 'longitude']] = custom_coordinates[place]

Serbische Grenze
Insert coordinates: 0, 0
Spanische Grenze
Insert coordinates: 0, 0
Pera
Insert coordinates: 41.03694, 28.9775
Rhein
Insert coordinates: 50.075467, 7.773223
Sachsen
Insert coordinates: 51.118812, 12.937999
Kurland
Insert coordinates: 56.816781, 22.768799
Elberfeld
Insert coordinates: 51.25083, 7.14816
Sanssouci
Insert coordinates: 52.39886, 13.06566
Tuckum
Insert coordinates: 56.96764, 23.15554
Niederlanden
Insert coordinates: 52.25, 5.75
Helvoet
Insert coordinates: 51.83333, 4.13333
Cettinje
Insert coordinates: 42.39063, 18.91417
Leith
Insert coordinates: 55.9713, -3.1723
Livland
Insert coordinates: 57.627445, 26.132553
Ofen
Insert coordinates: 47.5, 19.03333
Oesterreich
Insert coordinates: 47.33333, 13.33333
Gravenhaag
Insert coordinates: 52.07667, 4.29861
Kissingen
Insert coordinates: 50.20228, 10.07784
Zarskoje-Sselo
Insert coordinates: 59.71417, 30.39642
Brandenburgischen
Insert coordinates: 52.45905, 13.01582
Texel
Insert coordinates: 53.08333, 4.83333
Tirnowo
Ins

In [183]:
# Persist the resolved coordinates (placename index included) as a UTF-8 TSV.
df_coords.to_csv(
    '../data/coordinates.tsv',
    sep='\t',
    encoding='utf8',
)

In [184]:
# Save the manually entered coordinates; arrays are converted to plain lists
# because JSON cannot serialise numpy arrays directly.
serialisable_coordinates = {
    place: list(latlon) for place, latlon in custom_coordinates.items()
}
with open('../data/custom_coordinates.json', mode='w', encoding='utf8') as out_file:
    json.dump(serialisable_coordinates, out_file)

In [145]:
# Spot-check rows 150-199.
# NOTE(review): this cell's execution count (145) is lower than that of the
# cell that fills in missing coordinates (177), so the saved output below —
# including its NaN rows — appears to predate the manual fill; re-run to refresh.
df_coords[150:200]

Unnamed: 0,latitude,longitude
Erfurt,50.9787,11.03283
Gravesend,40.5976,-73.96514
Lyon,45.74846,4.84671
Spanische Grenze,,
Lüttich,50.63373,5.56749
Hanau,50.13423,8.91418
Helsingfors,60.16952,24.93545
Baden-Baden,48.7606,8.23975
Fulda,50.55162,9.67518
Pera,,
