In [1]:
import pandas as pd
import chardet
import glob
import os
all_files = glob.glob(os.path.join("loc212csv", "2021-2 UNLOCODE CodeListPart*.csv"))
all_files

['loc212csv/2021-2 UNLOCODE CodeListPart1.csv',
 'loc212csv/2021-2 UNLOCODE CodeListPart3.csv',
 'loc212csv/2021-2 UNLOCODE CodeListPart2.csv']

In [2]:
dfs = []
for filename in all_files:
    with open(filename, 'rb') as f:
        result = chardet.detect(f.read())
        print(filename, result["encoding"])
        dfs.append(pd.read_csv(filename, encoding=result["encoding"], header=None, keep_default_na=False))

loc212csv/2021-2 UNLOCODE CodeListPart1.csv ISO-8859-1
loc212csv/2021-2 UNLOCODE CodeListPart3.csv ISO-8859-1
loc212csv/2021-2 UNLOCODE CodeListPart2.csv Windows-1252


In [3]:
df = pd.concat(dfs)
df.columns = ["Ch", "Country Code", "Location Code", "Name", "NameWoDiacritics", "SubDiv", "Function", "Status", "Date", "IATA", "Coordinates", "Remarks"]
df = df[df["Location Code"] != ""] # Remove row if location code is empty
df

Unnamed: 0,Ch,Country Code,Location Code,Name,NameWoDiacritics,SubDiv,Function,Status,Date,IATA,Coordinates,Remarks
1,,AD,ALV,Andorra la Vella,Andorra la Vella,,--34-6--,AI,0601,,4230N 00131E,
2,,AD,CAN,Canillo,Canillo,,--3-----,RL,0307,,4234N 00135E,
3,,AD,ENC,Encamp,Encamp,,--3-----,RL,0307,,4232N 00134E,
4,,AD,ESC,Escaldes-Engordany,Escaldes-Engordany,,--3-----,RL,0307,,4231N 00133E,
5,,AD,EAC,Escàs,Escas,04,--3-----,RL,1407,,4233N 00131E,
...,...,...,...,...,...,...,...,...,...,...,...,...
27449,,NZ,WTZ,Whitianga,Whitianga,WKO,---4----,AI,0212,,3650S 17542E,
27450,,NZ,WNT,Winton,Winton,STL,--3-----,RQ,0607,,4608S 16819E,
27451,,NZ,WII,Wiri,Wiri,AUK,--3-----,RQ,8909,,,
27452,,NZ,WLS,Woolston,Woolston,CAN,--3-----,RL,1301,,4333S 17241E,


In [4]:
# Preprocessing
df["code"] = df["Country Code"] + df["Location Code"]
df = df.rename(columns={
    "Name": "name",
    "Function": "function"
})
df = df[["name", "code", "function"]]
df

Unnamed: 0,name,code,function
1,Andorra la Vella,ADALV,--34-6--
2,Canillo,ADCAN,--3-----
3,Encamp,ADENC,--3-----
4,Escaldes-Engordany,ADESC,--3-----
5,Escàs,ADEAC,--3-----
...,...,...,...
27449,Whitianga,NZWTZ,---4----
27450,Winton,NZWNT,--3-----
27451,Wiri,NZWII,--3-----
27452,Woolston,NZWLS,--3-----


In [5]:
source_df_sea = df[df["function"].str.contains("1")]
source_df_rail = df[df["function"].str.contains("2")]
source_df_road = df[df["function"].str.contains("3")]
source_df_air = df[df["function"].str.contains("4")]
source_df_dry = df[df["function"].str.contains("6")]

In [6]:
df_air = pd.read_json("old/airports.json")
df_air

Unnamed: 0,name,code
0,Andorra la Vella,ADALV
1,Abu Dhabi,AEAUH
2,Al Ain,AEAAN
3,Al Dhafra,AEDHF
4,Al Fujayrah,AEFJR
...,...,...
8869,Kariba,ZWKAB
8870,Mahenye,ZWMJW
8871,Masvingo,ZWMVZ
8872,Mutare,ZWUTA


In [7]:
df_sea = pd.read_json("old/seaports.json")
df_sea

Unnamed: 0,name,code
0,Abu al Bukhoosh,AEABU
1,Abu Dhabi,AEAUH
2,Abu Musa,AEAMU
3,Ahmed Bin Rashid Port,AEARP
4,Ajman,AEAJM
...,...,...
18504,Chegutu,ZWCHE
18505,Filabusi,ZWFLU
18506,Gwanda,ZWGWA
18507,Rusape,ZWRSP


In [8]:
import re
from tqdm.notebook import tqdm

def process(old_df, new_df, function):
    print("Before", function, len(old_df))

    with tqdm(total=new_df.shape[0]) as bar:
        for idx, row in new_df.iterrows():
            bar.update(1)
            if len(old_df[old_df["code"].eq(row["code"])]) == 0:
                old_df.loc[len(old_df)] = [row["name"], row["code"]]

    old_df = old_df.sort_values(by=['code'])
    print("After", function, len(old_df))
    print("--------------")
    return old_df

new_df_sea = process(df_sea.copy(), source_df_sea, "Sea")
new_df_rail = process(pd.DataFrame(columns=["name", "code"]), source_df_rail, "Rail")
new_df_road = process(pd.DataFrame(columns=["name", "code"]), source_df_road, "Road")
new_df_air = process(df_air.copy(), source_df_air, "Air")
new_df_dry = process(pd.DataFrame(columns=["name", "code"]), source_df_dry, "Dry")

Before Sea 18509


  0%|          | 0/17431 [00:00<?, ?it/s]

After Sea 18715
--------------
Before Rail 0


  0%|          | 0/13101 [00:00<?, ?it/s]

After Rail 13079
--------------
Before Road 0


  0%|          | 0/89670 [00:00<?, ?it/s]

After Road 89597
--------------
Before Air 8874


  0%|          | 0/9109 [00:00<?, ?it/s]

After Air 9198
--------------
Before Dry 0


  0%|          | 0/15112 [00:00<?, ?it/s]

After Dry 15110
--------------


In [9]:
# Validate some data
assert len(new_df_air[new_df_air["code"].eq("ADALV")]) > 0
assert len(new_df_air[new_df_air["code"].eq("BRACU")]) > 0
assert len(new_df_road[new_df_road["code"].eq("ADCAN")]) > 0
assert len(new_df_road[new_df_road["code"].eq("ADENC")]) > 0

In [10]:
new_df_sea.to_json("new/seaports.json", orient='records')
new_df_rail.to_json("new/rails.json", orient='records')
new_df_road.to_json("new/roads.json", orient='records')
new_df_air.to_json("new/airports.json", orient='records')
new_df_dry.to_json("new/dryports.json", orient='records')