In [1]:
import pandas as pd
import chardet
import glob
import os
all_files = glob.glob(os.path.join("loc212csv", "2021-2 UNLOCODE CodeListPart*.csv"))
all_files

['loc212csv/2021-2 UNLOCODE CodeListPart1.csv',
 'loc212csv/2021-2 UNLOCODE CodeListPart3.csv',
 'loc212csv/2021-2 UNLOCODE CodeListPart2.csv']

In [2]:
dfs = []
for filename in all_files:
    with open(filename, 'rb') as f:
        result = chardet.detect(f.read())
        print(filename, result["encoding"])
        dfs.append(pd.read_csv(filename, encoding=result["encoding"], header=None, keep_default_na=False))

loc212csv/2021-2 UNLOCODE CodeListPart1.csv ISO-8859-1
loc212csv/2021-2 UNLOCODE CodeListPart3.csv ISO-8859-1
loc212csv/2021-2 UNLOCODE CodeListPart2.csv Windows-1252


In [3]:
df = pd.concat(dfs)
df.columns = ["Ch", "Country Code", "Location Code", "Name", "NameWoDiacritics", "SubDiv", "Function", "Status", "Date", "IATA", "Coordinates", "Remarks"]
df = df.dropna(subset=["Location Code"]) # Remove unused rows

In [4]:
df

Unnamed: 0,Ch,Country Code,Location Code,Name,NameWoDiacritics,SubDiv,Function,Status,Date,IATA,Coordinates,Remarks
0,,AD,,.ANDORRA,,,,,,,,
1,,AD,ALV,Andorra la Vella,Andorra la Vella,,--34-6--,AI,0601,,4230N 00131E,
2,,AD,CAN,Canillo,Canillo,,--3-----,RL,0307,,4234N 00135E,
3,,AD,ENC,Encamp,Encamp,,--3-----,RL,0307,,4232N 00134E,
4,,AD,ESC,Escaldes-Engordany,Escaldes-Engordany,,--3-----,RL,0307,,4231N 00133E,
...,...,...,...,...,...,...,...,...,...,...,...,...
27449,,NZ,WTZ,Whitianga,Whitianga,WKO,---4----,AI,0212,,3650S 17542E,
27450,,NZ,WNT,Winton,Winton,STL,--3-----,RQ,0607,,4608S 16819E,
27451,,NZ,WII,Wiri,Wiri,AUK,--3-----,RQ,8909,,,
27452,,NZ,WLS,Woolston,Woolston,CAN,--3-----,RL,1301,,4333S 17241E,


In [5]:
def get_function_name(function_number):
    return {
        "1": "Sea",
        "2": "Rail",
        "3": "Road",
        "4": "Air",
        "6": "Dry"
    }.get(function_number)

In [6]:
df_air = pd.read_json("airports.json")
if not "function" in df_air.columns:
    df_air["function"] = "Air"
df_air

Unnamed: 0,name,code,function
0,Andorra la Vella,ADALV,Air
1,Abu Dhabi,AEAUH,Air
2,Al Ain,AEAAN,Air
3,Al Dhafra,AEDHF,Air
4,Al Fujayrah,AEFJR,Air
...,...,...,...
8869,Kariba,ZWKAB,Air
8870,Mahenye,ZWMJW,Air
8871,Masvingo,ZWMVZ,Air
8872,Mutare,ZWUTA,Air


In [7]:
df_sea = pd.read_json("seaports.json")
if not "function" in df_sea.columns:
    df_sea["function"] = "Sea"
df_sea

Unnamed: 0,name,code,function
0,Abu al Bukhoosh,AEABU,Sea
1,Abu Dhabi,AEAUH,Sea
2,Abu Musa,AEAMU,Sea
3,Ahmed Bin Rashid Port,AEARP,Sea
4,Ajman,AEAJM,Sea
...,...,...,...
18504,Chegutu,ZWCHE,Sea
18505,Filabusi,ZWFLU,Sea
18506,Gwanda,ZWGWA,Sea
18507,Rusape,ZWRSP,Sea


In [8]:
import re
from tqdm.notebook import tqdm

print("Before Air + Land", len(df_air))
print("Before Sea + Land", len(df_sea))
    
with tqdm(total=df.shape[0]) as bar:
    for idx, row in df.iterrows():
        bar.update(1)
        functions = re.findall(r'\d', row["Function"])
        functions = list(filter(None, map(get_function_name, functions)))
        code = row["Country Code"] + row["Location Code"]
        for function in functions:
            if function != "Sea" and len(df_air[df_air["code"].eq(code) & df_air["function"].eq(function)]) == 0:
                df_air.loc[len(df_air)] = [row["Name"], code, function]
            if function != "Air" and len(df_sea[df_sea["code"].eq(code) & df_sea["function"].eq(function)]) == 0:
                df_sea.loc[len(df_sea)] = [row["Name"], code, function]

print("After Air + Land", len(df_air))
print("After Sea + Land", len(df_sea))

Before Air + Land 8874
Before Sea + Land 18509


  0%|          | 0/115989 [00:00<?, ?it/s]

After Air + Land 126985
After Sea + Land 136502


In [9]:
df_air = df_air.sort_values(by=['code'])
df_sea = df_sea.sort_values(by=['code'])

In [10]:
# Validate some data
assert len(df_air[df_air["code"].eq("ADALV")]) == 3
assert len(df_air[df_air["code"].eq("ADCAN")]) == 1
assert len(df_air[df_air["code"].eq("ADENC")]) == 1
assert len(df_air[df_air["code"].eq("AMEVN")]) == 3
assert len(df_air[df_air["code"].eq("ARBUE")]) == 4
assert len(df_air[df_air["code"].eq("ATKTT")]) == 4
df_air[df_air["code"].eq("ADALV") | df_air["code"].eq("ADCAN") | df_air["code"].eq("ADENC")]

Unnamed: 0,name,code,function
0,Andorra la Vella,ADALV,Air
8875,Andorra la Vella,ADALV,Dry
8874,Andorra la Vella,ADALV,Road
8876,Canillo,ADCAN,Road
8877,Encamp,ADENC,Road


In [14]:
# TODO: save to new json
df_air.to_json("new_airports.json", orient='records')
df_sea.to_json("new_seaports.json", orient='records')