In [128]:
import csv
import os
import pdb
import numpy as np
import pickle
import json

In [129]:
if not os.path.exists('./output'):
    os.makedirs('./output')

In [130]:
''' 
- Find all cities have flights and have state information, 
- according to `flight_routes_processed_v2.csv`, `regions.csv`, `airports.csv` and `airports.json` 
- (because for every city in our graph, we have to know its state and its airports). 
- Meanwhile find all airports.
'''

# flight_routes_processed_v2.csv, find all airports have flights, find all cities in it
all_cities_have_flight = set()
all_airports_in_flight = set()

airport_in_flight_to_city = dict()

with open('flight_routes_processed_v2.csv','rt') as f:
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
            
        all_cities_have_flight.add(line[3])  # From_City
        all_cities_have_flight.add(line[9])  # To_City
        
        all_airports_in_flight.add(line[2])  # From_Name
        all_airports_in_flight.add(line[8])  # To_Name
        
        airport_in_flight_to_city[line[2]] = line[3]
        airport_in_flight_to_city[line[8]] = line[9]
        
print('all_cities_have_flight: {}'.format(len(all_cities_have_flight)))
print('all_airports_in_flight: {}'.format(len(all_airports_in_flight)))
print(len(airport_in_flight_to_city))

# airports.csv, find all airports in it
all_airports_in_airportscsv = set()

with open('airports.csv','rt') as f:
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
            
        all_airports_in_airportscsv.add(line[3])
        
print('all_airports_in_airportscsv: {}'.format(len(all_airports_in_airportscsv)))


# airports.json, find all airports in it
all_airports_in_airportsjson = set()

with open('airports.json','r') as f:
    data = json.loads(f.read())
    for i in range(len(data)):
        all_airports_in_airportsjson.add(data[i]['name'])
        
print('all_airports_in_airportsjson: {}'.format(len(all_airports_in_airportsjson)))
        

all_airports_has_state_info = all_airports_in_airportscsv.union(all_airports_in_airportsjson)
print('all_airports have state information: {}'.format(len(all_airports_in_airportsjson)))

all_airports = all_airports_in_flight.intersection(all_airports_has_state_info)
print('all_airports in flights and have state information (all flights): {}'.format(len(all_airports)))

# all_cities are cities of all_airports in flights and have state information
all_cities = set()

for airport in all_airports:
    all_cities.add(airport_in_flight_to_city[airport])
print('all_cities: {}'.format(len(all_cities)))

# save all cities
with open('./output/all_cities.pickle', 'wb') as f:
    pickle.dump(all_cities, f)
    
# save all airports
with open('./output/all_airports.pickle', 'wb') as f:
    pickle.dump(all_airports, f)

all_cities_have_flight: 3136
all_airports_in_flight: 3244
3244
all_airports_in_airportscsv: 53037
all_airports_in_airportsjson: 3552
all_airports have state information: 3552
all_airports in flights and have state information (all flights): 3227
all_cities: 3114


# Check all cities

In [131]:
print("the number of all cities: {}".format(len(all_cities)))
print('Herat' in all_cities)
print('Beijing' in all_cities)
print('Newark' in all_cities)
print('New York' in all_cities)
print('Shanghai' in all_cities)
print('Washington' in all_cities)
print('Sanya' in all_cities)

the number of all cities: 3114
True
True
True
True
True
True
True


In [132]:
'''
- Generate `flight.txt` based on filtered current all cities and `flight_routes_processed_v2.csv`.
'''
flights = []
with open('flight_routes_processed_v2.csv','rt') as f:
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
            
        From_City = line[3]
        To_City = line[9]
        
        if (From_City in all_cities) and (To_City in all_cities):
            From_Name = line[2]
            To_Name = line[8]
            
            flights.append((From_Name, To_Name))
                       
# save flight.txt
with open('./output/flight.txt', 'w') as f:
    for edge in flights:
        (source, target) = edge
        f.write('{}\t{}\n'.format(source, target))
print("the number of all flights: {}".format(len(flights)))

the number of all flights: 66133


In [133]:
'''
- Find current all cities' train stations, ferry ports 
- respectively according to `airports-extended.dat`, `airports-extended.dat`. 
- Meanwhile find all train stations, ferry ports.
'''
"""
headers: Airport ID, Name, City, Country, IATA, ICAO, Latitude, Longitude, Altitude, Timezone, DST, Tz database time zone, Type, Source
"""
all_ferry = set()
all_train = set()
train2centriods = dict()
ferry2centriods = dict()
train2city = dict()
ferry2city = dict()

with open('airports-extended.dat', 'r') as f:
    lines = f.readlines()
    
    for line in lines:
        line = line.rstrip().split(',')
        for l_i in range(len(line)):
            if "\"" in line[l_i]:
                line[l_i] = line[l_i].split("\"")[1]

        city = line[2]
        
        if city in all_cities:

            if line[-2] in {"station"}:
                all_train.add(line[1])
                train2centriods[line[1]] = (line[6], line[7])
                train2city[line[1]] = city
            elif line[-2] in {"port"}:
                all_ferry.add(line[1])
                ferry2centriods[line[1]] = (line[6], line[7])
                ferry2city[line[1]] = city

print("the number of all train stations: {}".format(len(all_train)))
print("the number of all ferry ports: {}".format(len(all_ferry)))

# save all_train
with open('./output/all_train.pickle', 'wb') as f:
    pickle.dump(all_train, f)
    
# save all_ferry
with open('./output/all_ferry.pickle', 'wb') as f:
    pickle.dump(all_ferry, f)
    
# save train2centriods
with open('./output/train2centriods.pickle', 'wb') as f:
    pickle.dump(train2centriods, f)
    
# save ferry2centriods
with open('./output/ferry2centriods.pickle', 'wb') as f:
    pickle.dump(ferry2centriods, f)
    
# save train2city
with open('./output/train2city.pickle', 'wb') as f:
    pickle.dump(train2city, f)
    
# save ferry2city
with open('./output/ferry2city.pickle', 'wb') as f:
    pickle.dump(ferry2city, f)

the number of all train stations: 478
the number of all ferry ports: 34


In [134]:
'''
Meanwhile:
- generate airport2city, city2state, state2country, country2continent mapping
- airport2centriods, city2centriods, state2centriods, country2centriods
- city2population


- Find all states = {states of current all cities} + {all states in `JHU COVID-19 database`} 
- (source: `regions.csv`, `airports.json` and `UID_ISO_FIPS_LookUp_Table.csv`)

- Find all countries = {countries of current all states} + {all countries in `JHU COVID-19 database`} 
- (source: `regions.csv`, `airports.json` and `UID_ISO_FIPS_LookUp_Table.csv`).

- Find all continents = {continents of current all countries} 
- (source: `regions.csv` and `airports.json`).
'''

# mapping abbr_continent_to_continent_name
'''
 Allowed values are "AF" (Africa), "AN" (Antarctica), "AS" (Asia), "EU" (Europe), "NA" (North America), "OC" (Oceania), or "SA" (South America)
'''
abbr_continent_to_continent_name = {
    "AF": "Africa",
    "AN": "Antarctica",
    "AS": "Asia",
    "EU": "Europe",
    "NA": "North America",
    "OC": "Oceania",
    "SA": "South America"
}

# initailize 
'''
we already have:
- all_airports, all_train, all_ferry, all_cities,
- train2city, ferry2city,
- train2centriods, ferry2centriods
'''
all_states = set()
all_countries = set()
all_continents = set()

airport2city = dict()
city2state = dict()
state2country = dict()
country2continent = dict()

airport2centriods = dict()
city2centriods = dict()
state2centriods = dict()
country2centriods = dict()

city2population = dict()

for airport in all_airports:
    airport2city[airport] = 'NULL'
    airport2centriods[airport] = 'NULL'

for city in all_cities:
    city2state[city] = 'NULL'
    city2centriods[city] = 'NULL'
    city2population[city] = 'NULL'

# airport2city
for airport in all_airports:
    airport2city[airport] = airport_in_flight_to_city[airport]
    
# save airport2city
with open('./output/airport2city.pickle', 'wb') as f:
    pickle.dump(airport2city, f)
    

# mapping iso_region_to_region_name
iso_region_to_region_name = dict()
with open('regions.csv','rt') as f:
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
            
        iso_region_to_region_name[line[1]] = line[3]
        
# mapping iso_country_to_country_name
iso2_country_to_country_name = dict()

country2continent_ourflight = dict()
with open('countries.csv','rt') as f:
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
        
        country_name = line[2]
        country_iso2 = line[1]
        
        iso2_country_to_country_name[country_iso2] = country_name
        
        country2continent_ourflight[country_name] = abbr_continent_to_continent_name[line[3]]


with open('UID_ISO_FIPS_LookUp_Table.csv','rt') as f:  # JHU COVID-19 database
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
        
        country_name = line[7]
        country_iso2 = line[1]
        country_iso3 = line[2]  # not used
        
        iso2_country_to_country_name[country_iso2] = country_name

# all_continent, city2state, state2country, country2continent, airport2centriods 
with open('airports.csv','rt') as f:
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
        
        airport_name = line[3]
        
        if airport_name == 'New York Stewart International Airport':
            airport_name = 'Stewart International Airport'
            
        if airport_name == 'Naujaat Airport':
            airport_name = 'Repulse Bay Airport'
        
        if airport_name in all_airports:
            city_name = airport2city[airport_name]
        
            if city_name in all_cities:
                airport2centriods[airport_name] = (line[4], line[5])

                iso_region = line[9]
                state_name = iso_region_to_region_name[iso_region]
                city2state[city_name] = state_name

                iso_country = line[8]
                country_name = iso2_country_to_country_name[iso_country]
                state2country[state_name] = country_name
                country2continent[country_name] = 'NULL'

                abbr_continent = line[7]
                continent_name = abbr_continent_to_continent_name[abbr_continent]
                country2continent[country_name] = continent_name
                all_continents.add(continent_name)

            
# save all_continent
print("the number of all continents that have country: {}".format(len(all_continents)))
print("all continents that have country: {}".format(all_continents))
all_continents = set(abbr_continent_to_continent_name.values())
with open('./output/all_continents.pickle', 'wb') as f:
    pickle.dump(all_continents, f)
print("the number of all continents: {}".format(len(all_continents)))
    

all_states = set(state2country.keys())
all_countries = set(country2continent.keys())


# update missing city2state, state2country, airport2centriods 
with open('airports.json','r') as f:
    data = json.loads(f.read())
    for i in range(len(data)):
        airport_name = data[i]['name']
        city_name = data[i]['city']
        state_name = data[i]['state']
        country_name = data[i]['country']
        
        if city_name in all_cities:
            
            if city2state[city_name] == 'NULL':
                city2state[city_name] = state_name
                if state_name not in all_states:
                    state2country[state_name] = 'NULL'
                
            try:
                tem = state2country[state_name]
            except KeyError:
                state2country[state_name] = country_name
                if country_name not in all_countries:
                    country2continent[country_name] = 'NULL'

            if airport_name in all_airports:
                if airport2centriods[airport_name] == 'NULL':
                    airport2centriods[airport_name] = (data[i]['lat'], data[i]['lon'])


# overwrite exisiting and update missing airport2centriods               
with open('flight_routes_processed_v2.csv','rt') as f:
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
            
        From_City = line[3]
        To_City = line[9]
        From_Name = line[2]
        To_Name = line[8]
        
        if From_Name in all_airports:
            if From_City in all_cities:
                airport2centriods[From_Name] = (line[5], line[6])     
                
        if To_Name in all_airports:
            if To_City in all_cities:
                airport2centriods[To_Name] = (line[11], line[12]) 
            
# save airport2centriods
with open('./output/airport2centriods.pickle', 'wb') as f:
    pickle.dump(airport2centriods, f)

    
# generate city2centriods, city2population
with open('worldcities_processed.csv','rt') as f:
    data = list(csv.reader(f))
    for i in range(1, len(data)):
        line = data[i]
        
        city_name = line[0]
        
        if city_name in all_cities:
            if city2centriods[city_name] == 'NULL':
                city2centriods[city_name] = (line[1], line[2])
            if city2population[city_name] == 'NULL':
                city2population[city_name] = line[-1]
        
with open('./output/city2centriods.pickle', 'wb') as f:
    pickle.dump(city2centriods, f)
with open('./output/city2population.pickle', 'wb') as f:
    pickle.dump(city2population, f)
    

all_states = set(state2country.keys())
all_countries = set(country2continent.keys())

# overwrite exisiting and update missing state2country, country2continent, generate state2centriods, country2centriods
with open('UID_ISO_FIPS_LookUp_Table.csv','rt') as f:
    data = list(csv.reader(f))
    # Header:  ['\ufeffUID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Lat', 'Long_', 'Combined_Key']
    
    for i in range(1, len(data)):
        line = data[i]
        
        state_name = line[6]
        country_name = line[7]
        
        centriods = (line[8], line[9])
            
        if not state_name == '':  # Province_State NOT empty
            state2country[state_name] = country_name
            state2centriods[state_name] = centriods
            
            if country_name not in all_countries:
                country2continent[country_name] = 'NULL'

        else:  # Province_State IS empty
            country2centriods[country_name] = centriods
            
            if country_name not in all_countries:
                country2continent[country_name] = 'NULL'
            
            
# save city2state
# for city in all_cities:
#     if city2state[city] == '':
#         pdb.set_trace()

with open('./output/city2state.pickle', 'wb') as f:
    pickle.dump(city2state, f)
    
    
# all_states, all_countries
del state2country['']

for state in state2country:
    if state2country[state] == 'United States':
        state2country[state] = 'US'
        
del country2continent['United States']

all_states = set(state2country.keys())
all_countries = set(country2continent.keys())

with open('./output/all_states.pickle', 'wb') as f:
    pickle.dump(all_states, f)
with open('./output/all_countries.pickle', 'wb') as f:
    pickle.dump(all_countries, f)

with open('./output/state2country.pickle', 'wb') as f:
    pickle.dump(state2country, f)
    
    
for state in all_states:
    try:
        tem = state2centriods[state]
    except KeyError:
        state2centriods[state] = 'NULL'
        
for country in all_countries:
    try:
        tem = country2centriods[country]
    except KeyError:
        country2centriods[country] = 'NULL'
        
with open('./output/state2centriods.pickle', 'wb') as f:
    pickle.dump(state2centriods, f)
with open('./output/country2centriods.pickle', 'wb') as f:
    pickle.dump(country2centriods, f)
    
    
for country in country2continent:
    if country2continent[country] == 'NULL':
        try:
            country2continent[country] = country2continent_ourflight[country] 
        except KeyError:
            pass
        
with open('./output/country2continent.pickle', 'wb') as f:
    pickle.dump(country2continent, f)
    
print("the number of all states: {}".format(len(all_states)))
print("the number of all countries: {}".format(len(all_countries)))
print(len(state2country))
print(len(country2continent))
print(len(state2centriods))
print(len(country2centriods))

the number of all continents that have country: 6
all continents that have country: {'Oceania', 'Europe', 'Africa', 'South America', 'Asia', 'North America'}
the number of all continents: 7
the number of all states: 1955
the number of all countries: 231
1955
231
1955
231


# Check all states

In [135]:
print("the number of all states: {}".format(len(all_states)))
print('Anhui' in all_states)
print('Beijing' in all_states)
print('Hubei' in all_states)
print('New York' in all_states)
print('New Jersey' in all_states)
print('Washington' in all_states)
print('California' in all_states)

the number of all states: 1955
True
True
True
True
True
True
True


# Check all countries

In [136]:
print("the number of all countries: {}".format(len(all_countries)))
print('US' in all_countries)
print('United States' in all_countries)  # should be False
print('China' in all_countries)
print('Italy' in all_countries)
print('United Kingdom' in all_countries)
print('Korea, South' in all_countries)
print('Japan' in all_countries)
print('Singapore' in all_countries)

the number of all countries: 231
True
False
True
True
True
True
True
True


# Final Check bottom-up

In [137]:
print('\nairport')
print(len(all_airports)) 
print(len(airport2city))
print(len(airport2centriods))
null_parent = 0
null_centriods = 0
for airport in all_airports:
    if airport2city[airport] == 'NULL':
        null_parent += 1
    if airport2centriods[airport] == 'NULL':
        null_centriods += 1    
print(null_parent)   
print(null_centriods)   


print('\ntrain')
print(len(all_train)) 
print(len(train2city))
print(len(train2centriods))
null_parent = 0
null_centriods = 0
for train in all_train:
    if train2city[train] == 'NULL':
        null_parent += 1
    if train2centriods[train] == 'NULL':
        null_centriods += 1    
print(null_parent)   
print(null_centriods)   


print('\nferry')
print(len(all_ferry)) 
print(len(ferry2city))
print(len(ferry2centriods))
null_parent = 0
null_centriods = 0
for ferry in all_ferry:
    if ferry2city[ferry] == 'NULL':
        null_parent += 1
    if ferry2centriods[ferry] == 'NULL':
        null_centriods += 1    
print(null_parent)   
print(null_centriods)   


print('\ncity')
print(len(all_cities)) 
print(len(city2state))
print(len(city2centriods))
print(len(city2population))
null_parent = 0
null_centriods = 0
null_population = 0
for city in all_cities:
    if city2state[city] == 'NULL':
        null_parent += 1
    if city2centriods[city] == 'NULL':
        null_centriods += 1  
    if city2population[city] == 'NULL':
        null_population += 1  
print(null_parent)   
print(null_centriods) 
print(null_population) 
# because city centriods and population are only from worldcities_processed

print('\nstate')
print(len(all_states)) 
print(len(state2country))
print(len(state2centriods))
null_parent = 0
null_centriods = 0
for state in all_states:
    if state2country[state] == 'NULL':
        null_parent += 1
print(null_parent)  
for state in state2centriods:
    if state2centriods[state] == 'NULL':
        null_centriods += 1   
print(null_centriods)   


print('\ncountry')
print(len(all_countries)) 
print(len(country2continent))
print(len(country2centriods))
null_parent = 0
null_centriods = 0
for country in all_countries:
    if country2continent[country] == 'NULL':
        null_parent += 1
print(null_parent)  
for country in country2centriods:
    if country2centriods[country] == 'NULL':
        null_centriods += 1   
print(null_centriods)   

print('\ncontinent')
print(len(all_continents)) 
print(all_continents)


airport
3227
3227
3227
0
0

train
478
478
478
0
0

ferry
34
34
34
0
0

city
3114
3114
3114
3114
0
1460
1460

state
1955
1955
1955
0
1820

country
231
231
231
15
58

continent
7
{'Oceania', 'Europe', 'Africa', 'South America', 'Asia', 'Antarctica', 'North America'}


# Final Check Top-down

In [138]:
# all country has chidren?
print('\ncountry')
print(len(all_countries)) 
p2c = dict()
for p in all_countries:
    p2c[p] = set()

for state in state2country:
    p2c[state2country[state]].add(state)
    
null_child = 0
for p in all_countries:
    if len(p2c[p]) == 0:
        null_child += 1
print(null_child) 

# all state has chidren?
print('\nstate')
print(len(all_states)) 
p2c = dict()
for p in all_states:
    p2c[p] = set()

for city in city2state:
    p2c[city2state[city]].add(city)
    
null_child = 0
for p in all_states:
    if len(p2c[p]) == 0:
        null_child += 1
print(null_child) 

# all city has airport?
print('\ncity airport')
print(len(all_cities)) 
p2c = dict()
for p in all_cities:
    p2c[p] = set()

for airport in airport2city:
    p2c[airport2city[airport]].add(airport)
    
null_child = 0
for p in all_cities:
    if len(p2c[p]) == 0:
        null_child += 1
print(null_child) 

# all city has train?
print('\ncity train')
print(len(all_cities)) 
p2c = dict()
for p in all_cities:
    p2c[p] = set()

for train in train2city:
    p2c[train2city[train]].add(train)
    
null_child = 0
for p in all_cities:
    if len(p2c[p]) == 0:
        null_child += 1
print(null_child) 


# all city has ferry?
print('\ncity ferry')
print(len(all_cities)) 
p2c = dict()
for p in all_cities:
    p2c[p] = set()

for ferry in ferry2city:
    p2c[ferry2city[ferry]].add(ferry)
    
null_child = 0
for p in all_cities:
    if len(p2c[p]) == 0:
        null_child += 1
print(null_child) 


country
231
24

state
1955
612

city airport
3114
0

city train
3114
2879

city ferry
3114
3086
