In [17]:
import pandas as pd

cols = ["id", "city", "country", "IATA", "ICAO", "latitude", "longitude", "altitude", "timezone", "DST", "Tz database time zone", "type", "source"]
airports = pd.read_csv('data/airports.csv', index_col=0, names=cols, header=None)
print(len(airports))

subset = ["IATA", "latitude", "longitude", "country", "city"]
airports = airports.dropna(subset=subset)
for col in subset:
    airports = airports[airports[col] != "\\N"]

print(len(airports))
airports

7698
6033


Unnamed: 0,id,city,country,IATA,ICAO,latitude,longitude,altitude,timezone,DST,Tz database time zone,type,source
1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.081690,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.207080,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.826790,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.443380,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14097,Bilogai-Sugapa Airport,Sugapa-Papua Island,Indonesia,UGU,WABV,-3.739560,137.031998,7348,\N,\N,\N,airport,OurAirports
14100,Ramon Airport,Eilat,Israel,ETM,LLER,29.723694,35.011416,288,\N,\N,\N,airport,OurAirports
14101,Rustaq Airport,Al Masna'ah,Oman,MNH,OORQ,23.640556,57.487500,349,\N,\N,\N,airport,OurAirports
14102,Laguindingan Airport,Cagayan de Oro City,Philippines,CGY,RPMY,8.612203,124.456496,190,\N,\N,\N,airport,OurAirports


In [18]:
continents = pd.read_csv('data/continents.csv')
## make a dictionary of country to continent
country_to_continent = {}
for index, row in continents.iterrows():
    country = row['Entity']
    country_to_continent[row['Entity']] = row['Continent']
    
## show the first 5 entries of the dictionary
dict(list(country_to_continent.items())[0:5])

{'Abkhazia': 'Asia',
 'Afghanistan': 'Asia',
 'Akrotiri and Dhekelia': 'Asia',
 'Aland Islands': 'Europe',
 'Albania': 'Europe'}

In [19]:
countries_from_airports = airports['country'].unique()
countries_from_continents = continents['Entity'].unique()

## find the countries that are in airports but not in continents
for country in countries_from_airports:
    if country not in countries_from_continents:
        print(country)
        
def fix_country_name(name : str) -> str:
    """
    Function for matching up the country names in the airports dataset with the country names in the continents dataset.
    """
    
    if name[:5] == "Congo":
        return "Congo"
    
    if name == "Czech Republic":
        return "Czechia"
    
    if name == "Macedonia":
        return "North Macedonia"
    
    if name == "Swaziland":
        return "Eswatini"
    
    if name == "Micronesia":
        return "Micronesia (country)"
    
    if name == "Burma":
        return "Myanmar"
    
    if name in ["Midway Islands", "US Virgin Islands", "Virgin Islands", "Johnston Atoll", "Wake Island"]:
        return "United States"
    
    if name == "Macau":
        return "China"
    
    if name == "Cocos (Keeling) Islands":
        return "Australia"
    
    if name == "Russian Federation":
        return "Russia"
    
    return name

Congo (Brazzaville)
Congo (Kinshasa)
Swaziland
Czech Republic
Macedonia
Midway Islands
Micronesia
Virgin Islands
Macau
Burma
Johnston Atoll
Cocos (Keeling) Islands
Wake Island


In [20]:
reviews = pd.read_csv('data/airportreviews.csv')
## filter out content and countries that is NaN
reviews = reviews.dropna(subset=['content', 'author_country'])
## make a new column for the continent of the author
reviews['author_continent'] = reviews['author_country'].apply(lambda x: country_to_continent[fix_country_name(x)])

## make a dictionary mapping continent to reviews
continent_to_reviews = {}
for continent in reviews['author_continent'].unique():
    continent_reviews = reviews[reviews['author_continent'] == continent]['content'].tolist()
    continent_to_reviews[continent] = continent_reviews
    
country_to_reviews = {}
for country in reviews['author_country'].unique():
    country_reviews = reviews[reviews['author_country'] == country]['content'].tolist()
    country_to_reviews[country] = country_reviews

# save the dictionary to a file
import pickle
with open('data/continent_to_reviews.pkl', 'wb') as f:
    pickle.dump(continent_to_reviews, f)
    
with open('data/country_to_reviews.pkl', 'wb') as f:
    pickle.dump(country_to_reviews, f)

In [23]:
## make a dictionary of IATA codes
## IATA code is a three-letter code designating many airports around the world
## for each airport, safe relevant information in a dictionary

IATA = {}
for index, row in airports.iterrows():
    country = row['country']
    
    IATA[row['IATA']] = {
        'city': row['city'],                                                ## city
        'country': country,                                          ## country
        'latitude': row['latitude'],                                        ## latitude
        'longitude': row['longitude'],                                      ## longitude
        'name': row['id'],                                                  ## name
        'continent': country_to_continent.get(fix_country_name(country), 'unknown')    ## continent
    }
    
## save the dictionary to a file
import json
with open('data/IATA.json', 'w') as f:
    json.dump(IATA, f)

## show the first 5 entries of the dictionary
for i, (k, v) in enumerate(IATA.items()):
    print(k, v)
    if i == 5:
        break
print("...")
    
## count how many airports are in each continent
continent_count = {}
for k, v in IATA.items():
    continent = v['continent']

    if continent not in continent_count:
        continent_count[continent] = 0
    continent_count[continent] += 1
    
print("Number of airports in each continent:")
print(continent_count)

GKA {'city': 'Goroka', 'country': 'Papua New Guinea', 'latitude': -6.081689834590001, 'longitude': 145.391998291, 'name': 'Goroka Airport', 'continent': 'Oceania'}
MAG {'city': 'Madang', 'country': 'Papua New Guinea', 'latitude': -5.20707988739, 'longitude': 145.789001465, 'name': 'Madang Airport', 'continent': 'Oceania'}
HGU {'city': 'Mount Hagen', 'country': 'Papua New Guinea', 'latitude': -5.826789855957031, 'longitude': 144.29600524902344, 'name': 'Mount Hagen Kagamuga Airport', 'continent': 'Oceania'}
LAE {'city': 'Nadzab', 'country': 'Papua New Guinea', 'latitude': -6.569803, 'longitude': 146.725977, 'name': 'Nadzab Airport', 'continent': 'Oceania'}
POM {'city': 'Port Moresby', 'country': 'Papua New Guinea', 'latitude': -9.44338035583496, 'longitude': 147.22000122070312, 'name': 'Port Moresby Jacksons International Airport', 'continent': 'Oceania'}
WWK {'city': 'Wewak', 'country': 'Papua New Guinea', 'latitude': -3.58383011818, 'longitude': 143.669006348, 'name': 'Wewak Internati

In [24]:
cols = ["airline", "airline id", "source airport", "source airport id", "destination airport", "destination airport id", "codeshare", "stops", "equipment"]
flights = pd.read_csv('data/flights.csv', names=cols, header=None)
print(len(flights))
flights = flights.dropna(subset=['source airport', 'destination airport'])
flights = flights[flights['stops'] == 0]  ## only direct flights
print(len(flights))
flights

67663
67652


Unnamed: 0,airline,airline id,source airport,source airport id,destination airport,destination airport id,codeshare,stops,equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2
...,...,...,...,...,...,...,...,...,...
67658,ZL,4178,WYA,6334,ADL,3341,,0,SF3
67659,ZM,19016,DME,4029,FRU,2912,,0,734
67660,ZM,19016,FRU,2912,DME,4029,,0,734
67661,ZM,19016,FRU,2912,OSS,2913,,0,734


In [25]:
## makes a graph of the flights
## the graph is a directed graph since the flights are one way
## nodes are cities, and edges are flights between cities
## the weight of the edge is the number of flights between the cities

import networkx as nx
from tqdm import tqdm
G = nx.DiGraph()

print("Making nodes with attributes..")
for iata, data in tqdm(IATA.items()):
    G.add_node(
        iata, 
        city=data['city'], 
        country=data['country'], 
        latitude=data['latitude'], 
        longitude=data['longitude'], 
        continent=data['continent'],
        name=data['name'],
        
        group=data['continent']
        )

print("Making edges..")
for index, row in tqdm(flights.iterrows(), total=flights.shape[0]):
    source = row['source airport']
    dest = row['destination airport']
    if source in IATA.keys() and dest in IATA.keys():
        if G.has_edge(source, dest):
            G[source][dest]['weight'] += 1
        else:
            G.add_edge(source, dest, weight=1)
            
## remove nodes with no edges
G.remove_nodes_from(list(nx.isolates(G)))

## 
import pickle
with open('data/graphnetwork.gpickle', 'wb') as f:
    pickle.dump(G, f, pickle.HIGHEST_PROTOCOL)

print("Number of nodes: ", G.number_of_nodes())
print("Number of edges: ", G.number_of_edges())

Making nodes with attributes..


100%|██████████| 6033/6033 [00:00<00:00, 95831.95it/s]


Making edges..


100%|██████████| 67652/67652 [00:06<00:00, 10107.14it/s]

Number of nodes:  3256
Number of edges:  37038



