In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import math
import numpy

def haversine(lon1, lat1, lon2, lat2, radius_km=6367):
    """Return the great-circle distance between two points on a sphere.

    Parameters
    ----------
    lon1, lat1 : float or str
        Longitude and latitude of the first point, in decimal degrees.
        Any value accepted by ``float()`` is allowed.
    lon2, lat2 : float or str
        Longitude and latitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere, in kilometres. The result is expressed in the
        same unit as this value. Defaults to 6367.

    Returns
    -------
    float
        Distance along the sphere's surface. NaN coordinates give NaN.

    Raises
    ------
    ValueError
        If a coordinate cannot be converted by ``float()``.
    """
    # Coerce to float first (inputs may be strings), then degrees -> radians.
    lon1, lat1, lon2, lat2 = [math.radians(float(value))
                              for value in (lon1, lat1, lon2, lat2)]

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2

    # Rounding can push `a` marginally outside [0, 1], which would make
    # sqrt()/asin() raise a domain error. Explicit comparisons are used
    # instead of min()/max() so that a NaN is left untouched and propagates.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.asin(math.sqrt(a))
    return radius_km * c

def calc_dist(row):
    """Return the haversine distance between a route's two airports.

    ``row`` must provide "source_id" and "dest_id"; each is matched against
    the "id" column of the module-level ``airports`` frame, and the first
    matching airport supplies the coordinates.

    Returns 0 when either airport has no match (IndexError) or when the
    distance computation raises ValueError.
    """
    try:
        # First airport row whose id equals the route's source / destination id.
        origin = airports[airports["id"] == row["source_id"]].iloc[0]
        target = airports[airports["id"] == row["dest_id"]].iloc[0]
        return haversine(
            target["longitude"], target["latitude"],
            origin["longitude"], origin["latitude"],
        )
    except (ValueError, IndexError):
        # Best-effort: unresolved routes are reported with a distance of 0.
        return 0

def lookup_name(row):
    """Return the name of the airline whose id equals ``row["id"]``.

    The lookup runs against the module-level ``airlines`` frame and takes the
    first match. An empty string is returned when nothing matches
    (IndexError) or when the lookup raises ValueError.
    """
    try:
        names = airlines["name"]
        id_matches = airlines["id"] == row["id"]
        return names[id_matches].iloc[0]
    except (ValueError, IndexError):
        return ""

def _read_headerless(path, column_names):
    """Read a header-less CSV with every value kept as a string, then label its columns."""
    table = pd.read_csv(path, header=None, dtype=str)
    table.columns = column_names
    return table

# Airports data.
airports = _read_headerless(
    "airports.dat",
    ["id", "name", "city", "country", "code", "icao", "latitude",
     "longitude", "altitude", "offset", "dst", "timezone", "type", "source"],
)

# Airlines data.
airlines = _read_headerless(
    "airlines.dat",
    ["id", "name", "alias", "iata", "icao", "callsign", "country", "active"],
)

# Routes data.
routes = _read_headerless(
    "routes.dat",
    ["airline", "airline_id", "source", "source_id", "dest", "dest_id", "codeshare",
     "stops", "equipment"],
)

# Keep only the rows of "routes" whose "airline_id" is not the literal string "\N".
has_airline_id = routes["airline_id"] != "\\N"
routes = routes[has_airline_id]

# Distance of every route; calc_dist yields 0 for routes it cannot resolve.
route_lengths = routes.apply(calc_dist, axis=1)

# Pair each route length with the id of the airline operating the route.
route_length_df = pd.DataFrame({"length": route_lengths, "id": routes["airline_id"]})

# Compute the mean route length per airline.
# The string "mean" selects pandas' own group-wise mean; passing the NumPy
# callable instead is deprecated in recent pandas releases.
airline_route_lengths = route_length_df.groupby("id").aggregate("mean")

# Sort by length (longest first) so we can make a better chart.
airline_route_lengths = airline_route_lengths.sort_values("length", ascending=False)

# Add the index (the airline ids) as a regular column.
airline_route_lengths["id"] = airline_route_lengths.index.copy()

# Find all the airline names.
airline_route_lengths["name"] = airline_route_lengths.apply(lookup_name, axis=1)

# Replace the airline-id index with a plain 0..n-1 positional index
# (the ids remain available in the "id" column).
airline_route_lengths.index = range(airline_route_lengths.shape[0])

# Bin the routes into long, medium and short distances, each expressed as a
# fraction of all routes.
#   short:  length <= 2000
#   medium: 2000 < length <= 10000
#   long:   length > 10000
# The inclusive upper bounds make the three bins cover every non-NaN length.
# Routes that calc_dist could not resolve have length 0 and land in "short".
LONG_ROUTE_MIN = 10000
SHORT_ROUTE_MAX = 2000

# float() forces true division: under Python 2, int / int floors, which
# would make every fraction below 1 come out as 0.
total_routes = float(len(route_lengths))

if total_routes:
    long_routes = sum(1 for k in route_lengths if k > LONG_ROUTE_MIN) / total_routes
    medium_routes = sum(1 for k in route_lengths
                        if SHORT_ROUTE_MAX < k <= LONG_ROUTE_MIN) / total_routes
    short_routes = sum(1 for k in route_lengths if k <= SHORT_ROUTE_MAX) / total_routes
else:
    # No routes at all: report empty bins instead of dividing by zero.
    long_routes = medium_routes = short_routes = 0.0

# Call form of print produces the same output on Python 2 and Python 3.
print(route_lengths[0:5])

0    1505.879589
1    1039.785086
2     447.883531
3     770.024740
4    1337.791014
dtype: float64


In [1]:
# Using Matplotlib to generate a histogram

#plt.hist(route_lengths, bins=20)

In [2]:
# Sorted bar chart of route lengths

#plt.bar(range(airline_route_lengths.shape[0]), airline_route_lengths["length"])

In [16]:
# Load foo.dat with every column read as a string, and show it unfiltered.
foo = pd.read_csv('./foo.dat', dtype=str)

# Call form of print produces the same output on Python 2 and Python 3.
print(foo)
print("--")

# Drop the rows whose "airline_id" is the literal string "\N", then show
# what remains so the effect of the filter can be compared with the above.
foo = foo[foo["airline_id"] != "\\N"]
print(foo)

   airline airline_id source source_id dest dest_id codeshare stops  equipment
0       3G        595    LOS       273  ROB    1063       NaN     0      319^M
1       3G        595    OXB      5665  CKY    4162       NaN     0      319^M
2       3G        595    OXB      5665  DKR    1084       NaN     0      319^M
3       3G        595    ROB      1063  ACC     248       NaN     0      319^M
4       3G        595    ROB      1063  FNA    1059       NaN     0      319^M
5       3G        595    ROB      1063  LOS     273       NaN     0      319^M
6       3H         \N    AKV      5506  YIK    5504       NaN     0      DH8^M
7       3H         \N    AKV      5506  YPX    6727       NaN     0      DH8^M
8       3H         \N    XGR      5512  YVP     154       NaN     0  DHT BET^M
9       3H         \N    YGL        62  YGW    5496       NaN     0      DHT^M
10      3H         \N    YGL        62  YPX    6727       NaN     0      73M^M
--
  airline airline_id source source_id dest dest_i