In [96]:
import pandas as pd
import numpy as np

# Raw Data Playground
Thisi notebook is meant for playing with the raw data and experimenting. To use it, become familiar with [pandas](https://pandas.pydata.org/docs/).

## Loading the data
The Raw data can be loaded into any notebook like so:

In [2]:
from code_snippets.raw_data_loader import *

(Feel free to take a look at the code in the code_snippets/ directory)

This gives us the following clearly named pandas dataframes:

In [3]:
boundary_raw_df.head(5) # .head() here and in the following few cells just grabs the first few elements

Unnamed: 0,0,1
0,Alaska,"POLYGON((-141.0205 70.0187,-141.7291 70.1292,-..."
1,"AlabamaPOLYGON((-88.1955 35.0041,-85.6068 34.9...",
2,Arkansas,"POLYGON((-94.0416 33.0225,-91.2057 33.0075,-91..."
3,Arizona,"POLYGON((-112.5989 36.9993,-110.8630 37.0004,-..."
4,California,"POLYGON((-124.4009 41.9983,-123.6237 42.0024,-..."


In [4]:
airlines_raw_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1,Unknown,\N,-,,\N,\N,Y
1,1,Private flight,\N,-,,,,Y
2,2,135 Airways,\N,,GNL,GENERAL,United States,N
3,3,1Time Airline,\N,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,\N,,WYT,,United Kingdom,N


In [5]:
airports_raw_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


In [6]:
routes_raw_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


For the .dat files the loader assumes values are seperated but commas, and the .tsv files are assumed to have tab seperated values. Of course, not all of the rows were stored well, so there are missing values, empty columns, and other bad data.

Once we've figured out how to clean the data, Jared will turn the code into library functions like the code_snippets.raw_data_loader above and we can make a new playground to explore the cleaned data.

In [7]:
# Start playing here!

# Data Cleaning: What's been done so far
So far, 
- null-ish value have been set to NaN
- new dataframes have been created for the clean data:
    - airports_df
    - airlines_df
    - routes_df
    - boundary_df

# Cleaning Airports Data

In [41]:
# set  index given by data
airports_df = airports_raw_df.set_index(airports_raw_df[0])[[0,1,2,3,4,5,6,7,8,9,10,11,12,13]]

In [42]:
# Rename columns
airport_col_names = {
    0: "Airport ID",
    1: "Name",
    2: "City",
    3: "Country",
    4: "IATA",
    5: "ICAO",
    6: "Latitude",
    7: "Longitude",
    8: "Altitude",
    9: "Timezone",
    10: "DST",
    11: "Tz database timezone",
    12: "Type",
    13: "Source"
}
airports_df = airports_df.rename(columns=airport_col_names)

In [97]:
# replace '\N' with None
airports_df = airports_df.replace('\\N', np.nan)

In [113]:
# I noticed an "-", so we'll make those null too
airports_df.replace(['-'], np.nan, inplace=True)
airports_df.head()

Unnamed: 0,Airline ID,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,-1,,,,,,,Y
1,1,Private flight,,,,,,Y
2,2,135 Airways,,,GNL,GENERAL,United States,N
3,3,1Time Airline,,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,,,WYT,,United Kingdom,N


In [118]:
# I noticed an "Unknown", so we'll make those null too
airports_df.replace(['Unknown'], np.nan, inplace=True)
airports_df.head()

Unnamed: 0_level_0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database timezone,Type,Source
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
2,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
3,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
4,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
5,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


In [119]:
# Finding the null values
airports_df.isna().sum()

Airport ID               0
Name                     0
City                    49
Country                  0
IATA                     0
ICAO                     0
Latitude                 0
Longitude                0
Altitude                 0
Timezone                 0
DST                      0
Tz database timezone     0
Type                     0
Source                   0
dtype: int64

In [120]:
# They're all in 'City'.
airports_df[airports_df['City'].isna()].head()

Unnamed: 0_level_0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database timezone,Type,Source
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11794,11794,Minsk Mazowiecki Military Air Base,,Poland,BXP,EPMM,52.195499,21.655899,604,-5,A,Europe/Moscow,airport,OurAirports
11795,11795,Powidz Military Air Base,,Poland,BXP,EPPW,52.379398,17.853901,371,-5,A,Europe/Moscow,airport,OurAirports
11900,11900,King Salman Abdulaziz Airport,,Saudi Arabia,DWD,OEDM,24.4499,44.121201,3026,-5,A,Europe/Moscow,airport,OurAirports
11901,11901,King Khaled Air Base,,Saudi Arabia,KMX,OEKM,18.2973,42.803501,6778,-5,A,Europe/Moscow,airport,OurAirports
11921,11921,Asahikawa Airfield,,Japan,WWA,RJCA,43.794734,142.365432,377,-5,A,Europe/Moscow,airport,OurAirports


We could look these up and fill the values in, but we should only do this  if other data refers to these airports

# Cleaning Airline Data

In [88]:
# rename columns
airline_col_names = {
    0: "Airline ID",
    1: "Name",
    2: "Alias",
    3: "IATA",
    4: "ICAO",
    5: "Callsign",
    6: "Country",
    7: "Active"
}
airlines_df = airlines_raw_df.rename(columns=airline_col_names)
airlines_df.head()

Unnamed: 0,Airline ID,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,-1,Unknown,\N,-,,\N,\N,Y
1,1,Private flight,\N,-,,,,Y
2,2,135 Airways,\N,,GNL,GENERAL,United States,N
3,3,1Time Airline,\N,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,\N,,WYT,,United Kingdom,N


In [100]:
# Replace '\N' with None
airlines_df.replace(['\\N'], np.nan, inplace=True)
airlines_df.head()

Unnamed: 0,Airline ID,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,-1,Unknown,,-,,,,Y
1,1,Private flight,,-,,,,Y
2,2,135 Airways,,,GNL,GENERAL,United States,N
3,3,1Time Airline,,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,,,WYT,,United Kingdom,N


In [104]:
# I noticed an "Unknown value", so we'll make those null too
airlines_df.replace(['Unknown'], np.nan, inplace=True)
airlines_df.head()

Unnamed: 0,Airline ID,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,-1,,,-,,,,Y
1,1,Private flight,,-,,,,Y
2,2,135 Airways,,,GNL,GENERAL,United States,N
3,3,1Time Airline,,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,,,WYT,,United Kingdom,N


In [106]:
# I noticed an "-", so we'll make those null too
airlines_df.replace(['-'], np.nan, inplace=True)
airlines_df.head()

Unnamed: 0,Airline ID,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,-1,,,,,,,Y
1,1,Private flight,,,,,,Y
2,2,135 Airways,,,GNL,GENERAL,United States,N
3,3,1Time Airline,,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,,,WYT,,United Kingdom,N


In [107]:
# Here's where the null data lives
airlines_df.isna().sum()

Airline ID       0
Name             1
Alias         1723
IATA          4630
ICAO            87
Callsign       809
Country         16
Active           0
dtype: int64

This is an infeasible number of null values to go through and fill in, so we need to either filter them out, or ensure we have checks in the functions that use this data

# Cleaning Route Data

In [114]:
# Set column names
route_col_names = {
    0: "Airline",
    1: "Airline ID",
    2: "Source airport",
    3: "Source airport ID",
    4: "Destination airport",
    5: "Destination airport ID",
    6: "Codeshare",
    7: "Stops",
    8: "Equipment"
}
routes_df = routes_raw_df.rename(columns=route_col_names)
routes_df.head()

Unnamed: 0,Airline,Airline ID,Source airport,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


In [115]:
# Replace '\N' with None
routes_df.replace(['\\N'], np.nan, inplace=True)
routes_df.head()

Unnamed: 0,Airline,Airline ID,Source airport,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


In [116]:
# I noticed an "Unknown value", so we'll make those null too
routes_df.replace(['Unknown'], np.nan, inplace=True)
routes_df.head()

Unnamed: 0,Airline,Airline ID,Source airport,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


In [117]:
# I noticed an "-", so we'll make those null too
routes_df.replace(['-'], np.nan, inplace=True)
routes_df.head()

Unnamed: 0,Airline,Airline ID,Source airport,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


# Cleaning Boundary Data

In [121]:
boundary_col_names = {
    0: 'State',
    1: 'Polygon'
}
boundary_df = boundary_raw_df.rename(columns=boundary_col_names)
boundary_df.head()

Unnamed: 0,State,Polygon
0,Alaska,"POLYGON((-141.0205 70.0187,-141.7291 70.1292,-..."
1,"AlabamaPOLYGON((-88.1955 35.0041,-85.6068 34.9...",
2,Arkansas,"POLYGON((-94.0416 33.0225,-91.2057 33.0075,-91..."
3,Arizona,"POLYGON((-112.5989 36.9993,-110.8630 37.0004,-..."
4,California,"POLYGON((-124.4009 41.9983,-123.6237 42.0024,-..."


In [124]:
# Finding bad state names by seeing where polygon is null
boundary_df.isna().sum()

State      0
Polygon    2
dtype: int64

In [142]:
# Row 1 and 14 have this form
boundary_df.loc[:, boundary_df.isna().any()]

Unnamed: 0,Polygon
0,"POLYGON((-141.0205 70.0187,-141.7291 70.1292,-..."
1,
2,"POLYGON((-94.0416 33.0225,-91.2057 33.0075,-91..."
3,"POLYGON((-112.5989 36.9993,-110.8630 37.0004,-..."
4,"POLYGON((-124.4009 41.9983,-123.6237 42.0024,-..."
5,"POLYGON((-109.0448 37.0004,-102.0424 36.9949,-..."
6,"POLYGON((-73.4875 42.0498,-73.4247 42.0511,-72..."
7,"POLYGON((-75.7919 39.7188,-75.7837 39.5210,-75..."
8,"POLYGON((-87.6050 30.9988,-86.5613 30.9964,-85..."
9,"POLYGON((-85.6082 34.9974,-84.7266 34.9906,-84..."


In [146]:
boundary_df.iloc[1]['State']

'AlabamaPOLYGON((-88.1955 35.0041,-85.6068 34.9918,-85.1756 32.8404,-84.8927 32.2593,-85.0342 32.1535,-85.1358 31.7947,-85.0438 31.5200,-85.0836 31.3384,-85.1070 31.2093,-84.9944 31.0023,-87.6009 30.9953,-87.5926 30.9423,-87.6256 30.8539,-87.4072 30.6745,-87.3688 30.4404,-87.5240 30.1463,-88.3864 30.1546,-88.4743 31.8939,-88.1021 34.8938,-88.1721 34.9479,-88.1461 34.9107,-88.1955 35.0041))'

In [148]:
# Breaking apart the cell above
boundary_df.iloc[1]['State'] = 'Alabama'
boundary_df.iloc[1]['Polygon'] = 'POLYGON((-88.1955 35.0041,-85.6068 34.9918,-85.1756 32.8404,-84.8927 32.2593,-85.0342 32.1535,-85.1358 31.7947,-85.0438 31.5200,-85.0836 31.3384,-85.1070 31.2093,-84.9944 31.0023,-87.6009 30.9953,-87.5926 30.9423,-87.6256 30.8539,-87.4072 30.6745,-87.3688 30.4404,-87.5240 30.1463,-88.3864 30.1546,-88.4743 31.8939,-88.1021 34.8938,-88.1721 34.9479,-88.1461 34.9107,-88.1955 35.0041))'
boundary_df.iloc[1]

State                                                Alabama
Polygon    POLYGON((-88.1955 35.0041,-85.6068 34.9918,-85...
Name: 1, dtype: object

In [149]:
boundary_df.iloc[14]['State']

'IndianaPOLYGON((-87.5253 41.7611,-84.8090 41.7611,-84.8199 39.0981,-84.8927 39.0533,-84.8625 38.8996,-84.8268 38.8312,-84.8145 38.7841,-84.8941 38.7905,-84.9861 38.7809,-85.1797 38.6877,-85.4420 38.7198,-85.4091 38.5653,-85.5986 38.4461,-85.7510 38.2695,-85.8266 38.2824,-85.8376 38.2414,-85.9035 38.0967,-85.9200 38.0232,-86.0477 37.9594,-86.0944 38.0102,-86.2729 38.0578,-86.2811 38.0935,-86.2729 38.1346,-86.3704 38.1842,-86.5187 38.0416,-86.5874 37.9193,-86.6409 37.8402,-86.6478 37.9085,-86.6876 37.9085,-86.8236 37.9821,-86.9019 37.9464,-87.0392 37.9009,-87.1394 37.7924,-87.4429 37.9464,-87.5885 37.9756,-87.6283 37.9225,-87.6915 37.8694,-87.8879 37.9236,-87.9620 37.7718,-88.0321 37.7870,-88.0376 37.8092,-88.0643 37.8011,-88.0925 37.8206,-88.0451 37.8223,-88.0575 37.8483,-88.0980 37.9041,-88.0705 37.9307,-88.0369 37.9561,-88.0122 37.9669,-88.0259 38.0102,-88.0417 38.0384,-88.0005 38.0530,-87.9607 38.0762,-88.0163 38.1000,-87.9710 38.1313,-87.9284 38.1497,-87.9387 38.1734,-87.9730 38.19

In [150]:
# Breaking apart the cell above
boundary_df.iloc[14]['State'] = 'Indiana'
boundary_df.iloc[14]['Polygon'] = 'POLYGON((-87.5253 41.7611,-84.8090 41.7611,-84.8199 39.0981,-84.8927 39.0533,-84.8625 38.8996,-84.8268 38.8312,-84.8145 38.7841,-84.8941 38.7905,-84.9861 38.7809,-85.1797 38.6877,-85.4420 38.7198,-85.4091 38.5653,-85.5986 38.4461,-85.7510 38.2695,-85.8266 38.2824,-85.8376 38.2414,-85.9035 38.0967,-85.9200 38.0232,-86.0477 37.9594,-86.0944 38.0102,-86.2729 38.0578,-86.2811 38.0935,-86.2729 38.1346,-86.3704 38.1842,-86.5187 38.0416,-86.5874 37.9193,-86.6409 37.8402,-86.6478 37.9085,-86.6876 37.9085,-86.8236 37.9821,-86.9019 37.9464,-87.0392 37.9009,-87.1394 37.7924,-87.4429 37.9464,-87.5885 37.9756,-87.6283 37.9225,-87.6915 37.8694,-87.8879 37.9236,-87.9620 37.7718,-88.0321 37.7870,-88.0376 37.8092,-88.0643 37.8011,-88.0925 37.8206,-88.0451 37.8223,-88.0575 37.8483,-88.0980 37.9041,-88.0705 37.9307,-88.0369 37.9561,-88.0122 37.9669,-88.0259 38.0102,-88.0417 38.0384,-88.0005 38.0530,-87.9607 38.0762,-88.0163 38.1000,-87.9710 38.1313,-87.9284 38.1497,-87.9387 38.1734,-87.9730 38.1939,-87.9813 38.2349,-87.9421 38.2608,-87.8604 38.2759,-87.8302 38.3029,-87.8350 38.3233,-87.8137 38.3567,-87.7739 38.3767,-87.7444 38.4116,-87.6448 38.5149,-87.6723 38.5460,-87.6105 38.5949,-87.6242 38.5986,-87.5343 38.6828,-87.5075 38.7284,-87.4972 38.7696,-87.5322 38.8247,-87.5171 38.9039,-87.5253 38.9413,-87.5281 38.9712,-87.5761 38.9872,-87.6228 39.0906,-87.6517 39.1066,-87.6599 39.1365,-87.6366 39.1695,-87.5899 39.2493,-87.5336 39.3492,-87.5253 41.7600,-87.5253 41.7611))'
boundary_df.iloc[14]

State                                                Indiana
Polygon    POLYGON((-87.5253 41.7611,-84.8090 41.7611,-84...
Name: 14, dtype: object

In [154]:
print(f"Tuples in airports: {airports_df.shape[0]}")
print(f"Tuples in airlines: {airlines_df.shape[0]}")
print(f"Tuples in routes: {routes_df.shape[0]}")
print(f"Tuples in boundarys: {boundary_df.shape[0]}")


print(f"Attributes in airports: {airports_df.shape[1]}")
print(f"Attributes in airlines: {airlines_df.shape[1]}")
print(f"Attributes in routes: {routes_df.shape[1]}")
print(f"Attributes in boundarys: {boundary_df.shape[1]}")

Tuples in airports: 7698
Tuples in airlines: 6162
Tuples in routes: 67663
Tuples in boundarys: 50
Attributes in airports: 14
Attributes in airlines: 8
Attributes in routes: 9
Attributes in boundarys: 2


(6162, 8)

In [160]:
airports_df.to_csv("clean_data/airports.dat", header=False, index=False)
airlines_df.to_csv("clean_data/airlines.dat", header=False, index=False)
routes_df.to_csv("clean_data/routes.dat", header=False, index=False)
boundary_df.to_csv("clean_data/boundary-each-state.tsv", sep="\t", header=False, index=False)

# cities

In [161]:
cities_df = pd.read_csv("raw_data/cities.csv")

In [163]:
cities_df.head()

Unnamed: 0,Pullman,-117.167126_46.736689
0,Phoenix,-112.092128_33.505533
