In [None]:
# Data from
# https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236

# Airport timezones
# https://gist.github.com/andrewcole/8760689

In [1]:
import pandas as pd 
import numpy as np
import datetime
from pytz import timezone

In [2]:
# Airport time-zone lookup table, downloaded as CSV from a pinned gist revision.
tz = pd.read_csv(
    "https://gist.githubusercontent.com/mj1856/6d219c48697c550c2476/raw/"
    "ce1bc0666772fd9bcd839318b713836b78bba9be/timezones.csv"
)

In [3]:
# Key the table by IATA airport code and discard the Windows zone names,
# which leaves the IANA zone name as the value looked up per airport.
tz = tz.set_index('iata_code').drop(columns='windows_tz')

In [4]:
def get_time_zone(x):
    """Return the time zone name recorded for an airport code.

    `x` is used as a row label into the module-level `tz` table and the value
    of that row's `iana_tz` column is returned, e.g. 'JFK' -> 'America/New_York'.
    A label that is not in the table's index raises KeyError.
    """
    return tz.loc[x, 'iana_tz']

get_time_zone('JFK')

'America/New_York'

In [5]:
# Read the flight export with every column kept as raw text (dtype 'object'),
# so the HHMM clock fields are not turned into numbers on load.
df = pd.read_table(
    "JAN2018.csv",
    dtype='object',
    sep=",",
)

In [6]:
df.head()

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,Unnamed: 18
0,2018-01-01,UA,2429,EWR,DEN,1517,1512,-5.0,1745,1722,-23.0,0.0,,0.0,268.0,250.0,225.0,1605.0,
1,2018-01-01,UA,2427,LAS,SFO,1115,1107,-8.0,1254,1230,-24.0,0.0,,0.0,99.0,83.0,65.0,414.0,
2,2018-01-01,UA,2426,SNA,DEN,1335,1330,-5.0,1649,1636,-13.0,0.0,,0.0,134.0,126.0,106.0,846.0,
3,2018-01-01,UA,2425,RSW,ORD,1546,1552,6.0,1756,1754,-2.0,0.0,,0.0,190.0,182.0,157.0,1120.0,
4,2018-01-01,UA,2424,ORD,ALB,630,650,20.0,922,936,14.0,0.0,,0.0,112.0,106.0,83.0,723.0,


In [7]:
df.dtypes

FL_DATE                object
OP_UNIQUE_CARRIER      object
OP_CARRIER_FL_NUM      object
ORIGIN                 object
DEST                   object
CRS_DEP_TIME           object
DEP_TIME               object
DEP_DELAY              object
CRS_ARR_TIME           object
ARR_TIME               object
ARR_DELAY              object
CANCELLED              object
CANCELLATION_CODE      object
DIVERTED               object
CRS_ELAPSED_TIME       object
ACTUAL_ELAPSED_TIME    object
AIR_TIME               object
DISTANCE               object
Unnamed: 18            object
dtype: object

In [8]:
def to_time(x):
    """Convert an "HHMM" clock string into "HH:MM".

    Parameters
    ----------
    x : object
        Normally a string of up to four digits such as "1517". Anything that
        is not a string is returned unchanged.

    Returns
    -------
    object
        "HH:MM" for a four-character string (after zero-padding shorter
        all-digit strings, so "630" becomes "06:30"). "2400" is mapped to
        "00:00"; the calendar date is not advanced here. Any other value,
        e.g. the text "nan" or an empty string, is returned as received.
    """
    # isinstance also accepts str subclasses, which an exact type comparison
    # would silently skip.
    if not isinstance(x, str):
        return x

    # Restore leading zeros that were stripped upstream ("630" -> "0630").
    if len(x) < 4 and x.isascii() and x.isdigit():
        x = x.zfill(4)

    if len(x) != 4:
        return x
    if x == '2400':
        return '00:00'
    return x[:2] + ':' + x[2:]

In [9]:
# Reformat the four raw clock columns with to_time, casting each value to
# text first so that to_time always receives a string.
for column in ('CRS_DEP_TIME', 'CRS_ARR_TIME', 'DEP_TIME', 'ARR_TIME'):
    df[column] = df[column].astype(str).apply(to_time)

In [10]:
# Prefix every clock column with the flight date, giving "<FL_DATE> <time>" text.
for column in ('CRS_DEP_TIME', 'CRS_ARR_TIME', 'DEP_TIME', 'ARR_TIME'):
    df[column] = df['FL_DATE'] + " " + df[column]


In [11]:
# Parse the combined "date time" text into timestamps.
timestamp_format = '%Y-%m-%d %H:%M'

# Scheduled times: any value that does not match the format raises.
for column in ('CRS_DEP_TIME', 'CRS_ARR_TIME'):
    df[column] = pd.to_datetime(df[column], format=timestamp_format)

# Actual times: values that cannot be parsed are coerced to NaT instead.
for column in ('DEP_TIME', 'ARR_TIME'):
    df[column] = pd.to_datetime(df[column], format=timestamp_format, errors='coerce')

In [12]:
def convert(row):
    """Localize a row's scheduled departure to its origin airport's time zone.

    Looks up the zone for row['ORIGIN'] through get_time_zone and returns
    row['CRS_DEP_TIME'] localized to that zone.
    """
    origin_zone = timezone(get_time_zone(row['ORIGIN']))
    scheduled_departure = row['CRS_DEP_TIME']
    return origin_zone.localize(scheduled_departure)

df['CRS_DEP_TIME'] = df.apply(convert, axis=1)

In [13]:
def convert(row):
    """Localize a row's actual departure to its origin airport's time zone.

    Returns None when row['DEP_TIME'] is missing; otherwise returns it
    localized to the zone that get_time_zone reports for row['ORIGIN'].
    Defining this rebinds the name `convert`.
    """
    actual_departure = row['DEP_TIME']
    if pd.isna(actual_departure):
        return None

    origin_zone = timezone(get_time_zone(row['ORIGIN']))
    return origin_zone.localize(actual_departure)

df['DEP_TIME'] = df.apply(convert, axis=1)

In [14]:
def convert_dest_time(row):
    """Localize a row's actual arrival to its destination airport's time zone.

    Returns None when row['ARR_TIME'] is missing; otherwise returns it
    localized to the zone that get_time_zone reports for row['DEST'].
    """
    actual_arrival = row['ARR_TIME']
    if pd.isna(actual_arrival):
        return None

    destination_zone = timezone(get_time_zone(row['DEST']))
    return destination_zone.localize(actual_arrival)

df['ARR_TIME'] = df.apply(convert_dest_time, axis=1)

In [15]:
def convert_dest_time(row):
    """Localize a row's scheduled arrival to its destination airport's time zone.

    Looks up the zone for row['DEST'] through get_time_zone and returns
    row['CRS_ARR_TIME'] localized to that zone. Defining this rebinds the
    name `convert_dest_time`.
    """
    destination_zone = timezone(get_time_zone(row['DEST']))
    scheduled_arrival = row['CRS_ARR_TIME']
    return destination_zone.localize(scheduled_arrival)

df['CRS_ARR_TIME'] = df.apply(convert_dest_time, axis=1)

In [16]:
df.head()

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,Unnamed: 18
0,2018-01-01,UA,2429,EWR,DEN,2018-01-01 15:17:00-05:00,2018-01-01 15:12:00-05:00,-5.0,2018-01-01 17:45:00-07:00,2018-01-01 17:22:00-07:00,-23.0,0.0,,0.0,268.0,250.0,225.0,1605.0,
1,2018-01-01,UA,2427,LAS,SFO,2018-01-01 11:15:00-08:00,2018-01-01 11:07:00-08:00,-8.0,2018-01-01 12:54:00-08:00,2018-01-01 12:30:00-08:00,-24.0,0.0,,0.0,99.0,83.0,65.0,414.0,
2,2018-01-01,UA,2426,SNA,DEN,2018-01-01 13:35:00-08:00,2018-01-01 13:30:00-08:00,-5.0,2018-01-01 16:49:00-07:00,2018-01-01 16:36:00-07:00,-13.0,0.0,,0.0,134.0,126.0,106.0,846.0,
3,2018-01-01,UA,2425,RSW,ORD,2018-01-01 15:46:00-05:00,2018-01-01 15:52:00-05:00,6.0,2018-01-01 17:56:00-06:00,2018-01-01 17:54:00-06:00,-2.0,0.0,,0.0,190.0,182.0,157.0,1120.0,
4,2018-01-01,UA,2424,ORD,ALB,2018-01-01 06:30:00-06:00,2018-01-01 06:50:00-06:00,20.0,2018-01-01 09:22:00-05:00,2018-01-01 09:36:00-05:00,14.0,0.0,,0.0,112.0,106.0,83.0,723.0,


In [17]:
df['DEP_TIME'].loc[0] > df['ARR_TIME'].loc[0]

False

In [18]:
# FL_DATE has already been folded into the timestamp columns; 'Unnamed: 18' is
# shown empty in the preview above.
df = df.drop(columns=['FL_DATE', 'Unnamed: 18'])

In [19]:
# Give the carrier, flight-number and timestamp columns shorter names.
df = df.rename(columns={
    "OP_UNIQUE_CARRIER": "AIRLINE",
    "OP_CARRIER_FL_NUM": "FLIGHT_NUM",
    'CRS_DEP_TIME': 'SCHED_DEP',
    'DEP_TIME': 'ACTUAL_DEP',
    'CRS_ARR_TIME': 'SCHED_ARR',
    'ARR_TIME': 'ACTUAL_ARR',
})

In [20]:
df.dtypes

AIRLINE                object
FLIGHT_NUM             object
ORIGIN                 object
DEST                   object
SCHED_DEP              object
ACTUAL_DEP             object
DEP_DELAY              object
SCHED_ARR              object
ACTUAL_ARR             object
ARR_DELAY              object
CANCELLED              object
CANCELLATION_CODE      object
DIVERTED               object
CRS_ELAPSED_TIME       object
ACTUAL_ELAPSED_TIME    object
AIR_TIME               object
DISTANCE               object
dtype: object

In [21]:
df.head(10)

Unnamed: 0,AIRLINE,FLIGHT_NUM,ORIGIN,DEST,SCHED_DEP,ACTUAL_DEP,DEP_DELAY,SCHED_ARR,ACTUAL_ARR,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE
0,UA,2429,EWR,DEN,2018-01-01 15:17:00-05:00,2018-01-01 15:12:00-05:00,-5.0,2018-01-01 17:45:00-07:00,2018-01-01 17:22:00-07:00,-23.0,0.0,,0.0,268.0,250.0,225.0,1605.0
1,UA,2427,LAS,SFO,2018-01-01 11:15:00-08:00,2018-01-01 11:07:00-08:00,-8.0,2018-01-01 12:54:00-08:00,2018-01-01 12:30:00-08:00,-24.0,0.0,,0.0,99.0,83.0,65.0,414.0
2,UA,2426,SNA,DEN,2018-01-01 13:35:00-08:00,2018-01-01 13:30:00-08:00,-5.0,2018-01-01 16:49:00-07:00,2018-01-01 16:36:00-07:00,-13.0,0.0,,0.0,134.0,126.0,106.0,846.0
3,UA,2425,RSW,ORD,2018-01-01 15:46:00-05:00,2018-01-01 15:52:00-05:00,6.0,2018-01-01 17:56:00-06:00,2018-01-01 17:54:00-06:00,-2.0,0.0,,0.0,190.0,182.0,157.0,1120.0
4,UA,2424,ORD,ALB,2018-01-01 06:30:00-06:00,2018-01-01 06:50:00-06:00,20.0,2018-01-01 09:22:00-05:00,2018-01-01 09:36:00-05:00,14.0,0.0,,0.0,112.0,106.0,83.0,723.0
5,UA,2422,ORD,OMA,2018-01-01 22:41:00-06:00,2018-01-01 22:44:00-06:00,3.0,2018-01-01 00:14:00-06:00,2018-01-01 00:03:00-06:00,-11.0,0.0,,0.0,93.0,79.0,62.0,416.0
6,UA,2421,IAH,LAS,2018-01-01 07:50:00-06:00,2018-01-01 07:47:00-06:00,-3.0,2018-01-01 09:16:00-08:00,2018-01-01 09:00:00-08:00,-16.0,0.0,,0.0,206.0,193.0,173.0,1222.0
7,UA,2420,DEN,CID,2018-01-01 13:24:00-07:00,2018-01-01 13:18:00-07:00,-6.0,2018-01-01 16:19:00-06:00,2018-01-01 16:00:00-06:00,-19.0,0.0,,0.0,115.0,102.0,85.0,692.0
8,UA,2419,SMF,EWR,2018-01-01 22:24:00-08:00,2018-01-01 22:37:00-08:00,13.0,2018-01-01 06:38:00-05:00,2018-01-01 06:36:00-05:00,-2.0,0.0,,0.0,314.0,299.0,280.0,2500.0
9,UA,2418,RIC,DEN,2018-01-01 16:01:00-05:00,2018-01-01 15:59:00-05:00,-2.0,2018-01-01 18:13:00-07:00,2018-01-01 17:56:00-07:00,-17.0,0.0,,0.0,252.0,237.0,217.0,1482.0


In [24]:
def fix_arr(row, dep_col='SCHED_DEP', arr_col='SCHED_ARR'):
    """Roll an arrival forward one day when it falls before its departure.

    Both timestamps are built from the same flight date, so an overnight
    flight initially gets an arrival that precedes its departure.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (a DataFrame row, a dict).
    dep_col, arr_col : str
        Names of the departure and arrival entries to compare. The defaults
        keep the original scheduled-time behaviour; pass 'ACTUAL_DEP' and
        'ACTUAL_ARR' to apply the same rule to the actual times.

    Returns
    -------
    The arrival value, shifted by one day if the departure is later than it.
    A missing departure or arrival leaves the arrival untouched.
    """
    departure = row[dep_col]
    arrival = row[arr_col]

    # Missing values (None / NaT) cannot be ordered; comparing against None
    # would raise TypeError, so hand the arrival back unchanged.
    if pd.isna(departure) or pd.isna(arrival):
        return arrival

    if departure > arrival:
        return arrival + datetime.timedelta(days=1)
    return arrival
    
# Apply fix_arr to every row to correct overnight scheduled arrivals.
# NOTE(review): ACTUAL_ARR is not adjusted here, so an actual arrival shortly
# after midnight keeps the departure's date - confirm whether that is intended.
df['SCHED_ARR'] = df.apply(fix_arr, axis=1)


In [25]:
# Preview the first rows after the arrival fix. The previous expression indexed
# the frame with itself, which is not a valid boolean mask and raises on a
# fresh run.
df.head(10)

Unnamed: 0,AIRLINE,FLIGHT_NUM,ORIGIN,DEST,SCHED_DEP,ACTUAL_DEP,DEP_DELAY,SCHED_ARR,ACTUAL_ARR,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE
0,UA,2429,EWR,DEN,2018-01-01 15:17:00-05:00,2018-01-01 15:12:00-05:00,-5.0,2018-01-01 17:45:00-07:00,2018-01-01 17:22:00-07:00,-23.0,0.0,,0.0,268.0,250.0,225.0,1605.0
1,UA,2427,LAS,SFO,2018-01-01 11:15:00-08:00,2018-01-01 11:07:00-08:00,-8.0,2018-01-01 12:54:00-08:00,2018-01-01 12:30:00-08:00,-24.0,0.0,,0.0,99.0,83.0,65.0,414.0
2,UA,2426,SNA,DEN,2018-01-01 13:35:00-08:00,2018-01-01 13:30:00-08:00,-5.0,2018-01-01 16:49:00-07:00,2018-01-01 16:36:00-07:00,-13.0,0.0,,0.0,134.0,126.0,106.0,846.0
3,UA,2425,RSW,ORD,2018-01-01 15:46:00-05:00,2018-01-01 15:52:00-05:00,6.0,2018-01-01 17:56:00-06:00,2018-01-01 17:54:00-06:00,-2.0,0.0,,0.0,190.0,182.0,157.0,1120.0
4,UA,2424,ORD,ALB,2018-01-01 06:30:00-06:00,2018-01-01 06:50:00-06:00,20.0,2018-01-01 09:22:00-05:00,2018-01-01 09:36:00-05:00,14.0,0.0,,0.0,112.0,106.0,83.0,723.0
5,UA,2422,ORD,OMA,2018-01-01 22:41:00-06:00,2018-01-01 22:44:00-06:00,3.0,2018-01-02 00:14:00-06:00,2018-01-01 00:03:00-06:00,-11.0,0.0,,0.0,93.0,79.0,62.0,416.0
6,UA,2421,IAH,LAS,2018-01-01 07:50:00-06:00,2018-01-01 07:47:00-06:00,-3.0,2018-01-01 09:16:00-08:00,2018-01-01 09:00:00-08:00,-16.0,0.0,,0.0,206.0,193.0,173.0,1222.0
7,UA,2420,DEN,CID,2018-01-01 13:24:00-07:00,2018-01-01 13:18:00-07:00,-6.0,2018-01-01 16:19:00-06:00,2018-01-01 16:00:00-06:00,-19.0,0.0,,0.0,115.0,102.0,85.0,692.0
8,UA,2419,SMF,EWR,2018-01-01 22:24:00-08:00,2018-01-01 22:37:00-08:00,13.0,2018-01-02 06:38:00-05:00,2018-01-01 06:36:00-05:00,-2.0,0.0,,0.0,314.0,299.0,280.0,2500.0
9,UA,2418,RIC,DEN,2018-01-01 16:01:00-05:00,2018-01-01 15:59:00-05:00,-2.0,2018-01-01 18:13:00-07:00,2018-01-01 17:56:00-07:00,-17.0,0.0,,0.0,252.0,237.0,217.0,1482.0


In [None]:
# Rename columns to the camelCase names used for the export.
# DataFrame.rename skips labels that are not present, so both the intermediate
# names produced by the earlier rename cell and the raw BTS names are listed;
# whichever set the frame currently carries is the one that takes effect.
df = df.rename(columns={
    # Names introduced by the earlier rename cell
    'AIRLINE': 'airline',
    'FLIGHT_NUM': 'flightNumber',
    'SCHED_DEP': 'scheduledDeparture',
    'ACTUAL_DEP': 'actualDeparture',
    'SCHED_ARR': 'scheduledArrival',
    'ACTUAL_ARR': 'actualArrival',
    # Raw BTS names
    'FL_DATE': 'departureDate',
    'OP_UNIQUE_CARRIER': 'airline',
    'OP_CARRIER_FL_NUM': 'flightNumber',
    'ORIGIN': 'departureAirport',
    'DEST': 'arrivalAirport',
    'CRS_DEP_TIME': 'scheduledDeparture',
    'DEP_TIME': 'actualDeparture',
    'CRS_ARR_TIME': 'scheduledArrival',
    'ARR_TIME': 'actualArrival',
    'ARR_DELAY': 'arrivalDelay',
    'CANCELLED': 'isCancelled',
    'DIVERTED': 'isDiverted',
    'CRS_ELAPSED_TIME': 'scheduledDuration',
    'ACTUAL_ELAPSED_TIME': 'actualDuration',
    'WEATHER_DELAY': 'weatherDelay',
    'NAS_DELAY': 'airTrafficDelay',
    'SECURITY_DELAY': 'securityDelay',
    'LATE_AIRCRAFT_DELAY': 'lateAircraftDelay',
    'CARRIER_DELAY': 'carrierDelay',
    'DEP_DELAY': 'departureDelay',
})

In [None]:

df.head()

In [None]:
# Connect to SQL
import os
from urllib.parse import quote

from sqlalchemy import create_engine

# Connection details are read from the environment so that no credential is
# stored in the notebook. A missing variable raises KeyError before any
# connection attempt is made.
#   FLIGHTS_DB_HOST      database host name or address
#   FLIGHTS_DB_USER      database user
#   FLIGHTS_DB_PASSWORD  password for that user
# NOTE(review): the password that used to be embedded in this cell should be
# treated as compromised and rotated.
conn_string = 'mysql://{user}:{password}@{host}/flights?charset=utf8mb4'.format(
    host = os.environ['FLIGHTS_DB_HOST'],
    # Percent-encode so characters such as '@', ':' or '/' cannot break the URL.
    user = quote(os.environ['FLIGHTS_DB_USER'], safe=''),
    password = quote(os.environ['FLIGHTS_DB_PASSWORD'], safe=''))

engine = create_engine(conn_string)
con = engine.connect()


In [None]:
# Append the frame to the flight_data_new table, 1000 rows per batch, without
# writing the index. Because of if_exists='append', running this cell again
# adds the same rows a second time.
df.to_sql(
    name="flight_data_new",
    con=engine,
    if_exists='append',
    index=False,
    chunksize=1000,
)
print("Done")