In [1]:
import pandas as pd
import numpy as np
import sqlite3
from sqlite3 import Error

## Using SQLite

In [2]:
### Path to the SQLite delays database; download the db from the link in the README first.
db_path = "../../dbs/delays.db"

In [3]:
def create_connection(path):
    """Open a SQLite connection to the database file at ``path``.

    On success the connection is returned with ``text_factory`` set to
    ``str``. If ``sqlite3`` raises an ``Error`` while connecting, the
    message is printed and ``None`` is returned instead of raising.
    """
    conn = None
    try:
        conn = sqlite3.connect(path)
        # Text columns come back as Python str objects.
        conn.text_factory = str
    except Error as exc:
        # Best effort: report the problem and let the caller see None.
        print("Error occurred: " + str(exc))
    return conn


def execute_query(connection, query):
    """Run ``query`` on ``connection`` and return the result as a DataFrame.

    Parameters
    ----------
    connection : sqlite3.Connection
        An open connection, e.g. the one returned by ``create_connection``.
    query : str
        SQL text to execute.

    Returns
    -------
    pandas.DataFrame or str
        The result set on success. The string ``"Query Blank"`` when the
        query is ``None``, empty or whitespace-only. Otherwise a string
        starting with ``"Error occurred: "`` that describes the failure.
    """
    # Whitespace-only text carries no statement either, so treat it like "".
    if query is None or query.strip() == "":
        return "Query Blank"
    try:
        return pd.read_sql_query(query, connection)
    except (Error, pd.errors.DatabaseError) as e:
        # pandas re-raises driver failures (bad SQL, missing table, ...) as
        # its own DatabaseError, which is not a sqlite3.Error subclass, so
        # both types must be caught to honour the error-string contract.
        return "Error occurred: " + str(e)

In [4]:
# Shared connection used by every query cell; create_connection returns None if sqlite3 raises an Error.
connection = create_connection(db_path)

### Case 1: Number of late arrivals that had nothing to do with late departures

In [5]:
# Count the rows whose departure delay is zero while the arrival delay is non-zero.
sample_query = """
        select 
         count(*)
        from 
          delay 
        where 
          DepDelay=0 and ArrDelay!=0
    """

only_late_arrivals_count = execute_query(connection, sample_query)

In [6]:
# Show the one-row frame holding the count for DepDelay=0 and ArrDelay!=0.
only_late_arrivals_count

Unnamed: 0,count(*)
0,2339257


### Case 2: Number of late departures that did not cause a delayed arrival

In [8]:
# Count the rows whose departure delay is non-zero while the arrival delay is zero.
sample_query = """
        select 
         count(*)
        from 
          delay 
        where 
          DepDelay!=0 and ArrDelay=0
    """

only_late_departures_count = execute_query(connection, sample_query)

In [9]:
# Show the one-row frame holding the count for DepDelay!=0 and ArrDelay=0.
only_late_departures_count

Unnamed: 0,count(*)
0,996999


In [10]:
# Count the rows where both the departure delay and the arrival delay are positive.
sample_query = """
        select 
         count(*)
        from 
          delay 
        where 
          DepDelay>0 and ArrDelay>0
    """

both_late_count = execute_query(connection, sample_query)

In [11]:
# Show the one-row frame holding the count for DepDelay>0 and ArrDelay>0.
both_late_count

Unnamed: 0,count(*)
0,43317820


In [12]:
# Count the rows where neither a departure delay nor an arrival delay was recorded.
sample_query = """
        select 
         count(*)
        from 
          delay 
        where 
          DepDelay=0 and ArrDelay=0
    """

not_late_count = execute_query(connection, sample_query)

In [13]:
# Show the one-row frame holding the count for DepDelay=0 and ArrDelay=0.
not_late_count

Unnamed: 0,count(*)
0,1044442


In [14]:
# Count every row in the delay table (no filter); serves as the overall total.
sample_query = """
        select 
         count(*)
        from 
          delay 
    """

all_count = execute_query(connection, sample_query)

In [15]:
# Show the one-row frame holding the unfiltered row count of the delay table.
all_count

Unnamed: 0,count(*)
0,47698518


In [39]:
# Fetch the delay columns for rows with a zero departure delay and a non-zero arrival delay.
sample_query = """
        select 
          ArrDelay, 
          DepDelay, 
          CarrierDelay, 
          WeatherDelay, 
          NASDelay, 
          SecurityDelay, 
          LateAircraftDelay 
        from 
          delay 
        where 
          DepDelay=0 and ArrDelay!=0
    """

only_late_arrivals = execute_query(connection, sample_query)

In [40]:
# Per column, count the rows whose value is greater than 1.
# NOTE(review): the threshold here is 1, while the departures and both-late
# summaries compare against 0 -- confirm that excluding values of exactly 1 is intended.
only_late_arrivals.gt(1).sum()

ArrDelay             2194018
DepDelay                   0
CarrierDelay            4131
WeatherDelay             563
NASDelay              128857
SecurityDelay              2
LateAircraftDelay          4
dtype: int64

In [27]:
# Fetch the delay columns for rows with a non-zero departure delay and a zero arrival delay.
sample_query = """
        select 
          ArrDelay, 
          DepDelay, 
          CarrierDelay, 
          WeatherDelay, 
          NASDelay, 
          SecurityDelay, 
          LateAircraftDelay 
        from 
          delay 
        where 
          DepDelay!=0 and ArrDelay=0
    """

only_late_departures = execute_query(connection, sample_query)

In [29]:
# Per column, count the rows whose value is positive.
only_late_departures.gt(0).sum()

ArrDelay                  0
DepDelay             996999
CarrierDelay              0
WeatherDelay              0
NASDelay                  0
SecurityDelay             0
LateAircraftDelay         0
dtype: int64

In [44]:
# Fetch the delay columns for rows where both the departure and the arrival delay are non-zero.
# NOTE(review): the matching count cell filters with ">0" while this one uses "!=0";
# the two agree on the saved outputs, but confirm no negative delays can appear.
sample_query = """
        select 
          ArrDelay, 
          DepDelay, 
          CarrierDelay, 
          WeatherDelay, 
          NASDelay, 
          SecurityDelay, 
          LateAircraftDelay 
        from 
          delay 
        where 
          DepDelay!=0 and ArrDelay!=0
    """

both_late = execute_query(connection, sample_query)

In [45]:
# Preview the frame; for a frame this long the notebook renders only the first and last rows.
both_late

Unnamed: 0,ArrDelay,DepDelay,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,4,5,0,0,0,0,0
1,20,11,11,0,9,0,0
2,2,9,0,0,0,0,0
3,7,1,0,0,0,0,0
4,16,6,0,0,0,0,0
...,...,...,...,...,...,...,...
43317815,11,5,0,0,0,0,0
43317816,17,10,0,0,0,0,0
43317817,3,2,0,0,0,0,0
43317818,1,1,0,0,0,0,0


In [46]:
# Per column, count the rows whose value is positive.
both_late.gt(0).sum()

ArrDelay             43317820
DepDelay             43317820
CarrierDelay          4317837
WeatherDelay           485096
NASDelay              4294164
SecurityDelay           31001
LateAircraftDelay     4269470
dtype: int64

In [48]:
# Fetch the delay columns for rows where both the departure and the arrival delay are zero.
sample_query = """
        select 
          ArrDelay, 
          DepDelay, 
          CarrierDelay, 
          WeatherDelay, 
          NASDelay, 
          SecurityDelay, 
          LateAircraftDelay 
        from 
          delay 
        where 
          DepDelay=0 and ArrDelay=0
    """

not_late = execute_query(connection, sample_query)

In [49]:
# Per column, count the rows whose value is greater than 1.
# NOTE(review): the threshold here is 1, while the departures and both-late
# summaries compare against 0 -- confirm which one is intended.
not_late.gt(1).sum()

ArrDelay             0
DepDelay             0
CarrierDelay         0
WeatherDelay         0
NASDelay             0
SecurityDelay        0
LateAircraftDelay    0
dtype: int64

In [57]:
# Count delayed rows (positive departure or arrival delay) whose five
# cause columns add up to zero, i.e. delays with no attributed cause.
sample_query = """
        select 
          count(*)
        from 
          delay 
        where 
          (DepDelay>0 or ArrDelay>0) and (CarrierDelay+WeatherDelay+NASDelay+SecurityDelay+LateAircraftDelay)=0
    """

missing_count = execute_query(connection, sample_query)

In [58]:
# Show the count of delayed rows whose five cause columns sum to zero.
missing_count

Unnamed: 0,count(*)
0,38291333


In [61]:
# Count every row with a positive departure delay or a positive arrival delay.
sample_query = """
        select 
          count(*)
        from 
          delay 
        where 
          (DepDelay>0 or ArrDelay>0) 
    """

delays_count = execute_query(connection, sample_query)

In [63]:
# Share of delayed rows (DepDelay>0 or ArrDelay>0) whose five cause columns sum to zero.
missing_count.div(delays_count)

Unnamed: 0,count(*)
0,0.82075


https://docs.google.com/spreadsheets/d/1qOYd4FlanBMzBeVONpaUvONmv46YaPCB_l2JQbjzvOU/edit?usp=sharing