## Imports

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine

In [2]:
# Labels handed to `pd.read_csv(..., names=cols)`; they are assigned to the
# CSV's fields positionally, so the order below must match the file's layout.
# The line breaks are only visual grouping of similarly named columns.
cols = [
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
    'DOT_ID_Reporting_Airline', 'IATA_CODE_Reporting_Airline',
    'Tail_Number', 'Flight_Number_Reporting_Airline',
    'OriginAirportID', 'DestAirportID',
    'CRSDepTime', 'DepTime', 'DepDelay', 'DepDelayMinutes',
    'CRSArrTime', 'ArrTime', 'ArrDelay', 'ArrDelayMinutes',
    'Cancelled', 'CancellationCode', 'Diverted',
    'CRSElapsedTime', 'ActualElapsedTime', 'AirTime',
    'Flights', 'Distance',
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
]

In [3]:
# Relative locations (resolved against the kernel's working directory) of the
# SQLite database file and of the CSV that is loaded into it.
db_path = "../../dbs/delays.db"
file_path = "../../datasets/delays.csv"

In [4]:
# Read only the first 100 rows, labelled with `cols`, as a small sample frame.
df_sample = pd.read_csv(
    file_path,
    nrows=100,
    names=cols,
)

In [5]:
# Rebuild the `delay` table from scratch, then create the SQLAlchemy engine
# that the loading cells use to append rows to it.
#
# The declared column types are kept as written. NOTE(review): SQLite assigns
# type *affinities* from these names rather than enforcing them (e.g. widths
# and UNSIGNED are not checked) — see the SQLite datatype documentation.
create_query = ''' \
            CREATE TABLE delay
            (
                Year                            SMALLINT UNSIGNED,
                Quarter                         TINYINT UNSIGNED,
                Month                           TINYINT UNSIGNED,
                DayofMonth                      TINYINT UNSIGNED,
                DayOfWeek                       TINYINT UNSIGNED,
                FlightDate                      DATE,
                DOT_ID_Reporting_Airline        INT,
                IATA_CODE_Reporting_Airline     VARCHAR,
                Tail_Number                     VARCHAR,
                Flight_Number_Reporting_Airline VARCHAR,
                OriginAirportID                 INT,
                DestAirportID                   INT,
                CRSDepTime                      INT,
                DepTime                         INT,
                DepDelay                        INT,
                DepDelayMinutes                 INT,
                CRSArrTime                      INT,
                ArrTime                         INT,
                ArrDelay                        INT,
                ArrDelayMinutes                 INT,
                Cancelled                       TINYINT UNSIGNED,
                CancellationCode                VARCHAR,
                Diverted                        TINYINT UNSIGNED,
                CRSElapsedTime                  INT,
                ActualElapsedTime               INT,
                AirTime                         INT,
                Flights                         INT,
                Distance                        INT,
                CarrierDelay                    INT,
                WeatherDelay                    INT,
                NASDelay                        INT,
                SecurityDelay                   INT,
                LateAircraftDelay               INT
            );
    '''

conn = sqlite3.connect(db_path)
try:
    # Dropping first makes this cell safe to re-run: the table is always
    # recreated empty before the load appends to it.
    conn.execute("DROP TABLE IF EXISTS delay;")
    conn.execute(create_query)
    # Harmless if the DDL was already auto-committed; otherwise it ensures the
    # schema change is persisted before the connection is released.
    conn.commit()
finally:
    # Always release the connection, even when the DDL above fails, so a
    # failed run does not leave an open handle on the database file.
    conn.close()

engine = sqlalchemy.create_engine("sqlite:///"+db_path, echo=False)

In [6]:
def add_to_db(chunk: pd.DataFrame) -> None:
    """Append one chunk of rows to the ``delay`` table.

    The write goes through the module-level ``engine``. Rows are added to the
    existing table (``if_exists='append'``) and the frame's index is not
    written as a column (``index=False``).

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows to insert into ``delay``.

    Returns
    -------
    None
    """
    write_options = {'con': engine, 'if_exists': 'append', 'index': False}
    chunk.to_sql('delay', **write_options)

In [7]:
# Stream the CSV in pieces of `chunksize` rows and append each piece to the
# database, rather than reading the whole file into a single DataFrame.
chunksize = 500_000
with pd.read_csv(
    file_path,
    names=cols,
    chunksize=chunksize,
    low_memory=False,
) as reader:
    for chunk in reader:
        add_to_db(chunk)

In [8]:
# Open a fresh sqlite3 connection to the database file for the query cells.
conn = sqlite3.connect(database=db_path)

In [9]:
# Count the rows in `delay` per Year (the first selected column, hence
# `group by 1` / `order by 1`) and load the result into a DataFrame.
query = """ \
    select Year, count(*) as Records from delay
    group by 1
    order by 1
"""

df = pd.read_sql_query(sql=query, con=conn)

In [10]:
# Display the per-year record counts loaded into `df` by the query cell.
df

Unnamed: 0,Year,Records
0,2014,5819811
1,2015,5819079
2,2016,5617658
3,2017,5674621
4,2018,7213446
5,2019,7422037
6,2020,4688354
7,2021,5443512
