# Step 1 - Automated data ingestion

In [23]:
from datetime import datetime, timedelta
from os import environ
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import requests
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy_utils import database_exists, create_database, drop_database

# The API access key is read from the environment so the secret is never
# stored in the notebook itself.
api_key = environ.get('aviation_key')
if not api_key:
    # environ.get() returns None for an unset variable; report it here instead
    # of letting every later request go out without credentials.
    print("Warning: environment variable 'aviation_key' is not set; API requests will not be authorized.")

# Endpoint queried by fetch_flights_for_date.
api_url = 'https://api.aviationstack.com/v1/flights'

def daterange(start_date, end_date):
    """Yield every day from start_date through end_date, both ends included.

    Nothing is yielded when end_date falls before start_date.
    """
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset <= span_days:
        yield start_date + timedelta(offset)
        offset += 1

def fetch_flights_for_date(flight_date, airline, timeout=30):
    """Fetch the flight records of one airline for one day.

    Args:
        flight_date: Object with ``strftime``; sent to the API as YYYY-MM-DD.
        airline: Airline name sent as the ``airline_name`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list stored under the response's ``data`` key, or an empty list
        when the request fails, the status code is not 200, the body is not
        valid JSON, or ``data`` is missing/null. Failures are reported with
        print() and never raised, so the caller's loop keeps going.

    NOTE(review): only one response is read per call; if the API pages its
    results, records beyond the first page are not fetched -- confirm.
    """
    params = {
        'access_key': api_key,
        'flight_date': flight_date.strftime('%Y-%m-%d'),
        'airline_name': airline
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(api_url, params=params, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"Failed to fetch data for {flight_date}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data for {flight_date}: {response.status_code}")
        return []

    try:
        payload = response.json()
    except ValueError as exc:
        # A 200 response whose body is not JSON is treated like any other failure.
        print(f"Failed to fetch data for {flight_date}: {exc}")
        return []

    # 'data' can be absent or explicitly null; callers always get a list.
    records = payload.get('data') if isinstance(payload, dict) else None
    return records or []

def fetch_flights(airlines, start_date_str, end_date_str):
    """Collect flights for every airline on every day of an inclusive date range.

    Args:
        airlines: Iterable of airline names.
        start_date_str: First day of the range, formatted YYYY-MM-DD.
        end_date_str: Last day of the range, formatted YYYY-MM-DD.

    Returns:
        A DataFrame with one row per fetched flight record, or the string
        "No data available for the given parameters" when nothing was fetched.
    """
    date_format = '%Y-%m-%d'
    first_day = datetime.strptime(start_date_str, date_format)
    last_day = datetime.strptime(end_date_str, date_format)

    # Airline-major order: all days of the first airline, then the next one.
    collected = [
        record
        for carrier in airlines
        for day in daterange(first_day, last_day)
        for record in fetch_flights_for_date(day, carrier)
    ]

    if not collected:
        return "No data available for the given parameters"
    return pd.DataFrame(collected)




In [28]:
# Airlines and inclusive date window (YYYY-MM-DD) to ingest.
airline = ["United Airlines", "Scoot"]
start_date = "2024-02-15"
end_date = "2024-02-20"

df = fetch_flights(airline, start_date, end_date)

# fetch_flights returns a message string instead of a DataFrame when nothing
# was fetched. Stop here with that message rather than failing obscurely in
# the cells below, which index df by column name.
if isinstance(df, str):
    raise ValueError(df)

In [29]:
# Preview the fetched records. The departure, arrival, airline, flight and
# aircraft columns hold one nested dict per row (or are empty).
df

Unnamed: 0,flight_date,flight_status,departure,arrival,airline,flight,aircraft,live
0,2024-02-15,landed,"{'airport': 'Los Angeles International', 'time...","{'airport': 'Melbourne - Tullamarine Airport',...","{'name': 'United Airlines', 'iata': 'UA', 'ica...","{'number': '98', 'iata': 'UA98', 'icao': 'UAL9...","{'registration': 'N19986', 'iata': 'B789', 'ic...",
1,2024-02-15,landed,"{'airport': 'Denver International', 'timezone'...",{'airport': 'Fort Lauderdale–Hollywood Interna...,"{'name': 'United Airlines', 'iata': 'UA', 'ica...","{'number': '2328', 'iata': 'UA2328', 'icao': '...",,
2,2024-02-15,scheduled,"{'airport': 'Lester B. Pearson International',...","{'airport': 'Dublin International', 'timezone'...","{'name': 'United Airlines', 'iata': 'UA', 'ica...","{'number': '8683', 'iata': 'UA8683', 'icao': '...",,
3,2024-02-15,landed,"{'airport': 'Denver International', 'timezone'...","{'airport': 'Frankfurt International Airport',...","{'name': 'United Airlines', 'iata': 'UA', 'ica...","{'number': '8879', 'iata': 'UA8879', 'icao': '...",,
4,2024-02-15,landed,"{'airport': 'Denver International', 'timezone'...","{'airport': 'Tampa International', 'timezone':...","{'name': 'United Airlines', 'iata': 'UA', 'ica...","{'number': '717', 'iata': 'UA717', 'icao': 'UA...","{'registration': 'N36444', 'iata': 'B739', 'ic...",
...,...,...,...,...,...,...,...,...
1195,2024-02-20,landed,"{'airport': 'Tianhe International', 'timezone'...","{'airport': 'Singapore Changi', 'timezone': 'A...","{'name': 'Scoot', 'iata': 'TR', 'icao': 'TGW'}","{'number': '123', 'iata': 'TR123', 'icao': 'TG...","{'registration': '9V-NCD', 'iata': 'A21N', 'ic...",
1196,2024-02-20,landed,"{'airport': 'Phuket International', 'timezone'...","{'airport': 'Singapore Changi', 'timezone': 'A...","{'name': 'Scoot', 'iata': 'TR', 'icao': 'TGW'}","{'number': '653', 'iata': 'TR653', 'icao': 'TG...","{'registration': '9V-NCI', 'iata': 'A21N', 'ic...",
1197,2024-02-20,landed,"{'airport': 'Juanda', 'timezone': 'Asia/Jakart...","{'airport': 'Singapore Changi', 'timezone': 'A...","{'name': 'Scoot', 'iata': 'TR', 'icao': 'TGW'}","{'number': '267', 'iata': 'TR267', 'icao': 'TG...","{'registration': '9V-TRQ', 'iata': 'A320', 'ic...",
1198,2024-02-20,landed,"{'airport': 'Hasanudin', 'timezone': 'Asia/Mak...","{'airport': 'Singapore Changi', 'timezone': 'A...","{'name': 'Scoot', 'iata': 'TR', 'icao': 'TGW'}","{'number': '235', 'iata': 'TR235', 'icao': 'TG...",,


In [30]:
# Each nested column of df holds one dict per flight. Expanding the dicts with
# an explicit column list gives every table a fixed schema: keys that are not
# listed are dropped, and listed keys that are absent become missing values.
depart_tb = pd.DataFrame(
    df['departure'].to_list(),
    columns=['airport', 'timezone', 'iata', 'icao',
             'terminal', 'gate', 'delay', 'scheduled',
             'estimated', 'actual', 'estimated_runway', 'actual_runway'],
)

arrival_tb = pd.DataFrame(
    df['arrival'].to_list(),
    columns=['airport', 'timezone', 'iata', 'icao',
             'terminal', 'gate', 'baggage', 'delay', 'scheduled',
             'estimated', 'actual', 'estimated_runway', 'actual_runway'],
)

airline_tb = pd.DataFrame(df['airline'].to_list(), columns=['name', 'iata', 'icao'])

flight_tb = pd.DataFrame(df['flight'].to_list(), columns=['number', 'iata', 'icao', 'codeshared'])

# Aircraft details are missing for some flights. dropna() removes both None
# and NaN placeholders so only real dicts reach the constructor. As a result
# this table has fewer rows than df and its index does not line up with df's.
aircraft_tb = pd.DataFrame(
    df['aircraft'].dropna().to_list(),
    columns=['registration', 'iata', 'icao', 'icao24'],
)



In [31]:
 # Synthesize a key for each departure/arrival row by concatenating the airport
# IATA code with the scheduled, estimated and actual fields. A missing part
# contributes an empty string instead of turning the whole key into NaN.
for leg_tb in (depart_tb, arrival_tb):
    leg_tb['id'] = (
        leg_tb['iata'].fillna('')
        + leg_tb['scheduled'].fillna('')
        + leg_tb['estimated'].fillna('')
        + leg_tb['actual'].fillna('')
    )

In [33]:
# Helper table used only to derive the aircraft key on df. Unlike aircraft_tb
# it keeps one row per row of df, filling every field with None when a flight
# has no aircraft details.
aircraft_aux = pd.DataFrame([
    {
        field: (record.get(field) if record else None)
        for field in ('registration', 'iata', 'icao', 'icao24')
    }
    for record in df['aircraft']
])

In [34]:
# Key columns on the main flights table. The departure/arrival keys are taken
# from the 'id' columns already built on depart_tb and arrival_tb, so the keys
# stored in df cannot drift from the keys stored in those tables. Every helper
# table used here has one row per row of df, in the same order, so plain
# column assignment lines up row by row.
df['dpt_id'] = depart_tb['id']
df['arr_id'] = arrival_tb['id']
df['airline_id'] = airline_tb['iata']
df['flight_id'] = flight_tb['iata']
# NOTE(review): this is the aircraft's 'iata' field, not its registration --
# confirm that it is the intended key.
df['aircraft_id'] = aircraft_aux['iata']
# Flight record id: flight IATA code followed by the flight date without dashes.
df['id'] = df['flight_id'] + df['flight_date'].str.replace('-', '')

# Step 2 - Data storage


In [35]:
# Create the output folder first so a run on a fresh checkout does not fail on
# the first write.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

# The nested source columns are left out of the main table because they were
# expanded into the dimension tables written below.
# column live was removed because had ~95% NULLs
df.drop(columns=['departure', 'arrival', 'airline', 'flight', 'aircraft', 'live']).to_csv(
    output_dir / 'flights_main.csv', index=False
)
depart_tb.to_csv(output_dir / 'dim_dpt.csv', index=False)
arrival_tb.to_csv(output_dir / 'dim_arr.csv', index=False)
# Only the airline table is de-duplicated before it is written.
airline_tb.drop_duplicates().to_csv(output_dir / 'dim_airline.csv', index=False)
flight_tb.to_csv(output_dir / 'dim_flight.csv', index=False)
aircraft_tb.to_csv(output_dir / 'dim_aircraft.csv', index=False)
