# Query and Join Data
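This notebook demonstrates how to query the flight models with SQLAlchemy and join them into one flat table that resembles an imported CSV file. Values that are missing in the database will be represented as null values (`None`/`NaN`/`NaT`, depending on the column type).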

In [None]:
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, aliased

from models import Airline, Airport, Airplane, Flight, FlightDetails

# Database connection: the engine points at the SQLite file `flights.db`
# (a relative path, resolved against the current working directory).
engine = create_engine("sqlite:///flights.db")

# `Session` is the session factory; binding it to the engine means every
# session it produces talks to that database.
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()

In [None]:
# Airport is joined twice - once as the departure airport and once as the
# arrival airport - so each role gets its own alias.
DepartureAirport = aliased(Airport)
ArrivalAirport = aliased(Airport)

# Build one flat row per flight. Every join is an outer join starting from
# Flight, so a flight is still returned when a related row is absent; the
# columns of the absent row then come back as NULL.
query = (
    session.query(
        Flight.id.label("flight_id"),
        Flight.status,
        FlightDetails.flight_number,
        FlightDetails.call_sign,
        Airline.airline_name,
        Airline.airline_iata,
        Airline.airline_icao,
        Airplane.aircraft_model,
        Airplane.aircraft_reg,
        DepartureAirport.airport_name.label("departure_airport_name"),
        DepartureAirport.airport_iata.label("departure_airport_iata"),
        DepartureAirport.airport_icao.label("departure_airport_icao"),
        DepartureAirport.timezone.label("departure_timezone"),
        ArrivalAirport.airport_name.label("arrival_airport_name"),
        ArrivalAirport.airport_iata.label("arrival_airport_iata"),
        ArrivalAirport.airport_icao.label("arrival_airport_icao"),
        ArrivalAirport.timezone.label("arrival_timezone"),
        Flight.dep_date_time_UTC.label("departure_datetime"),
        Flight.dep_rev_date_time_UTC.label("departure_datetime_revised"),
        Flight.arr_date_time_UTC.label("arrival_datetime"),
        Flight.arr_rev_date_time_UTC.label("arrival_datetime_revised"),
    )
    .outerjoin(FlightDetails, Flight.flight_details_id == FlightDetails.id)
    # NOTE(review): "arline_id" reads like a misspelling of "airline_id". It is
    # left untouched because it has to match the attribute declared on the
    # Flight model - confirm against the models module before renaming.
    .outerjoin(Airline, Flight.arline_id == Airline.id)
    .outerjoin(Airplane, Flight.airplane_id == Airplane.id)
    .outerjoin(DepartureAirport, Flight.dep_airport_id == DepartureAirport.id)
    .outerjoin(ArrivalAirport, Flight.arr_airport_id == ArrivalAirport.id)
)

# Convert the query result to a pandas DataFrame. The column names are read
# from the query itself (the label where one was given, otherwise the
# attribute name) instead of a second hand-maintained list: pandas assigns
# names by position, so a list that drifts out of step with the query would
# silently mislabel columns.
joined_data = pd.DataFrame(
    query.all(),
    columns=[column["name"] for column in query.column_descriptions],
)

In [None]:
# Preview the first five rows of the joined result.
joined_data.head(n=5)

In [None]:
# Columns carried over into the cleaned table, in output order. The list is
# assembled from small groups so each part of the record is easy to spot.
columns_to_keep = (
    # Flight identity and status
    ["flight_number", "call_sign", "status"]
    # Operating airline
    + ["airline_name", "airline_iata", "airline_icao"]
    # Aircraft
    + ["aircraft_reg", "aircraft_model"]
    # Departure airport and times
    + [
        "departure_airport_name",
        "departure_airport_iata",
        "departure_airport_icao",
        "departure_timezone",
        "departure_datetime",
        "departure_datetime_revised",
    ]
    # Arrival airport and times
    + [
        "arrival_airport_name",
        "arrival_airport_iata",
        "arrival_airport_icao",
        "arrival_timezone",
        "arrival_datetime",
        "arrival_datetime_revised",
    ]
)

# Keep only the selected columns, in the order given by `columns_to_keep`.
# The explicit copy makes `cleaned_data` an independent frame, so any later
# column assignment on it does not raise SettingWithCopyWarning.
cleaned_data = joined_data[columns_to_keep].copy()

In [None]:
# Preview the first five rows of the cleaned table.
cleaned_data.head(n=5)

In [None]:
# Number of missing values in each column of the cleaned table.
cleaned_data.isna().sum()

In [None]:
# Write the cleaned table to CSV without the DataFrame index. pandas does not
# create missing directories, so the destination folder is created first;
# this keeps the notebook runnable top to bottom on a fresh checkout.
output_path = Path("csv_retrieved/cleaned_flight_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
cleaned_data.to_csv(output_path, index=False)