# 1. Loading extracted data

In [None]:
from pydantic import BaseModel
from enum import Enum
from openai import OpenAI
from datetime import date
from keys import *
import pandas as pd

# Structured output based on the text

# Flight categories available to the structured output: a scheduled
# passenger flight versus everything else.
class FlightType(str, Enum):
    PASSENGER = "passenger"
    OTHER = "other"

    # Render a member as its plain string value rather than "FlightType.X".
    def __str__(self) -> str:
        return self._value_

# Event severity labels available to the structured output.
class EventLabel(str, Enum):
    CRASH = "crash"
    ACCIDENT = "accident"
    INCIDENT = "incident"

    # Render a member as its plain string value rather than "EventLabel.X".
    def __str__(self) -> str:
        return self._value_

# Cause categories available to the structured output; OTHER is the catch-all.
class CauseLabel(str, Enum):
    PILOT = "pilot"
    CREW = "crew"
    TECHNICAL_FAILURE = "technical failure"
    WEATHER_CONDITION = "weather condition"
    MISCOMMUNICATION = "miscommunication"
    BIRD_STRIKE = "bird strike"
    OTHER = "other"

    # Render a member as its plain string value rather than "CauseLabel.X".
    def __str__(self) -> str:
        return self._value_

# Schema of the structured answer requested for each article; this class is
# what gets passed as `response_format` to the chat-completion parse call.
# Field names and their order are part of that schema, so keep them stable.
class IncidentDetails(BaseModel):
    # --- classification of the flight and the event ---
    flight_type: FlightType
    passenger_probability_pct: int
    date: str
    type_of_event: EventLabel

    # --- carrier and airframe ---
    operating_carrier: str
    marketing_carrier: str
    registration: str
    plane_manufacturer: str
    plane_model: str
    year_of_manufacture: str

    # --- departure ---
    from_airport: str
    from_airport_code: str
    from_city: str
    from_country: str

    # --- destination ---
    to_airport: str
    to_airport_code: str
    to_city: str
    to_country: str

    # --- where the event happened ---
    incident_location: str

    # --- people on board and casualties ---
    onboard_passengers: int
    onboard_crews: int
    fatalities: int
    injuries_critical: int
    injuries_minor: int
    injuries_total: int

    # --- cause assessment ---
    cause: CauseLabel
    cause_probability_pct: int
    cause_detailed: str

# System prompt sent with every article. Each guideline line is keyed by the
# IncidentDetails field it describes, so the keys must match the field names of
# that class exactly (from_country, to_country, incident_location).
message_content = """You are analyzing articles on aviation event based on the given criteria. Follow the below guideline to fill in IncidentDetails output

    flight_type : categorize whether the event involves scheduled passenger flight
    passenger_probability_pct : probability of the flight being the passenger flight, in percentages
    date : date of the event in YYYY-MM-DD format
    type_of_event : use ICAO's definition for distinguishing between incident and accident : Incident is 'an occurrence, other than an accident, associated with the operation of an aircraft, which affects or could affect the safety of operation.', and accident is 'an occurrence associated with the operation of an aircraft in which a person is fatally or seriously injured, or the aircraft sustains damage or structural failure, or the aircraft is missing or completely inaccessible.'
    operating_carrier & marketing_carrier: when operating carrier is different from marketing carrier that sells tickets. Look for keywords 'operated by' or 'operated for'. Otherwise, operating_carrier and marketing_carrier are same
    registration : registration code unique to the aircraft, if available
    plane_manufacturer : name of the plane's manufactuer, if available
    plane_model : model of the plane, if available
    year_of_manufacture : manufactured year of the plane, if available
    from_airport : official name of the departure airport, ending with "airport"
    from_airport_code : three letter code of the departure airport
    from_city : city of the departure airport
    from_country : alpha-3 code of country of the departure airport
    to_airport : official name of the destination airport, ending with "airport"
    to_airport_code : three letter code of the destination airport
    to_city : city of the destination airport
    to_country : alpha-3 code of country of the destination airport
    incident_location : location of the event, if applicable
    onboard_passengers : number of onboard passengers, if available
    onboard_crews : number of onboard crews, if available
    fatalities : number of deaths, if available
    injuries_critical : number of critical injuries, if available
    injuries_minor : number of minor injuries, if available
    injuries_total : injuries_critical + injuries_minor
    cause : likely cause of the event, from the given information
    cause_probability_pct : likelihood of the stated cause in percentages
    cause_detailed : detailed cause of the event in one to two sentences"""


In [46]:
import os
from bs4 import BeautifulSoup, Comment
import time

# Client for the OpenAI API; ACTIVE_KEY is expected to come from the `keys` star-import.
client = OpenAI(api_key = ACTIVE_KEY)

# One single-row DataFrame per article that yielded a structured answer;
# concatenated into the final table in a later cell.
dfs = []

# Seconds spent waiting on API responses, summed over every request.
total_processing_time = 0

for y in range(2021,2026):
    # Articles are read from ./articles/<year>/ relative to the working directory.
    folder_path = os.path.join(os.getcwd(),'articles',str(y))
    article_list = os.listdir(folder_path)
    for a in article_list:
        article_path = os.path.join(folder_path,a)
        with open (article_path,'r', encoding='latin1') as file :
            article = file.read()
        # Parse after the file is closed; only the text is needed from here on.
        soup = BeautifulSoup(article,'html5lib')

        # Remove unnecessary tags to reduce token input
        for tag in soup(['script', 'meta']):
            tag.decompose()
        # Remove all comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Time only the API round trip, since that is what gets reported later.
        start = time.perf_counter()
        # Connect to OpenAI API and send the input
        response = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        temperature = 0,
        store=True,
        messages=[
            {"role" : "system", "content": message_content},
            {"role" : 'user', "content" : str(soup)}
        ],
        response_format = IncidentDetails,
        )
        end = time.perf_counter()

        total_processing_time += (end - start)

        # `parsed` is None when the model refuses, so there is nothing to
        # tabulate for this article; report it and move on instead of failing
        # with an unrelated KeyError inside set_index.
        message = response.choices[0].message
        if message.parsed is None:
            print(f"Skipped {article_path} : no structured output returned ({message.refusal})")
            continue

        # Iterating the parsed model yields (field name, value) pairs, giving a
        # two-column frame; indexing by the names and transposing turns it into
        # a single row whose columns are the field names.
        gpt_output = pd.DataFrame(message.parsed).set_index(0).T

        dfs.append(gpt_output)
    print(f"Completed event categorization from {y}")

Completed event categorization from 2021
Completed event categorization from 2022
Completed event categorization from 2023
Completed event categorization from 2024
Completed event categorization from 2025


In [None]:
# Stack the per-article single-row frames into one table (row labels are kept
# as they were built), then display it as the cell output.
df = pd.concat(objs=dfs)
df

Unnamed: 0,flight_type,passenger_probability_pct,date,type_of_event,operating_carrier,marketing_carrier,registration,plane_manufacturer,plane_model,year_of_manufacture,from_airport,from_airport_code,from_city,from_country,to_airport,to_airport_code,to_city,to_country,incident_location,onboard_passengers,onboard_crews,fatalities,injuries_critical,injuries_minor,injuries_total,cause,cause_probability_pct,cause_detailed
1,other,0,2021-03-20,accident,Trigana Air Service,Trigana Air Service,PK-YSF,Boeing,737-4Y0,1988,Jakarta-Halim Perdana Kusuma Airport,HLP,Jakarta,IDN,Makassar-Sultan Hasanuddin International Airport,UPG,Makassar,IDN,Jakarta-Halim Perdana Kusuma Airport,4,0,0,0,0,0,technical failure,80,The aircraft reported a failure of engine no.2...
1,other,0,2021-03-23,accident,SprintAir,SprintAir,SP-KPU,Saab,340AF,1989,,,,,,,,,Gdansk-Lech Walesa Airport,0,0,0,0,0,0,other,100,The accident occurred when the pilot of a Vulc...
1,other,0,2021-07-04,accident,ELISA,ELISA,UP-A0135,Antonov,An-2R,1973,,,,,,,,,"12 km from Karaultobe, Kyzylorda",2,0,0,0,0,0,technical failure,80,The aircraft experienced an engine malfunction...
1,other,0,2021-03-26,accident,West Wind Aviation Ltd.,West Wind Aviation Ltd.,5Y-NJS,Beechcraft,B200 Super King Air,1981,Eldoret Airport,HKEL,Eldoret,KEN,Nairobi-Wilson Airport,HKNW,Nairobi,KEN,"Ngong Racecourse, Nairobi",2,1,0,0,0,0,technical failure,80,The aircraft encountered severe icing conditio...
1,other,0,2021-10-05,accident,Sierra West Airlines,Sierra West Airlines,N283SA,Dassault,Falcon 20C,1967,Lubbock Preston Smith International Airport,LBB,Lubbock,USA,Thomson-McDuffie County Airport,KHQU,Thomson,USA,"1.3 km W of Thomson-McDuffie County Airport, GA",2,0,2,0,0,0,pilot,90,The accident was caused by the flight crew's c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,passenger,100,2025-02-05,accident,Delta Air Lines,Delta Air Lines,N3737C,Boeing,737-832 (WL),2000,Seattle/Tacoma International Airport,SEA,Seattle,USA,Puerto Vallarta-Gustavo D. Ordaz Airport,PVR,Puerto Vallarta,MEX,"Seattle/Tacoma International Airport, WA",0,0,0,0,0,0,other,80,The incident involved a ground contact where t...
1,passenger,100,2025-02-01,accident,Air Wisconsin,American Eagle,N420AW,Bombardier,CRJ-200LR,2002,Kalamazoo-Battle Creek International Airport,AZO,Kalamazoo,USA,Chicago O'Hare International Airport,ORD,Chicago,USA,Chicago O'Hare International Airport,0,0,0,1,0,1,other,90,The incident was caused by a tug colliding wit...
1,other,0,2025-01-09,accident,Private,Private,PR-GFS,Cessna,525 CitationJet CJ1+,2008,Mineiros Airport,SWME,Mineiros,BRA,Ubatuba Airport,UBT,Ubatuba,BRA,"Ubatuba Airport, SP",5,0,1,0,3,3,other,100,The aircraft overshot the runway during landin...
1,passenger,100,2025-01-07,accident,All Nippon Airways - ANA,All Nippon Airways - ANA,JA892A,Boeing,787-9 Dreamliner,2017,Tokyo International Airport/Haneda,HND,Tokyo,JPN,Los Angeles International Airport,LAX,Los Angeles,USA,100 km northwest of Los Angeles International ...,196,0,0,1,4,5,other,80,The event was caused by turbulence encountered...


In [52]:
# Report the accumulated request time: total in minutes and the average in
# seconds per row of the combined table.
minutes_total = round(total_processing_time / 60,1)
seconds_per_document = round(total_processing_time/df.shape[0],1)
print(f"Total processing time for Chat GPT API usage : {minutes_total} min, or {seconds_per_document} seconds per document")

Total processing time for Chat GPT API usage : 77.8 min, or 5.3 seconds per document


In [54]:
# Persist the combined table (index included) in the working directory.
df.to_csv(path_or_buf='aviation_events.csv')

# Filtering scheduled passenger flights