In [15]:
import os
import re
import json
import pandas as pd
from bs4 import BeautifulSoup

'''
The data source is this website: https://www.flightsfrom.com/SFO/departures?airlines=UA&durationFrom=55&durationTo=1080&dateMethod=day&dateFrom=2024-05-31&dateTo=2024-05-31

I was having some issues with scraping so I decided to just copy the source file. Since the source file could only load 50 flights, I did this 5 times and saved the results.
I extracted the data from all 5 source files and removed all duplicates (these are in the dataset because I cuold only set the time filter on the website in 5 minute intervals)
'''

all_flight_data = []

directory = '.'

for filename in os.listdir(directory):
    if filename.startswith("source") and filename.endswith(".html"):
        filepath = os.path.join(directory, filename)
        
        with open(filepath, 'r', encoding='utf-8') as file:
            html_content = file.read()
        
        soup = BeautifulSoup(html_content, 'html.parser')

        script_tags = soup.find_all('script')
        script_content = None

        for script in script_tags:
            if script.string and 'window.shedules' in script.string:
                script_content = script.string
                break

        if script_content:
            print(f"Found the script tag containing window.shedules in file {filename}.")
        else:
            print(f"Could not find the script tag containing window.shedules in file {filename}.")
            continue

        pattern = re.compile(r'window\.shedules\s*=\s*({.*});', re.DOTALL)
        match = pattern.search(script_content)
        if match:
            data = match.group(1)
            schedules = json.loads(data)
        else:
            print(f"Could not extract JSON data from the script tag in file {filename}.")
            continue

        for flight in schedules['result']:
            flight_info = {
                "clean_date": flight["clean_date"],
                "carrier": flight["carrier"],
                "flightnumber": flight["flightnumber"],
                "airport_name": flight["airport"]["name"],
                "airport_city_name": flight["airport"]["city_name"],
                "airport_IATA": flight["airport"]["IATA"],
                "aircraft_name": flight["aircraft"]["name"],
                "aircraft_IATA": flight["aircraft"]["IATA"],
                "departure_time": flight["departure_time"],
                "arrival_time": flight["arrival_time"],
                "elapsed_time": flight["elapsed_time"],
                "codeshare_info": flight.get("codeshare_info", "")
            }
            all_flight_data.append(flight_info)

df = pd.DataFrame(all_flight_data)

df.drop_duplicates(inplace=True)
print(df.info())
df.to_csv('future_flights_data.csv', index=False)


Found the script tag containing window.shedules in file source_1038.html.
Found the script tag containing window.shedules in file source_1825.html.
Found the script tag containing window.shedules in file source_2320.html.
Found the script tag containing window.shedules in file source_1310.html.
Found the script tag containing window.shedules in file source_2400.html.
<class 'pandas.core.frame.DataFrame'>
Index: 203 entries, 0 to 210
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   clean_date         203 non-null    object
 1   carrier            203 non-null    object
 2   flightnumber       203 non-null    object
 3   airport_name       203 non-null    object
 4   airport_city_name  203 non-null    object
 5   airport_IATA       203 non-null    object
 6   aircraft_name      203 non-null    object
 7   aircraft_IATA      203 non-null    object
 8   departure_time     203 non-null    object
 9   arrival_

In [16]:
df

Unnamed: 0,clean_date,carrier,flightnumber,airport_name,airport_city_name,airport_IATA,aircraft_name,aircraft_IATA,departure_time,arrival_time,elapsed_time,codeshare_info
0,2024-05-31,UA,512,George Bush Intcntl Houston,Houston,IAH,Boeing 737MAX 9 Passenger,7M9,00:30,06:26:00,236,NZ 9190 /VA 8082
1,2024-05-31,UA,189,Ninoy Aquino International,Manila,MNL,Boeing 777-300ER,77W,00:45,05:55:00,850,
2,2024-05-31,UA,1139,Chicago Ohare International,Chicago,ORD,Boeing 737-900 Passenger,739,01:10,07:30:00,260,AC 3088 /NZ 9308
3,2024-05-31,UA,274,George Bush Intcntl Houston,Houston,IAH,Boeing 737MAX 9 Passenger,7M9,05:00,11:03:00,243,NZ 9198 /VA 8084
4,2024-05-31,UA,1003,Denver International,Denver,DEN,Boeing 757-300 Passenger,753,05:00,08:37:00,157,AC 4046 /NZ 9057 /VA 8335
...,...,...,...,...,...,...,...,...,...,...,...,...
206,2024-05-31,UA,853,Taoyuan International Airport,Taipei,TPE,Boeing 777,777,23:45,04:05:00,800,
207,2024-05-31,UA,877,Hong Kong International Airport,Hong Kong,HKG,Boeing 777-300ER,77W,23:50,05:00:00,850,
208,2024-05-31,UA,805,Incheon Intl,Seoul,ICN,Boeing 777,777,23:55,04:20:00,745,OZ 6621
209,2024-05-31,UA,505,George Bush Intcntl Houston,Houston,IAH,Boeing 777,777,23:59,05:54:00,235,NZ 9194 /VA 8081
