In [1]:
from google.colab import drive
drive_root = '/content/drive'
drive.mount(drive_root)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
import os

project_root = os.path.join(drive_root, 'MyDrive/Colab Notebooks/cmpe540/final-project')
source_root = os.path.join(project_root, 'src')
sys.path.append(source_root)
data_folder_path = os.path.join(project_root, 'data')
raw_data_folder_path = os.path.join(data_folder_path, 'raw')
processed_data_folder_path = os.path.join(data_folder_path, 'processed')

In [3]:
zip_file_path = raw_data_folder_path + "flightprices.zip"
# !curl -L -o "{zip_file_path}" https://www.kaggle.com/api/v1/datasets/download/dilwong/flightprices

In [10]:
# import zipfile

# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extractall(processed_data_folder_path)

In [4]:
import pandas as pd

# Define flight data path
flight_data_path = os.path.join(processed_data_folder_path, 'itineraries.csv')


columns_to_keep = [
    "legId",
    "searchDate",
    "flightDate",
    "startingAirport",
    "destinationAirport",
    "isBasicEconomy",
    "isNonStop",
    "baseFare",
    "segmentsAirlineName"
]

# Specify dtypes for the columns being loaded
dtypes = {
    "legId": "object",  # String-like unique identifier
    "searchDate": "object",  # Dates can be parsed later as datetime
    "flightDate": "object",  # Dates can be parsed later as datetime
    "startingAirport": "category",  # Limited unique values
    "destinationAirport": "category",  # Limited unique values
    "isBasicEconomy": "bool",  # Boolean flag
    "isNonStop": "bool",  # Boolean flag
    "baseFare": "float32",  # Currency, floating point
    "segmentsAirlineName": "category"  # Limited unique values
}

In [5]:
from tqdm import tqdm

# Initialize an empty list to store chunks
chunks = []
chunk_size = 10**6  # Number of rows per chunk

# Read the CSV in chunks with a progress bar
with tqdm(total=os.path.getsize(flight_data_path), desc="Reading CSV") as pbar:
    for chunk in pd.read_csv(flight_data_path, usecols=columns_to_keep, dtype=dtypes, chunksize=chunk_size):
        chunks.append(chunk)
        pbar.update(chunk.memory_usage(deep=True).sum())

# Combine all chunks into a single DataFrame
flight = pd.concat(chunks, ignore_index=True)

Reading CSV:  61%|██████▏   | 19057283177/31091834438 [05:14<03:18, 60563696.21it/s]


In [6]:
# Get origin - destination pairs with most data points
top_pairs = (
    flight.groupby(['startingAirport', 'destinationAirport'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .head(5)
)

  flight.groupby(['startingAirport', 'destinationAirport'])


In [7]:
origin_ports = top_pairs['startingAirport'].tolist()
destination_ports = top_pairs['destinationAirport'].tolist()

# Initial filter to reduce size of dataframe
filtered_flight = flight[flight['startingAirport'].isin(origin_ports) & flight['destinationAirport'].isin(destination_ports)]

In [9]:
top_pairs['route'] = top_pairs['startingAirport'].astype(str) + '-' + top_pairs['destinationAirport'].astype(str)

In [11]:
top_routes = top_pairs['route'].tolist()

In [13]:
filtered_flight['route'] = filtered_flight['startingAirport'].astype(str) + '-' + filtered_flight['destinationAirport'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_flight['route'] = filtered_flight['startingAirport'].astype(str) + '-' + filtered_flight['destinationAirport'].astype(str)


In [15]:
filtered_flight = filtered_flight[filtered_flight['route'].isin(top_routes)]

In [19]:
# Define the output file path
output_file_path = os.path.join(processed_data_folder_path, "filtered_flight.csv")

In [21]:
# Write the filtered DataFrame to a CSV file
filtered_flight.to_csv(output_file_path, index=False)

print(f"Filtered data has been saved to: {output_file_path}")

Filtered data has been saved to: /content/drive/MyDrive/Colab Notebooks/cmpe540/final-project/data/processed/filtered_flight.csv
