In [61]:
from google.colab import drive
drive_root = '/content/drive'
drive.mount(drive_root, force_remount=True)

Mounted at /content/drive


In [62]:
import sys
import os

project_root = os.path.join(drive_root, 'MyDrive/Colab Notebooks/cmpe540/final-project')
source_root = os.path.join(project_root, 'src')
sys.path.append(source_root)
data_folder_path = os.path.join(project_root, 'data')
raw_data_folder_path = os.path.join(data_folder_path, 'raw')
processed_data_folder_path = os.path.join(data_folder_path, 'processed')
# output_file_path = os.path.join(processed_data_folder_path, "train_data.csv")

In [63]:
import pandas as pd
input_file_path = os.path.join(processed_data_folder_path, "new_filtered_flight.csv")
flight = pd.read_csv(input_file_path)

In [64]:
# Filter for busiest route
flight_filtered = flight.copy()
flight_filtered = flight_filtered[flight_filtered['route'] == 'ATL-LAX']
flight_filtered = flight_filtered.drop(['route'], axis=1)

# Change date columns to datetime
for column in ['searchDate', 'flightDate']:
  flight_filtered[column] = pd.to_datetime(flight_filtered[column], format='%Y-%m-%d')

In [65]:
# Calculate Number (of) Days (to) Operation = NDO
flight_filtered['ndo'] = flight_filtered['flightDate'] - flight_filtered['searchDate']
flight_filtered = flight_filtered.drop(['searchDate'], axis=1)

In [66]:
# # Remove rows which have ndo over 30, out-of-scope of this study
# flight_filtered = flight_filtered[flight_filtered['ndo'] <= pd.Timedelta(days=30)]

In [67]:
# Arrival time creation
flight_filtered['arrivalEpoch'] = flight_filtered['segmentsArrivalTimeEpochSeconds'].apply(
    lambda x: int(x.split('||')[0])  # Convert to int after splitting
)
flight_filtered = flight_filtered.drop(['segmentsArrivalTimeEpochSeconds'], axis=1)

# Convert epoch to datetime
flight_filtered['arrivalDatetime'] = pd.to_datetime(flight_filtered['arrivalEpoch'], unit='s')
flight_filtered = flight_filtered.drop(['arrivalEpoch'], axis=1)

In [68]:
# Extract time (0.00, 24.00):24h from full date
flight_filtered['arrivalTimeFloat'] = flight_filtered['arrivalDatetime'].dt.hour + (flight_filtered['arrivalDatetime'].dt.minute / 60)
flight_filtered['arrivalTimeFloat'] = flight_filtered['arrivalTimeFloat'].round(2)
flight_filtered = flight_filtered.drop(['arrivalDatetime'], axis=1)

In [69]:
# Departure
flight_filtered['departureEpoch'] = flight_filtered['segmentsDepartureTimeEpochSeconds'].apply(
    lambda x: int(x.split('||')[0])  # Convert to int after splitting
)
flight_filtered = flight_filtered.drop(['segmentsDepartureTimeEpochSeconds'], axis=1)

# Convert epoch to datetime
flight_filtered['departureDatetime'] = pd.to_datetime(flight_filtered['departureEpoch'], unit='s')
flight_filtered = flight_filtered.drop(['departureEpoch'], axis=1)

# Extract "day-of-week" (1 for Monday, 7 for Sunday)
flight_filtered['dayOfWeek'] = flight_filtered['departureDatetime'].dt.isocalendar().day

# Extract "departure-time" as hours (0.00 to 24.00 format)
flight_filtered['departureTimeFloat'] = flight_filtered['departureDatetime'].dt.hour + (flight_filtered['departureDatetime'].dt.minute / 60)
flight_filtered['departureTimeFloat'] = flight_filtered['departureTimeFloat'].round(2)

# Remove rows where (possibly actual) departureDatetime does not match flightDate
# This is a workaround since this was the only available timestamped data, but for
# the scheduled times, it does not contain timestamp information
flight_filtered['temp_flightDate'] = flight_filtered['departureDatetime'].dt.date
flight_filtered[flight_filtered['temp_flightDate'] != flight_filtered['flightDate']]
flight_filtered = flight_filtered.drop(['departureDatetime', 'temp_flightDate'], axis=1)

In [70]:
# Create 'Departure date' feature: A certain date in the days investigated
flight_filtered = flight_filtered.sort_values(by=['flightDate', 'departureTimeFloat'], ascending=[True, True])

flight_filtered['NDepartureDate'] = (
    flight_filtered['flightDate']
    .rank(method='dense', ascending=True)
    .astype(int)
)

In [71]:
# P_i_j = P_flightDate_ndo
# Aggregate average fare for a given departure date
lut = flight_filtered.groupby(['NDepartureDate', 'ndo'])['baseFare'].mean().reset_index()
lut['baseFare'] = lut['baseFare'].round(2)

# Convert ndo to int for easy coordinate-based (i,j) access
lut['ndo'] = lut['ndo'].dt.days

In [72]:
# Pivot to create lut_matrix
lut_matrix = lut.pivot(
    index='NDepartureDate',  # Use NDepartureDate as the row index
    columns='ndo',           # Use ndo as the columns
    values='baseFare'        # Use baseFare for values
)

In [73]:
# Find the days we have ndo30 - ndo1 data of
unique_ndo_per_date = flight_filtered.groupby('flightDate')['ndo'].nunique().reset_index()
unique_ndo_per_date.rename(columns={'ndo': 'uniqueNdoValues'}, inplace=True)
eligible_dates = unique_ndo_per_date[unique_ndo_per_date['uniqueNdoValues'] >= 30]['flightDate'].tolist()
flight_filtered = flight_filtered[flight_filtered['flightDate'].isin(eligible_dates)]

In [74]:
import numpy as np

# Function to compute b1 to b30 for a given i and j
def compute_b_values(i, j, lut_matrix, max_b=30):
    b_values = []
    if i - 1 in lut_matrix.index:  # Check if previous row exists
        row_prev = lut_matrix.loc[i - 1]
        b_values += row_prev.iloc[max(0, j - max_b):j].tolist()

    if len(b_values) < max_b and i in lut_matrix.index:  # Fill remaining from current row
        row_curr = lut_matrix.loc[i]
        b_values += row_curr.iloc[:max_b - len(b_values)].tolist()

    # Pad with NaN if not enough values
    return b_values[:max_b] + [np.nan] * (max_b - len(b_values))

In [75]:
flight_filtered = flight_filtered[(flight_filtered['departureTimeFloat'] > 18.99) & (flight_filtered['departureTimeFloat'] < 21.00)]

# Add b1 to b30 as new columns in `flight_filtered`
flight_filtered['ndo'] = flight_filtered['ndo'].dt.days

# i : Departure date, j: ndo
for idx, row in flight_filtered.iterrows():
    i = row['NDepartureDate']
    j = row['ndo']
    b_values = compute_b_values(i, j, lut_matrix)
    for b_idx, b_val in enumerate(b_values, start=1):
        flight_filtered.at[idx, f'b{b_idx}'] = b_val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flight_filtered['ndo'] = flight_filtered['ndo'].dt.days


In [55]:
# flight_filtered = flight_filtered[~flight_filtered.isna().any(axis=1)]

In [76]:
# 1. Departure Date Time Series
departure_date_series = flight_filtered.groupby('flightDate')['baseFare'].mean() # Or sum, median, etc. depending on what you want to aggregate
departure_date_series = departure_date_series.resample('D').asfreq() # Important: Resample to daily frequency to have consistent time steps. Fills missing dates with NaN
departure_date_series = departure_date_series.ffill() # Forward fill missing values.
departure_date_series = departure_date_series.bfill() # Back fill missing values.

In [77]:
# 2. Days-to-Departure Time Series
# Sort by ndo (days to departure) in descending order (far to near) WITHIN each flight date
flight_filtered = flight_filtered.sort_values(by=['flightDate', 'ndo'], ascending=[True, False])

In [83]:
# **Handle duplicates within each flight date:**
flight_filtered = flight_filtered.groupby(['flightDate', 'ndo']).agg({
  'baseFare': 'mean'
}).reset_index()

In [84]:
# Create a list to hold all the individual days-to-departure series
days_to_departure_series_list = []

In [85]:
for date, group in flight_filtered.groupby('flightDate'):
    days_to_departure_series = group.set_index('ndo')['baseFare']
    # Pad the series to a consistent length. This is crucial for feeding into neural networks.
    max_ndo = flight_filtered['ndo'].max()
    padded_series = days_to_departure_series.reindex(np.arange(max_ndo, -1, -1), fill_value=np.nan) # Pad from max ndo to 0
    padded_series = padded_series.ffill() # Forward fill missing values.
    padded_series = padded_series.bfill() # Back fill missing values.
    days_to_departure_series_list.append(padded_series)

In [86]:
# Convert the list of series to a NumPy array for easier use in neural networks
days_to_departure_matrix = np.array(days_to_departure_series_list)

In [88]:
print("Departure Date Time Series:\n", departure_date_series)
print("\nDays-to-Departure Matrix (each row is a flight):\n", days_to_departure_matrix)

#Shape of the data
print("Shape of Departure Date Time Series:\n", departure_date_series.shape)
print("Shape of Days to Departure Matrix:\n", days_to_departure_matrix.shape)

Departure Date Time Series:
 flightDate
2022-05-19    432.635752
2022-05-20    441.740099
2022-05-21    381.004267
2022-05-22    501.336250
2022-05-23    458.435789
                 ...    
2022-10-18    289.848596
2022-10-19    284.846225
2022-10-20    252.153653
2022-10-21    268.564344
2022-10-22    295.425493
Freq: D, Name: baseFare, Length: 157, dtype: float64

Days-to-Departure Matrix (each row is a flight):
 [[362.976      362.976      362.976      ... 513.768      519.07
  519.07      ]
 [335.195      335.195      335.195      ... 582.725      622.976
  622.976     ]
 [324.03333333 324.03333333 324.03333333 ... 447.27333333 419.19636364
  419.19636364]
 ...
 [174.53       174.53       174.53       ... 292.25       292.25
  292.25      ]
 [224.84       224.84       224.84       ... 313.01461538 313.01461538
  313.01461538]
 [294.58333333 294.58333333 294.58333333 ... 230.465      230.465
  230.465     ]]
Shape of Departure Date Time Series:
 (157,)
Shape of Days to Departure Mat

In [89]:
# Directory to save the data
output_dir = os.path.join(processed_data_folder_path, "model_data")
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# 1. Save Departure Date Time Series
departure_date_series.to_csv(os.path.join(output_dir, "departure_date_series.csv"), header=True)

# 2. Save Days-to-Departure Matrix
np.save(os.path.join(output_dir, "days_to_departure_matrix.npy"), days_to_departure_matrix)

print(f"Data saved to {output_dir}")

Data saved to /content/drive/MyDrive/Colab Notebooks/cmpe540/final-project/data/processed/model_data
