In [5]:
import pandas as pd

# Load the dataset
df = pd.read_csv('./flights.csv')

# Drop rows where sched_dep_time or sched_arr_time is missing
df = df.dropna(subset=['sched_dep_time', 'sched_arr_time'])

# Ensure times are integers
df['sched_dep_time'] = df['sched_dep_time'].astype(int)
df['sched_arr_time'] = df['sched_arr_time'].astype(int)

# Number of days in each month (for 2013)
days_in_month = {
    1: 31,
    2: 28,
    3: 31,
    4: 30,
    5: 31,
    6: 30,
    7: 31,
    8: 31,
    9: 30,
    10: 31,
    11: 30,
    12: 31
}

# Precompute cumulative days for faster processing
cumulative_days = {1: 0}
for month in range(2, 13):
    cumulative_days[month] = cumulative_days[month - 1] + days_in_month[month - 1]

# Function to adjust time based on cumulative hours
def adjust_to_global_time(row):
    # Calculate the total number of days elapsed before this date
    base_days = cumulative_days[row['month']] + (row['day'] - 1)
    base_hours = base_days * 24

    # Extract hours from sched_dep_time and sched_arr_time (ignore minutes)
    dep_hour = row['sched_dep_time'] // 100
    arr_hour = row['sched_arr_time'] // 100

    # Add base_hours to the hour component
    dep_global_hour = base_hours + dep_hour
    arr_global_hour = base_hours + arr_hour

    # Recalculate adjusted times in hours-only format
    row['dep_time'] = dep_global_hour
    row['arr_time'] = arr_global_hour

    return row

# Apply the adjustment function to the dataset
df = df.apply(adjust_to_global_time, axis=1)

# Select final columns
processed_df = df[['month', 'day', 'origin', 'dest', 'dep_time', 'arr_time']]
processed_df.rename(columns={'origin': 'depart_airport', 'dest': 'arrival_airport'}, inplace=True)

# Display the processed data before saving to CSV
print("Processed Data:")
print(processed_df.head(50))

# Save the processed dataset
processed_df.to_csv('./processed_flights.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df.rename(columns={'origin': 'depart_airport', 'dest': 'arrival_airport'}, inplace=True)


Processed Data:
    month  day depart_airport arrival_airport  dep_time  arr_time
0       1    1            EWR             IAH         5         8
1       1    1            LGA             IAH         5         8
2       1    1            JFK             MIA         5         8
3       1    1            JFK             BQN         5        10
4       1    1            LGA             ATL         6         8
5       1    1            EWR             ORD         5         7
6       1    1            EWR             FLL         6         8
7       1    1            LGA             IAD         6         7
8       1    1            JFK             MCO         6         8
9       1    1            LGA             ORD         6         7
10      1    1            JFK             PBI         6         8
11      1    1            JFK             TPA         6         8
12      1    1            JFK             LAX         6         9
13      1    1            EWR             SFO         6     

In [6]:
import pandas as pd
import os

# Load the processed flights data
processed_df = pd.read_csv('./processed_flights.csv')

# Step 1: Filter for January data
january_df = processed_df[processed_df['month'] == 1]

# Ensure necessary columns are present
january_df = january_df[['month', 'day', 'depart_airport', 'arrival_airport', 'dep_time', 'arr_time']]

# Step 2: Save January data to a "january" folder
os.makedirs('./january', exist_ok=True)
january_df.to_csv('./january/january_flights.csv', index=False)

# Print completion message
print("January CSV saved to './january/january_flights.csv'")


January CSV saved to './january/january_flights.csv'


In [8]:
import pandas as pd

# Load the January flights data
january_df = pd.read_csv('./january/january_flights.csv')

# Ensure times are integers
january_df['dep_time'] = january_df['dep_time'].astype(int)
january_df['arr_time'] = january_df['arr_time'].astype(int)

# Step 1: Calculate the earliest departure time
earliest_hour = january_df['dep_time'].min()

# Step 2: Count flights for the earliest hour
flight_count = (january_df['dep_time'] == earliest_hour).sum()

# Step 3: Estimate starting planes (using a multiplier factor, e.g., 1.5)
plane_multiplier = 1.5
estimated_planes = int(flight_count * plane_multiplier)

# Step 4: Process airports and flights
# List of unique airports
airports = sorted(set(january_df['depart_airport']).union(set(january_df['arrival_airport'])))

# Format flights for output
flights_info = []
for _, row in january_df.iterrows():
    flight_info = f"{row['depart_airport']} {row['arrival_airport']} {row['dep_time']} {row['arr_time']}"
    flights_info.append(flight_info)

# Step 5: Output to a TXT file
output_file = './january/january_flights_info.txt'
with open(output_file, 'w') as f:
    # Write the first line: # planes, # airports
    f.write(f"{estimated_planes} {len(airports)}\n")

    # Write each flight info
    for flight in flights_info:
        f.write(f"{flight}\n")

# Print completion message
print(f"Processed data saved to: {output_file}")


Processed data saved to: ./january/january_flights_info.txt
