# Read Data

In [1]:
import pandas as pd

# Load the CSV file
file_path = "flights_sample_3m.csv"  # Update this to the actual path if needed
df = pd.read_csv(file_path)

# Display the column names
print("Columns in the dataset:")
print(df.columns)

Columns in the dataset:
Index(['FL_DATE', 'AIRLINE', 'AIRLINE_DOT', 'AIRLINE_CODE', 'DOT_CODE',
       'FL_NUMBER', 'ORIGIN', 'ORIGIN_CITY', 'DEST', 'DEST_CITY',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'DELAY_DUE_CARRIER',
       'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY',
       'DELAY_DUE_LATE_AIRCRAFT'],
      dtype='object')


In [3]:
df.head()

Unnamed: 0,FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,...,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
0,2019-01-09,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,1562,FLL,"Fort Lauderdale, FL",EWR,"Newark, NJ",...,0.0,186.0,176.0,153.0,1065.0,,,,,
1,2022-11-19,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,1149,MSP,"Minneapolis, MN",SEA,"Seattle, WA",...,0.0,235.0,236.0,189.0,1399.0,,,,,
2,2022-07-22,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,459,DEN,"Denver, CO",MSP,"Minneapolis, MN",...,0.0,118.0,112.0,87.0,680.0,,,,,
3,2023-03-06,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,2295,MSP,"Minneapolis, MN",SFO,"San Francisco, CA",...,0.0,260.0,285.0,249.0,1589.0,0.0,0.0,24.0,0.0,0.0
4,2020-02-23,Spirit Air Lines,Spirit Air Lines: NK,NK,20416,407,MCO,"Orlando, FL",DFW,"Dallas/Fort Worth, TX",...,0.0,181.0,182.0,153.0,985.0,,,,,


# Data Dictionary

In [2]:
'''

1. **FL_DATE**: Flight date, showing when the flight was scheduled or took place.
2. **AIRLINE**: The name of the airline operating the flight.
3. **AIRLINE_DOT**: A more descriptive name of the airline as recognized by the Department of Transportation (DOT).
4. **AIRLINE_CODE**: The IATA code (two-letter code) for the airline.
5. **DOT_CODE**: The unique Department of Transportation (DOT) code for the airline.
6. **FL_NUMBER**: Flight number assigned to this particular flight by the airline.
7. **ORIGIN**: The IATA code for the origin (departure) airport.
8. **ORIGIN_CITY**: The city where the origin (departure) airport is located.
9. **DEST**: The IATA code for the destination (arrival) airport.
10. **DEST_CITY**: The city where the destination (arrival) airport is located.
11. **CRS_DEP_TIME**: Scheduled departure time (in local time).
12. **DEP_TIME**: Actual departure time (in local time).
13. **DEP_DELAY**: Departure delay in minutes. Negative values indicate early departures.
14. **TAXI_OUT**: Time (in minutes) spent taxiing from the gate to the runway before takeoff.
15. **WHEELS_OFF**: The time (in local time) when the aircraft actually took off.
16. **WHEELS_ON**: The time (in local time) when the aircraft touched down at the destination airport.
17. **TAXI_IN**: Time (in minutes) spent taxiing from the runway to the gate after landing.
18. **CRS_ARR_TIME**: Scheduled arrival time (in local time).
19. **ARR_TIME**: Actual arrival time (in local time).
20. **ARR_DELAY**: Arrival delay in minutes. Negative values indicate early arrivals.
21. **CANCELLED**: Indicates if the flight was canceled (1 if canceled, 0 if not).
22. **CANCELLATION_CODE**: The reason for cancellation, if applicable.
    - A: Carrier
    - B: Weather
    - C: National Air System
    - D: Security
23. **DIVERTED**: Indicates if the flight was diverted to a different airport (1 if diverted, 0 if not).
24. **CRS_ELAPSED_TIME**: Scheduled duration of the flight, from departure to arrival (in minutes).
25. **ELAPSED_TIME**: Actual duration of the flight, from departure to arrival (in minutes).
26. **AIR_TIME**: Time spent in the air, from takeoff to landing (in minutes).
27. **DISTANCE**: Distance of the flight in miles.
28. **DELAY_DUE_CARRIER**: Delay caused by the airline carrier (in minutes).
29. **DELAY_DUE_WEATHER**: Delay caused by weather conditions (in minutes).
30. **DELAY_DUE_NAS**: Delay caused by the National Airspace System (NAS), including air traffic control issues (in minutes).
31. **DELAY_DUE_SECURITY**: Delay caused by security issues (in minutes).
32. **DELAY_DUE_LATE_AIRCRAFT**: Delay caused by a late-arriving aircraft that impacted this flight’s schedule (in minutes).

'''

'\n\n1. **FL_DATE**: Flight date, showing when the flight was scheduled or took place.\n2. **AIRLINE**: The name of the airline operating the flight.\n3. **AIRLINE_DOT**: A more descriptive name of the airline as recognized by the Department of Transportation (DOT).\n4. **AIRLINE_CODE**: The IATA code (two-letter code) for the airline.\n5. **DOT_CODE**: The unique Department of Transportation (DOT) code for the airline.\n6. **FL_NUMBER**: Flight number assigned to this particular flight by the airline.\n7. **ORIGIN**: The IATA code for the origin (departure) airport.\n8. **ORIGIN_CITY**: The city where the origin (departure) airport is located.\n9. **DEST**: The IATA code for the destination (arrival) airport.\n10. **DEST_CITY**: The city where the destination (arrival) airport is located.\n11. **CRS_DEP_TIME**: Scheduled departure time (in local time).\n12. **DEP_TIME**: Actual departure time (in local time).\n13. **DEP_DELAY**: Departure delay in minutes. Negative values indicate ear

# Feature Engineering

1. **Month (extracted from FL_DATE)**:
   - **Reason**: Flight delays can exhibit seasonal trends due to factors like holiday travel surges, weather variations (e.g., winter storms), and seasonal flight schedules. By including the month, the model can capture these seasonal patterns, which can impact delay likelihood.

2. **AIRLINE_CODE**:
   - **Reason**: Different airlines have varying operational practices, policies, and resources, which can affect their on-time performance. Some airlines may have stricter schedules, while others might be more affected by operational delays. The airline code helps capture these airline-specific characteristics.

3. **ORIGIN (Origin Airport Code)**:
   - **Reason**: Some airports are more prone to delays due to congestion, local weather patterns, or geographic location. For example, airports in high-traffic cities or regions with frequent bad weather (like New York or Denver) might experience more delays. Including the origin airport helps account for these factors.

4. **DEST (Destination Airport Code)**:
   - **Reason**: Similar to the origin, the destination airport can influence delays due to factors like local congestion or weather. By including the destination code, the model can recognize patterns related to specific airports that might impact flight arrival times.

5. **CRS_DEP_TIME (Scheduled Departure Time)**:
   - **Reason**: Delays often correlate with peak travel times, such as early morning or late evening hours. Including the scheduled departure time helps the model recognize if certain times of the day are more delay-prone, as flights scheduled during busy periods may face higher delays due to airport congestion or air traffic control limitations.

6. **CRS_ELAPSED_TIME (Scheduled Flight Duration)**:
   - **Reason**: Longer flights may have a different risk profile for delays compared to shorter flights. For instance, longer flights might be more susceptible to in-flight rerouting or weather conditions en route. Scheduled flight duration provides context about the length of the journey, which can influence delay likelihood.

7. **DISTANCE**:
   - **Reason**: The distance of the flight can impact delay patterns, as longer flights may have different types of operational challenges. For instance, longer flights might be subject to stricter planning or affected by more complex air traffic control patterns. Including distance allows the model to distinguish between delays on short-haul versus long-haul flights.

### Summary
These features were chosen as they capture a broad range of factors that can contribute to flight delays:
- **Temporal trends** (Month),
- **Airline-specific patterns** (AIRLINE_CODE),
- **Airport-specific conditions** (ORIGIN, DEST),
- **Time of day** (CRS_DEP_TIME),
- **Flight characteristics** (CRS_ELAPSED_TIME and DISTANCE).

Together, these predictors give the model context on seasonal, operational, spatial, and scheduling factors that are likely to influence flight delays.

In [5]:
import pandas as pd

# Load the data
df = pd.read_csv('flights_sample_3m.csv')

# 1. Extract the month from FL_DATE
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'], errors='coerce')  # Ensure date format
df['Month'] = df['FL_DATE'].dt.month  # Extract month as a new column

# 2. Select the necessary columns
selected_columns = [
    'Month', 'AIRLINE_CODE', 'ORIGIN', 'DEST', 'CRS_DEP_TIME',
    'CRS_ELAPSED_TIME', 'DISTANCE', 'DEP_DELAY'
]
df_selected = df[selected_columns]

# 3. One-hot encode categorical columns (AIRLINE_CODE, ORIGIN, DEST)
df_encoded = pd.get_dummies(df_selected, columns=['AIRLINE_CODE', 'ORIGIN', 'DEST'], drop_first=True)

# 4. Save the feature-engineered dataset
df_encoded.to_csv('flight_feature_engineered.csv', index=False)

# Print the resulting columns and data types to confirm
print(df_encoded.dtypes)


Month                 int32
CRS_DEP_TIME          int64
CRS_ELAPSED_TIME    float64
DISTANCE            float64
DEP_DELAY           float64
                     ...   
DEST_XNA               bool
DEST_XWA               bool
DEST_YAK               bool
DEST_YKM               bool
DEST_YUM               bool
Length: 780, dtype: object


In [8]:
# Print each column and its data type
for col, dtype in df_encoded.dtypes.items():
    print(f"Column: {col} | Data Type: {dtype}")

Column: Month | Data Type: int32
Column: CRS_DEP_TIME | Data Type: int64
Column: CRS_ELAPSED_TIME | Data Type: float64
Column: DISTANCE | Data Type: float64
Column: DEP_DELAY | Data Type: float64
Column: AIRLINE_CODE_AA | Data Type: bool
Column: AIRLINE_CODE_AS | Data Type: bool
Column: AIRLINE_CODE_B6 | Data Type: bool
Column: AIRLINE_CODE_DL | Data Type: bool
Column: AIRLINE_CODE_EV | Data Type: bool
Column: AIRLINE_CODE_F9 | Data Type: bool
Column: AIRLINE_CODE_G4 | Data Type: bool
Column: AIRLINE_CODE_HA | Data Type: bool
Column: AIRLINE_CODE_MQ | Data Type: bool
Column: AIRLINE_CODE_NK | Data Type: bool
Column: AIRLINE_CODE_OH | Data Type: bool
Column: AIRLINE_CODE_OO | Data Type: bool
Column: AIRLINE_CODE_QX | Data Type: bool
Column: AIRLINE_CODE_UA | Data Type: bool
Column: AIRLINE_CODE_WN | Data Type: bool
Column: AIRLINE_CODE_YV | Data Type: bool
Column: AIRLINE_CODE_YX | Data Type: bool
Column: ORIGIN_ABI | Data Type: bool
Column: ORIGIN_ABQ | Data Type: bool
Column: ORIGIN_A

In [9]:
df_encoded.head()

Unnamed: 0,Month,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DEP_DELAY,AIRLINE_CODE_AA,AIRLINE_CODE_AS,AIRLINE_CODE_B6,AIRLINE_CODE_DL,AIRLINE_CODE_EV,...,DEST_VEL,DEST_VLD,DEST_VPS,DEST_WRG,DEST_WYS,DEST_XNA,DEST_XWA,DEST_YAK,DEST_YKM,DEST_YUM
0,1,1155,186.0,1065.0,-4.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,11,2120,235.0,1399.0,-6.0,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,7,954,118.0,680.0,6.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,1609,260.0,1589.0,-1.0,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,2,1840,181.0,985.0,-2.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Model Training

In [1]:
import pandas as pd

# Load your dataset
df_encoded = pd.read_csv("flight_feature_engineered.csv")  

In [2]:
X = df_encoded.drop(columns=['DEP_DELAY'])  # Features
y = df_encoded['DEP_DELAY']                 # Target

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
import numpy as np

# Impute NaN values in y_train with the mean of y_train
y_train = np.where(np.isnan(y_train), np.nanmean(y_train), y_train)

# Verify there are no more NaN values
print("Number of NaN values in y_train after imputation:", np.isnan(y_train).sum())

Number of NaN values in y_train after imputation: 0


In [5]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [6]:
import joblib

# Save the trained model to your specified path
joblib.dump(rf_model, 'flight_random_forest_model.pkl')

['flight_random_forest_model.pkl']

In [7]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = rf_model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)

ValueError: Input contains NaN.