# Prepare Workspace

In [19]:
import pandas as pd
import numpy as np

In [None]:
# Load datasets explicitly
airports = pd.read_csv('/Users/michael/Desktop/CS506 Project/Test/Airports.csv', skiprows=1)
flights = pd.read_csv('/Users/michael/Desktop/CS506 Project/Test/Flights.csv')
weather = pd.read_csv('/Users/michael/Desktop/CS506 Project/Test/Weather.csv')

# Merge Step 1: Clean and Prepare Airports and Weather Datasets

In [7]:
# Explicitly clean and prepare Airports DataFrame to map ICAO to IATA clearly
airport_codes = airports[['icao', 'code']].copy()
airport_codes.columns = ['ICAO', 'IATA']
airport_codes['ICAO'] = airport_codes['ICAO'].str.strip().str.upper()
airport_codes['IATA'] = airport_codes['IATA'].str.strip().str.upper()

# Clean Weather dataset explicitly and prepare ICAO and date columns
weather['AirportCode'] = weather['AirportCode'].str.strip().str.upper()
weather['Date'] = pd.to_datetime(weather['StartTime(UTC)']).dt.date

# Merge Step 2: Merge Weather and Airports for IATA Codes

In [8]:
# Merge weather explicitly with airports to get IATA codes
weather_with_iata = weather.merge(
    airport_codes,
    left_on='AirportCode',
    right_on='ICAO',
    how='left'
)

# Merge Step 3: Clean Flights Dataset

In [9]:
# Clean flights dataset explicitly
flights['ORIGIN'] = flights['ORIGIN'].str.strip().str.upper()
flights['DEST'] = flights['DEST'].str.strip().str.upper()
flights['Date'] = pd.to_datetime(flights['FL_DATE']).dt.date

# Merge Step 4: Merge Weather and Flight Datasets

In [10]:
# Merge weather data explicitly with flights data based on ORIGIN airport and Date
merged_origin_weather = flights.merge(
    weather_with_iata,
    left_on=['ORIGIN', 'Date'],
    right_on=['IATA', 'Date'],
    how='left',
    suffixes=('', '_origin_weather')
).rename(columns={
    'Type': 'WeatherType_Origin',
    'Severity': 'Severity_Origin',
    'Precipitation(in)': 'Precipitation_Origin'
})

# Merge weather data explicitly again with flights data for DEST airport
merged_full_weather = merged_origin_weather.merge(
    weather_with_iata,
    left_on=['DEST', 'Date'],
    right_on=['IATA', 'Date'],
    how='left',
    suffixes=('', '_dest_weather')
).rename(columns={
    'Type': 'WeatherType_Dest',
    'Severity': 'Severity_Dest',
    'Precipitation(in)': 'Precipitation_Dest'
})

# Merge Step 5: Clean and Finalise

In [16]:
# Keep only relevant columns
columns_to_keep = ['FL_DATE', 'AIRLINE', 'ORIGIN_CITY', 'DEST_CITY', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'CANCELLATION_CODE', 'AIR_TIME', 'DISTANCE', 'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT', 'WeatherType_Origin', 'Severity_Origin', 'StartTime(UTC)', 'EndTime(UTC)', 'Precipitation_Origin', 'WeatherType_Dest', 'Severity_Dest', 'StartTime(UTC)_dest_weather', 'EndTime(UTC)_dest_weather', 'Precipitation_Dest']
final_dataset = merged_full_weather[columns_to_keep]
time_columns = ['StartTime(UTC)', 'EndTime(UTC)', 'StartTime(UTC)_dest_weather', 'EndTime(UTC)_dest_weather']

# Fix time formatting
for col in time_columns:
    final_dataset[col] = pd.to_datetime(final_dataset[col], errors='coerce').dt.time

# Fill text/categorical columns
categorical_fill = ['CANCELLATION_CODE', 'WeatherType_Origin', 'Severity_Origin', 'WeatherType_Dest', 'Severity_Dest']
final_dataset[categorical_fill] = final_dataset[categorical_fill].fillna('None')

# Fill numerical delay reasons and precipitation
numerical_zero_fill = ['DELAY_DUE_LATE_AIRCRAFT', 'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY', 'Precipitation_Origin', 'Precipitation_Dest']
final_dataset[numerical_zero_fill] = final_dataset[numerical_zero_fill].fillna(0)

# Fill AIR_TIME, ARR_TIME, DEP_TIME, ARR_DELAY, DEP_DELAY with -1 to flag missing
time_cols_fill = ['AIR_TIME', 'ARR_TIME', 'DEP_TIME', 'ARR_DELAY', 'DEP_DELAY']
final_dataset[time_cols_fill] = final_dataset[time_cols_fill].fillna(-1)

# Fill StartTime(UTC) and EndTime(UTC) weather columns with 'None'
time_weather_fill = ['StartTime(UTC)', 'EndTime(UTC)', 'StartTime(UTC)_dest_weather', 'EndTime(UTC)_dest_weather']
final_dataset[time_weather_fill] = final_dataset[time_weather_fill].fillna('None')

# Create binary delay label
final_dataset['DELAYED'] = (final_dataset['ARR_DELAY'] >= 15).astype(int)

# Save explicitly the final cleaned merged dataset
final_dataset.to_csv('final_flights_weather_merged.csv', index=False)

# Explicitly verify the merged dataset clearly
print("✅ Successfully merged Flights and Weather datasets. Here's a preview:")
print(final_dataset.head())
print(f"\nFinal merged dataset dimensions: {final_dataset.shape}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset[col] = pd.to_datetime(final_dataset[col], errors='coerce').dt.time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset[col] = pd.to_datetime(final_dataset[col], errors='coerce').dt.time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset[col] = pd.to_datetime(final_d

✅ Successfully merged Flights and Weather datasets. Here's a preview:
      FL_DATE                AIRLINE          ORIGIN_CITY    DEST_CITY  \
0  2019-01-09  United Air Lines Inc.  Fort Lauderdale, FL   Newark, NJ   
1  2022-11-19   Delta Air Lines Inc.      Minneapolis, MN  Seattle, WA   
2  2022-11-19   Delta Air Lines Inc.      Minneapolis, MN  Seattle, WA   
3  2022-11-19   Delta Air Lines Inc.      Minneapolis, MN  Seattle, WA   
4  2022-11-19   Delta Air Lines Inc.      Minneapolis, MN  Seattle, WA   

   CRS_DEP_TIME  DEP_TIME  DEP_DELAY  CRS_ARR_TIME  ARR_TIME  ARR_DELAY  ...  \
0          1155    1151.0       -4.0          1501    1447.0      -14.0  ...   
1          2120    2114.0       -6.0          2315    2310.0       -5.0  ...   
2          2120    2114.0       -6.0          2315    2310.0       -5.0  ...   
3          2120    2114.0       -6.0          2315    2310.0       -5.0  ...   
4          2120    2114.0       -6.0          2315    2310.0       -5.0  ...   

  Se

# Train Step 1: Split into Training and Testing Data

In [17]:
# Make sure FL_DATE is datetime
final_dataset['FL_DATE'] = pd.to_datetime(final_dataset['FL_DATE'])

# Define your splits
train_mask = (
    ((final_dataset['FL_DATE'] >= '2019-01-01') & (final_dataset['FL_DATE'] <= '2020-03-31')) |
    ((final_dataset['FL_DATE'] >= '2021-01-01') & (final_dataset['FL_DATE'] <= '2022-06-30'))
)

test_mask = (
    (final_dataset['FL_DATE'] > '2022-06-30') & (final_dataset['FL_DATE'] <= '2023-06-30')
)

# Split
train_data = final_dataset[train_mask].reset_index(drop=True)
test_data_1 = final_dataset[test_mask].reset_index(drop=True)
test_data_2 = final_dataset[test_mask].reset_index(drop=True)

print(f"Train shape: {train_data.shape}")
print(f"Test 1 shape: {test_data_1.shape}")
print(f"Test 2 shape: {test_data_2.shape}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['FL_DATE'] = pd.to_datetime(final_dataset['FL_DATE'])


Train shape: (8896833, 29)
Test 1 shape: (1662195, 29)
Test 2 shape: (1662195, 29)


# Train Step 2: Adjust Test 2 for Climate Data

In [20]:
# Define Parameters
future_precip_increase = 1.10  # +10% precipitation overall
heavy_precip_threshold = 0.3   # Threshold for heavy precipitation in inches
heavy_event_intensity_increase = 1.10  # +10% more rain on heavy days
frequency_boost_percent = 16  # +16% more rainy days
new_precip_range = (0.05, 0.1)  # Inches for new light rain days

def remap_severity(df, precip_col, severity_col):
    """
    Remaps severity based on precipitation values.
    """
    conditions = [
        (df[precip_col].isna()) | (df[precip_col] == 0),
        (df[precip_col] > 0) & (df[precip_col] <= 0.1),
        (df[precip_col] > 0.1) & (df[precip_col] <= 0.5),
        (df[precip_col] > 0.5) & (df[precip_col] <= 1.0),
        (df[precip_col] > 1.0)
    ]
    choices = [np.nan, 'Light', 'Moderate', 'Heavy', 'Severe']
    
    df[severity_col] = np.select(conditions, choices, default=np.nan)

# Adjust Precipitation on test_data_2
adjusted_test_data_2 = test_data_2.copy()

# Apply +10% to all precipitation values
for col in ['Precipitation_Origin', 'Precipitation_Dest']:
    if col in adjusted_test_data_2.columns:
        adjusted_test_data_2[col] = adjusted_test_data_2[col] * future_precip_increase

# Further increase heavy precipitation events
for col in ['Precipitation_Origin', 'Precipitation_Dest']:
    if col in adjusted_test_data_2.columns:
        heavy_mask = adjusted_test_data_2[col] > heavy_precip_threshold
        adjusted_test_data_2.loc[heavy_mask, col] *= heavy_event_intensity_increase

# Adjust Precipitation Frequency 
for col in ['Precipitation_Origin', 'Precipitation_Dest']:
    if col in adjusted_test_data_2.columns:
        dry_mask = adjusted_test_data_2[col] == 0.0
        dry_indices = adjusted_test_data_2[dry_mask].index

        # How many dry days to convert to light rain?
        num_to_convert = int(len(dry_indices) * (frequency_boost_percent / 100))

        if num_to_convert > 0:
            selected_indices = np.random.choice(dry_indices, size=num_to_convert, replace=False)
            adjusted_test_data_2.loc[selected_indices, col] = np.random.uniform(new_precip_range[0], new_precip_range[1], size=num_to_convert)

# Apply remapping to Severity columns
remap_severity(adjusted_test_data_2, 'Precipitation_Origin', 'Severity_Origin')
remap_severity(adjusted_test_data_2, 'Precipitation_Dest', 'Severity_Dest')