## Root cause analysis (RCA) of flight statistics during Hurricane Sandy

![Hurricane Sandy](hurricane-sandy-nasa-image.jpg)

Hurricane Sandy hit the northeast coast of the United States on the 29th of October 2012 and dissipated on the 2nd of November 2012. The most affected cities were New York, Philadelphia, Boston, and Washington DC. Therefore, we're analyzing the following airports: 

**New York City:**
* John F. Kennedy International Airport (JFK)
* LaGuardia Airport (LGA)
* Newark Liberty International Airport (EWR)

**Philadelphia:**
* Philadelphia International Airport (PHL)

**Boston:**
* Logan International Airport (BOS)

**Washington D.C.:**
* Ronald Reagan Washington National Airport (DCA)
* Washington Dulles International Airport (IAD)
* Baltimore/Washington International Thurgood Marshall Airport (BWI)

### Hypotheses: 
1. Hurricane Sandy caused a drastic increase in flight cancellations from the 29th of October 2012 to the 2nd of November 2012 at the following airports: JFK, LGA, EWR, PHL, BOS, DCA, IAD, BWI.
2. On the 29th of October 2012, most flights landing at the airports JFK, LGA, EWR, PHL, BOS, DCA, IAD, BWI were diverted.
3. By comparing weather and flight data from 2011 and 2012, it's apparent that Hurricane Sandy was the primary cause of a large number of flight cancellations in 2012.

In [5]:
# Import all necessary libraries
import pandas as pd
import requests
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
import time
import json
import numpy as np
import psycopg2 # needed to get database exception errors when uploading dataframe
from zipfile import * # package for unzipping zip files
from sql_functions import get_engine
load_dotenv()

True

### 2012

In [6]:
# Weather data for October-November 2012
# Daily weather is requested per airport from the Meteostat "point/daily"
# endpoint on RapidAPI and collected into one DataFrame.
url = 'https://meteostat.p.rapidapi.com/point/daily'
headers = {
   "x-rapidapi-host": 'meteostat.p.rapidapi.com',
   "x-rapidapi-key": os.getenv('x-rapidapi-key')  # Ensure this environment variable is set
}

# Airports under analysis and their coordinates
airports = {
    "JFK": {"lat": 40.6413, "lon": -73.7781},  # John F. Kennedy International Airport
    "LGA": {"lat": 40.7769, "lon": -73.8740},  # LaGuardia Airport
    "EWR": {"lat": 40.6895, "lon": -74.1745},  # Newark Liberty International Airport
    "PHL": {"lat": 39.8729, "lon": -75.2437},  # Philadelphia International Airport
    "BOS": {"lat": 42.3656, "lon": -71.0096},  # Boston Logan International Airport
    "DCA": {"lat": 38.8512, "lon": -77.0402},  # Ronald Reagan Washington National Airport
    "IAD": {"lat": 38.9531, "lon": -77.4565},  # Washington Dulles International Airport
    "BWI": {"lat": 39.1754, "lon": -76.6684}   # Baltimore/Washington International Thurgood Marshall Airport
}

# Date range sent to the API as "start" / "end"
start_date = "2012-10-01"
end_date = "2012-11-30"
weather_data_2012 = []

for airport_code, coordinates in airports.items():
    parameters = {
        "lat": coordinates["lat"],
        "lon": coordinates["lon"],
        "start": start_date,
        "end": end_date,
        "units": "metric"
    }
    time.sleep(1)  # pause between requests (presumably to respect the API rate limit)
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(url, headers=headers, params=parameters, timeout=30)

    if response.status_code == 200:
        data = response.json()
        # Tag every daily record with its airport so the rows stay
        # distinguishable once all airports share one table.
        for daily_data in data['data']:
            daily_data['airport_code'] = airport_code
            weather_data_2012.append(daily_data)
    else:
        print(f"Error fetching data for {airport_code}: {response.status_code} - {response.text}") 

# List of per-day dictionaries -> DataFrame
weather_df_2012 = pd.DataFrame(weather_data_2012)


print(weather_df_2012.head())

         date  tavg  tmin  tmax  prcp  snow   wdir  wspd  wpgt    pres  tsun  \
0  2012-10-01  17.1  12.2  22.2   0.0   0.0  262.0  18.4  None  1012.2  None   
1  2012-10-02  18.9  17.2  21.7   8.4   0.0    NaN   9.4  None  1015.8  None   
2  2012-10-03  20.3  18.3  23.3   0.0   0.0    NaN   6.1  None  1017.7  None   
3  2012-10-04  20.8  19.4  23.9  10.4   0.0    NaN   5.4  None  1019.3  None   
4  2012-10-05  20.7  16.1  25.0   0.0   0.0    NaN  12.6  None  1016.8  None   

  airport_code  
0          JFK  
1          JFK  
2          JFK  
3          JFK  
4          JFK  


In [None]:
# Cleaning steps for 2012
# Drop the empty columns
weather_df_2012 = weather_df_2012.drop(columns=['wpgt', 'tsun'])

# Fill missing wind-speed values with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selected column is a chained assignment that pandas has deprecated and
# that does not modify the DataFrame when copy-on-write is active.
weather_df_2012['wspd'] = weather_df_2012['wspd'].fillna(0)

# Convert date column to datetime
weather_df_2012['date'] = pd.to_datetime(weather_df_2012['date'])

In [None]:
# Write the 2012 weather records stored in the dataframe to the SQL database
table_name = 'weather_data_2012'
schema = 'cgn_analytics_24_3'
engine = get_engine()

# Identity comparison is the reliable way to test for None.
if engine is not None:
    try:
        weather_df_2012.to_sql(table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # your class schema
                        index=False, # Do not write the DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling: psycopg2.DatabaseError derives from Exception, so one
    # handler covers database errors as well; the message is printed and the
    # engine is reset so later checks see that the upload failed.
    except Exception as error:
        print(error)
        engine = None

In [None]:
# Average wind speed per airport, October-November 2012
plt.figure(figsize=(14, 7))
sns.lineplot(
    x='date',
    y='wspd',
    hue='airport_code',
    marker='o',
    data=weather_df_2012,
)

# Axis text
plt.title('Avg Wind Speed Oct - Nov 2012')
plt.ylabel('Avg Wind Speed (km/h)')
plt.xlabel('Date')

# Label every second distinct date to keep the x axis readable
dates = weather_df_2012['date'].unique()
every_second_date = dates[::2]
plt.xticks(every_second_date, rotation=45)

plt.legend(title='Airport Code')
plt.show()

In [None]:
# get_dataframe is otherwise only imported in a later cell; importing it here
# keeps this cell working when the notebook is executed from top to bottom.
from sql_functions import get_dataframe

# Daily diversion figures; flight_date is additionally selected under the
# alias "date" so the column name matches the weather data.
diversions_df = get_dataframe('''select *, flight_date as date
                   from cgn_analytics_24_3.diverted_per_day_2012_sandy''')

diversions_df['date'] = pd.to_datetime(diversions_df['date'])
# The original flight_date column is redundant once "date" exists.
diversions_2012 = diversions_df.drop(columns=['flight_date'])

In [None]:
# Mean of total_diverted for each date, returned with "date" as a regular column
daily_avg_diversions = (
    diversions_2012
    .groupby('date')['total_diverted']
    .mean()
    .reset_index()
)
daily_avg_diversions

In [None]:
# Line chart of the mean diversions per day (2012)
diversion_days = daily_avg_diversions['date']
mean_diversions = daily_avg_diversions['total_diverted']

plt.figure(figsize=(14, 7))
plt.plot(diversion_days, mean_diversions, marker='o')

plt.title('Daily Average Total Diversions Over Time (2012)')
plt.ylabel('Average Total Diversions')
plt.xlabel('Date')

# Tick positions are taken from the weather frame: every second distinct date
dates = weather_df_2012['date'].unique()
plt.xticks(dates[::2], rotation=45)

plt.grid(False)
plt.show()

In [None]:
from sql_functions import get_dataframe

In [None]:
# Load the per-day cancellation table for 2012 and rename its key columns
# to "date" / "airport_code" in a single in-place rename.
df_cancellations = get_dataframe('''select * 
                   from cgn_analytics_24_3.cancellations_per_day_2012_sandy''')
df_cancellations.rename(
    columns={'flight_date': 'date', 'origin': 'airport_code'},
    inplace=True,
)
df_cancellations.tail(10)

In [None]:
# Share of cancelled flights per day, October-November 2012.
# No hue is given, so seaborn aggregates all rows that share a date into one line.
plt.figure(figsize=(20, 9))
sns.lineplot(data=df_cancellations, x='date', y='cancellation_percentage', marker='o')
# Tick every second distinct date. Slicing the raw column instead would step
# through rows, and the frame may hold several rows (airports) per date.
plt.xticks(df_cancellations['date'].unique()[::2], rotation=45)
plt.title('Daily avg percent of flight cancellations in October-November 2012')
plt.xlabel('Date')
plt.ylabel('Percent of cancelled flights')

plt.show()

In [None]:
# Read the stored 2012 weather table back and make sure "date" is a datetime column
weather_2012_query = 'select * from cgn_analytics_24_3.weather_data_2012'
df_weather = get_dataframe(weather_2012_query)
df_weather['date'] = pd.to_datetime(df_weather['date'])
df_weather

In [None]:
# Combine cancellations and weather: rows are matched on date and airport code
df_w_c = df_cancellations.merge(df_weather, on=['date', 'airport_code'])
df_w_c

In [None]:
# Cancellation percentage per origin airport, October-November 2012.
# df_cancellations was renamed in place when it was loaded
# (flight_date -> date, origin -> airport_code), so the renamed columns
# have to be used here; the old names no longer exist in the frame.
plt.figure(figsize=(20, 9))
sns.lineplot(data=df_cancellations, x='date', y='cancellation_percentage', hue='airport_code', marker='o')
plt.xticks(df_cancellations.date[::16], rotation=45)
plt.title('Flights cancellations October-November 2012')
plt.xlabel('Date')
plt.ylabel('Percentage of cancelled flights')
plt.legend(title= 'Origin Airport Code')
plt.grid(True)

plt.show()

In [None]:
# Cancellation percentage from the merged cancellation/weather frame.
plt.figure(figsize=(20, 9))
# hue splits the lines by airport; without it the plot has no labelled
# artists and the "Origin Airport Code" legend requested below stays empty.
sns.lineplot(data=df_w_c, x='date', y='cancellation_percentage', hue='airport_code', marker='o')
plt.xticks(df_w_c.date[::16], rotation=45)
plt.title('Flights cancellations October-November 2012')
plt.xlabel('Date')
plt.ylabel('Percentage of cancelled flights')
plt.legend(title= 'Origin Airport Code')
plt.grid(True)

plt.show()

In [None]:
# Load the per-day diversion table for 2012 and preview its first rows
diverted_2012_query = 'select * from cgn_analytics_24_3.diverted_per_day_2012_sandy'
df_diverted = get_dataframe(diverted_2012_query)
df_diverted.head()

In [None]:
# Diversion percentage per destination airport, October-November 2012.
plt.figure(figsize=(20, 9))
sns.lineplot(data=df_diverted, x='flight_date', y='diverted_percentage', hue='dest', marker='o')
plt.xticks(df_diverted.flight_date[::16], rotation=45)
plt.title('Flights diversions October-November 2012')
plt.xlabel('Date')
plt.ylabel('Percentage of diverted flights')
# The lines are coloured by the "dest" column, so the legend names destinations.
plt.legend(title= 'Destination Airport Code')

plt.show()

In [None]:
# Re-read the stored 2012 weather table and show its last ten rows
weather_table_query = 'select * from cgn_analytics_24_3.weather_data_2012'
df_weather = get_dataframe(weather_table_query)
df_weather.tail(10)

In [None]:
# Wind speed, precipitation and cancellation percentage drawn on one shared axis
series_to_plot = (
    ('wspd', 'Wind Speed'),
    ('prcp', 'Precipitation'),
    ('cancellation_percentage', 'Cancellation Percentage'),
)

plt.figure(figsize=(20, 9))
for column_name, series_label in series_to_plot:
    sns.lineplot(data=df_w_c, x='date', y=column_name, marker='o', label=series_label)

plt.xticks(df_w_c.date[::16], rotation=45)
plt.title('Flights cancellation to weather conditions October-November 2012')
plt.ylabel('Value')
plt.xlabel('Date')

plt.show()

In [None]:
# Air pressure over time from the 2012 weather table
plt.figure(figsize=(20, 9))
sns.lineplot(x='date', y='pres', marker='o', data=df_weather)

# A tick at every fourth row of the date column
pressure_ticks = df_weather.date[::4]
plt.xticks(pressure_ticks, rotation=45)

plt.title('Air pressure October-November 2012')
plt.ylabel('Pressure')
plt.xlabel('Date')

plt.show()

In [None]:
# Mean pressure per date, kept as a flat frame with "date" as a regular column
pressure_avg_daily = df_weather.groupby('date', as_index=False)['pres'].mean()

# Make sure the grouping key is a datetime column, then print the frame summary
pressure_avg_daily['date'] = pd.to_datetime(pressure_avg_daily['date'])
pressure_avg_daily.info()

In [None]:
# Join the daily mean pressure with daily cancellation totals on "date".
# NOTE(review): cancellations_grouped_df is not defined in any visible cell of
# this notebook; it presumably holds one row per date with a
# total_cancellations column (that column is plotted in the next cell) --
# confirm where it is created, otherwise this cell raises NameError.
pres_cancellations_df = pd.merge(pressure_avg_daily, cancellations_grouped_df, on='date')
pres_cancellations_df.head()

In [None]:
# Daily mean pressure and total cancellations drawn on one shared axis
plt.figure(figsize=(20, 9))
for plotted_column in ('pres', 'total_cancellations'):
    sns.lineplot(data=pres_cancellations_df, x='date', y=plotted_column, marker='o')

# A tick at every third row of the weather date column
plt.xticks(df_weather.date[::3], rotation=45)
plt.ylabel('value')
plt.xlabel('Date')
plt.grid(True)

plt.show()

### 2011

In [None]:
# Weather data for October-November 2011 (comparison year)
# Daily weather is requested per airport from the Meteostat "point/daily"
# endpoint on RapidAPI and collected into one DataFrame.
url = 'https://meteostat.p.rapidapi.com/point/daily'
headers = {
   "x-rapidapi-host": 'meteostat.p.rapidapi.com',
   "x-rapidapi-key": os.getenv('x-rapidapi-key')  # Ensure this environment variable is set
}

# Airports under analysis and their coordinates
airports = {
    "JFK": {"lat": 40.6413, "lon": -73.7781},  # John F. Kennedy International Airport
    "LGA": {"lat": 40.7769, "lon": -73.8740},  # LaGuardia Airport
    "EWR": {"lat": 40.6895, "lon": -74.1745},  # Newark Liberty International Airport
    "PHL": {"lat": 39.8729, "lon": -75.2437},  # Philadelphia International Airport
    "BOS": {"lat": 42.3656, "lon": -71.0096},  # Boston Logan International Airport
    "DCA": {"lat": 38.8512, "lon": -77.0402},  # Ronald Reagan Washington National Airport
    "IAD": {"lat": 38.9531, "lon": -77.4565},  # Washington Dulles International Airport
    "BWI": {"lat": 39.1754, "lon": -76.6684}   # Baltimore/Washington International Thurgood Marshall Airport
}

# Date range sent to the API as "start" / "end"
start_date = "2011-10-01"
end_date = "2011-11-30"
weather_data_2011 = []

for airport_code, coordinates in airports.items():
    parameters = {
        "lat": coordinates["lat"],
        "lon": coordinates["lon"],
        "start": start_date,
        "end": end_date,
        "units": "metric"
    }
    time.sleep(1)  # pause between requests (presumably to respect the API rate limit)
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(url, headers=headers, params=parameters, timeout=30)

    if response.status_code == 200:
        data = response.json()
        # Tag every daily record with its airport so the rows stay
        # distinguishable once all airports share one table.
        for daily_data in data['data']:
            daily_data['airport_code'] = airport_code
            weather_data_2011.append(daily_data)
    else:
        print(f"Error fetching data for {airport_code}: {response.status_code} - {response.text}") 

# List of per-day dictionaries -> DataFrame
weather_df_2011 = pd.DataFrame(weather_data_2011)

In [None]:
# Cleaning steps for 2011
# Drop the empty columns
weather_df_2011 = weather_df_2011.drop(columns=['wpgt', 'tsun'])

# Fill missing wind-speed values with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selected column is a chained assignment that pandas has deprecated and
# that does not modify the DataFrame when copy-on-write is active.
weather_df_2011['wspd'] = weather_df_2011['wspd'].fillna(0)

# Convert date column to datetime
weather_df_2011['date'] = pd.to_datetime(weather_df_2011['date'])

In [None]:
# Write the 2011 weather records stored in the dataframe to the SQL database
table_name = 'weather_data_2011'
schema = 'cgn_analytics_24_3'
engine = get_engine()

# Identity comparison is the reliable way to test for None.
if engine is not None:
    try:
        weather_df_2011.to_sql(table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # your class schema
                        index=False, # Do not write the DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling: psycopg2.DatabaseError derives from Exception, so one
    # handler covers database errors as well; the message is printed and the
    # engine is reset so later checks see that the upload failed.
    except Exception as error:
        print(error)
        engine = None

In [None]:
# Folder (relative, with trailing slash) where zip and csv files are stored
path = 'data/'

In [None]:
def download_data(year, month):
    """Download one month of BTS on-time performance data as a zip file.

    The archive is fetched from https://transtats.bts.gov/PREZIP/ and written
    into the folder named by the module-level ``path``, under the same file
    name it has on the server.

    Parameters
    ----------
    year : int or str
        Year inserted into the file name.
    month : int or str
        Month inserted into the file name.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check the
        error page would be saved under the zip name and only fail later
        during extraction.
    """
    zip_file = f'On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{year}_{month}.zip'
    url = f'https://transtats.bts.gov/PREZIP/{zip_file}'
    # NOTE(review): verify=False switches off TLS certificate verification.
    # It is kept because the original code relies on it -- confirm whether the
    # server's certificate validates now and drop the flag if it does.
    # The timeout stops a stalled download from blocking forever.
    r = requests.get(url, verify=False, timeout=120)
    r.raise_for_status()
    # Save the archive to local file storage
    with open(path+zip_file, 'wb') as f:
        f.write(r.content)
    print(f'--> zip_file with name: {zip_file} downloaded succesfully.' )

In [None]:
def extract_zip(year, month, data_dir=None):
    """Extract a previously downloaded BTS zip archive.

    The archive name is built exactly like in ``download_data`` (no
    parentheses around ``1987_present``); the earlier spelling with
    parentheses never matched the file written by the download step.

    Parameters
    ----------
    year : int or str
        Year inserted into the file name.
    month : int or str
        Month inserted into the file name.
    data_dir : str, optional
        Folder prefix (including trailing separator) that holds the archive
        and receives the extracted files. Defaults to the module-level
        ``path``.
    """
    if data_dir is None:
        data_dir = path
    zip_file = f'On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{year}_{month}.zip'
    with ZipFile(data_dir+zip_file, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
        # Report the first archive member as the extracted csv
        csv_file =  zip_ref.namelist()[0]
        print(f'--> zip_file was succesfully extracted to: {csv_file}.' )

In [None]:
# Years and months to fetch (either list may hold a single entry)
years_list = [2011]
months_list = [10, 11]

# Every (year, month) combination, years outermost, in list order
year_month_pairs = [(y, m) for y in years_list for m in months_list]

# Download each monthly zip file and unpack it right away
for year, month in year_month_pairs:
    download_data(year, month)
    extract_zip(year, month)

In [None]:
# Names of the extracted csv files for October and November 2011
csv_file_10_11 = 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2011_10.csv'
csv_file_11_11 = 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2011_11.csv'

# October: load the file, then show its dimensions and first rows
df_oct_2011 = pd.read_csv(path + csv_file_10_11, low_memory=False)
display(df_oct_2011.shape)
display(df_oct_2011.head())

# November: same steps
df_nov_2011 = pd.read_csv(path + csv_file_11_11, low_memory=False)
display(df_nov_2011.shape)
display(df_nov_2011.head())

In [None]:
# Columns of the downloaded BTS file that are kept.
# The order matters: the columns are later renamed by position.
columns_to_keep = [
    # date, departure
    'FlightDate', 'DepTime', 'CRSDepTime', 'DepDelay',
    # arrival
    'ArrTime', 'CRSArrTime', 'ArrDelay',
    # airline, aircraft, flight number
    'Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline',
    # route
    'Origin', 'Dest',
    # duration and distance
    'AirTime', 'ActualElapsedTime', 'Distance',
    # status flags
    'Cancelled', 'Diverted',
]

In [None]:
schema = 'cgn_analytics_24_3' # UPDATE 'TABLE_SCHEMA' based on schema used in class 
engine = get_engine() # assign engine to be able to query against the database

# Column names of the "flights" table in database order.
# The schema is passed as a bound parameter instead of being formatted into
# the SQL text.
table_name_sql = '''SELECT COLUMN_NAME 
                    FROM INFORMATION_SCHEMA.COLUMNS 
                    WHERE TABLE_NAME = 'flights'
                    AND TABLE_SCHEMA = :schema
                    ORDER BY ordinal_position'''
# Engine.execute() no longer exists in SQLAlchemy 2.0; running the statement
# on an explicit connection with text() works on 1.4 and 2.0 alike, and the
# with-block returns the connection to the pool afterwards.
with engine.connect() as connection:
    c_names = connection.execute(sqlalchemy.text(table_name_sql), {'schema': schema}).fetchall()
c_names

In [None]:
# Each fetched row holds the column name as its first element; collect them in order
new_column_names = [row[0] for row in c_names]
new_column_names

In [None]:
def clean_airline_df(df):
    '''
    Transform a dataframe read from a BTS csv file into one that is ready
    to be uploaded to SQL.

    Only the columns listed in the module-level ``columns_to_keep`` are kept,
    the date and numeric columns are converted, and the columns are renamed
    by position to the module-level ``new_column_names`` (which therefore has
    to contain one name per kept column, in the same order).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw BTS data containing at least the columns in ``columns_to_keep``.

    Returns
    -------
    pandas.DataFrame
        A new dataframe; ``df`` itself is not modified.
    '''

    # Build dataframe including only the columns you want to keep.
    # The explicit copy makes the assignments below act on an independent
    # frame, so they never write into a slice of the caller's data and do
    # not trigger SettingWithCopyWarning.
    df_airline = df.loc[:,columns_to_keep].copy()
     
    # Clean data types and NULLs; errors='coerce' turns unparsable values into NaN
    df_airline['FlightDate']= pd.to_datetime(df_airline['FlightDate'], yearfirst=True)
    df_airline['CRSArrTime']= pd.to_numeric(df_airline['CRSArrTime'], downcast='integer', errors='coerce')
    df_airline['Cancelled']= pd.to_numeric(df_airline['Cancelled'], downcast='integer')
    df_airline['Diverted']= pd.to_numeric(df_airline['Diverted'], downcast='integer')
    df_airline['ActualElapsedTime']= pd.to_numeric(df_airline['ActualElapsedTime'], downcast='integer', errors='coerce')
    
    # Rename columns (positional: first kept column gets the first new name, ...)
    df_airline.columns = new_column_names
    
    return df_airline

In [None]:
# Prints the function object itself; clean_airline_df is not called here
print(clean_airline_df)

In [None]:
# Show the target column names that clean_airline_df assigns by position
print(new_column_names)

In [None]:
# Clean the October 2011 flights and preview the first rows
df_oct_2011_clean = clean_airline_df(df_oct_2011)
df_oct_2011_clean.head()

In [None]:
# Clean the November 2011 flights and preview the first rows
df_nov_2011_clean = clean_airline_df(df_nov_2011)
df_nov_2011_clean.head()

In [None]:
def select_airport(df, airports):
    '''Return the rows of df whose origin or dest is one of the given airports.'''
    departs_from_selection = df['origin'].isin(airports)
    arrives_at_selection = df['dest'].isin(airports)
    return df.loc[departs_from_selection | arrives_at_selection]


# Airport codes used for filtering the flights data
airports = [
    'JFK', 'LGA', 'EWR',   # New York City
    'PHL',                 # Philadelphia
    'BOS',                 # Boston
    'DCA', 'IAD', 'BWI',   # Washington D.C.
]

In [None]:
# Keep only flights that touch one of the selected airports.
# With an empty airport list nothing is filtered. Both branches bind the same
# name, so the .info() call below works in either case (the fallback used to
# assign a different variable, leaving this one undefined).
if airports:
    df_oct_2011_selected_airports = select_airport(df_oct_2011_clean, airports)
else:
    df_oct_2011_selected_airports = df_oct_2011_clean
df_oct_2011_selected_airports.info()

In [None]:
# Keep only flights that touch one of the selected airports.
# With an empty airport list nothing is filtered. Both branches bind the same
# name, so the .info() call below works in either case (the fallback used to
# assign a different variable, leaving this one undefined).
if airports:
    df_nov_2011_selected_airports = select_airport(df_nov_2011_clean, airports)
else:
    df_nov_2011_selected_airports = df_nov_2011_clean
df_nov_2011_selected_airports.info()

In [None]:
# Stack October on top of November; ignore_index renumbers the rows 0..n-1
combined_df = pd.concat(
    [df_oct_2011_selected_airports, df_nov_2011_selected_airports],
    ignore_index=True,
)
combined_df.tail(10)

In [None]:
from sql_functions import get_engine

In [None]:
table_name = 'flights_oct_nov_2011_sandy'
engine = get_engine()
schema = 'cgn_analytics_24_3'

# If the specified table doesn't exist yet, it will be created
# With 'replace', your data will be replaced if the table already exists.
# This may take some time ...

# Write records stored in a dataframe to SQL database
# (identity comparison is the reliable way to test for None)
if engine is not None:
    try:
        combined_df.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schema that was defined earlier
                        index=False, # Do not write the DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling: psycopg2.DatabaseError derives from Exception, so one
    # handler covers database errors as well; the message is printed and the
    # engine is reset so later checks see that the upload failed.
    except Exception as error:
        print(error)
        engine = None

In [None]:
from sql_functions import get_dataframe

In [None]:
# Read the uploaded 2011 flights table back and look at the first 20 rows
flights_2011_query = '''select * 
                   from cgn_analytics_24_3.flights_oct_nov_2011_sandy'''
df = get_dataframe(flights_2011_query)

df.head(20)

In [None]:
def get_airport_data():
    """Return the 2011 flights that depart from the eight analysed airports.

    Rows are read from cgn_analytics_24_3.flights_oct_nov_2011_sandy with
    flight_date between 2011-10-01 and 2011-12-01 (both bounds inclusive).
    """
    airports = ['JFK', 'LGA', 'EWR', 'PHL', 'BOS', 'DCA', 'IAD', 'BWI']
    start_date = '2011-10-01'
    end_date = '2011-12-01'

    # Quote every code and join them into the body of the IN (...) list
    quoted_codes = [f"'{airport}'" for airport in airports]
    airport_list = ', '.join(quoted_codes)

    query = f'''
    SELECT *
    FROM cgn_analytics_24_3.flights_oct_nov_2011_sandy
    WHERE origin IN ({airport_list})
      AND flight_date >= '{start_date}'
      AND flight_date <= '{end_date}'
    '''

    # Run the query and hand the resulting dataframe back
    return get_dataframe(query)

# Fetch the flights and print them
airport_data = get_airport_data()
print(airport_data)

### Plotting with separated airports - 2011

In [None]:
def get_airport_data():
    """Return the 2011 flights that depart from the eight analysed airports.

    Rows are read from cgn_analytics_24_3.flights_oct_nov_2011_sandy with
    flight_date between 2011-10-01 and 2011-12-01 (both bounds inclusive).
    """
    # Define the airports and date range
    airports = ['JFK', 'LGA', 'EWR', 'PHL', 'BOS', 'DCA', 'IAD', 'BWI']
    start_date = '2011-10-01'
    end_date = '2011-12-01'

    # Construct the SQL query
    query = f'''
    SELECT *
    FROM cgn_analytics_24_3.flights_oct_nov_2011_sandy
    WHERE origin IN ({', '.join([f"'{airport}'" for airport in airports])})
      AND flight_date >= '{start_date}'
      AND flight_date <= '{end_date}'
    '''

    # Execute the query and fetch the results into a dataframe
    df_airport = get_dataframe(query)

    return df_airport

def get_weather_data():
    """Return date, wind speed and airport code from the 2011 weather table.

    airport_code is selected as well so the readings can be told apart per
    airport (needed for the per-airport plot and the per-airport merge).
    """
    # Define date range
    start_date = '2011-10-01'
    end_date = '2011-12-01'

    # Construct the SQL query
    query = f'''
    SELECT date, wspd, airport_code
    FROM cgn_analytics_24_3.weather_data_2011
    WHERE date >= '{start_date}'
      AND date <= '{end_date}'
    '''

    # Execute the query and fetch the results into a dataframe
    df_weather = get_dataframe(query)

    return df_weather

# Call the functions to get the dataframes
airport_data = get_airport_data()
weather_data = get_weather_data()

# Use one datetime type on both sides of the join and on the plot axis
airport_data['flight_date'] = pd.to_datetime(airport_data['flight_date'])
weather_data['date'] = pd.to_datetime(weather_data['date'])

# Match every flight with the weather at its own origin airport on that day.
# Joining on the date alone would pair each flight with the readings of all
# airports and multiply the rows.
merged_data = pd.merge(airport_data, weather_data,
                       left_on=['flight_date', 'origin'],
                       right_on=['date', 'airport_code'])

# The weather was requested with metric units and is plotted elsewhere in
# km/h, while the threshold line and axis label of this chart are in knots;
# convert so the data and the 30 kts line share one unit.
KMH_PER_KNOT = 1.852
weather_data['wspd_kts'] = weather_data['wspd'] / KMH_PER_KNOT

# Plot the 2011 wind speed per airport (the 2011 frame loaded above, not the
# 2012 df_weather frame from the earlier section).
plt.figure(figsize=(14, 7))
sns.lineplot(data=weather_data, x='date', y='wspd_kts', hue='airport_code', marker='o')
plt.axhline(y=30, color='r', linestyle='--', label='30 kts')
plt.title('Wind Speed Oct - Nov 2011')
plt.xlabel('Date')
plt.ylabel('Wind Speed (kts)')
dates = weather_data['date'].drop_duplicates().sort_values().to_numpy()
plt.xticks(dates[::2], rotation=45)
plt.legend(title='Airport Code', fontsize='small')
# The x axis holds dates, so the annotation is anchored on the 21st / 19th
# plotted day instead of the raw numbers 20 / 18, which lie far outside the
# visible date range.
if len(dates) > 20:
    plt.annotate('Take off prohibited', xy=(dates[20], 30), xytext=(dates[18], 36),
                 arrowprops=dict(facecolor='red', shrink=0.05))
plt.show()

### Plotting with an average for all 8 airports combined - 2011

In [None]:
# Daily cancellation percentage for all eight airports combined (2011),
# shown together with the daily mean wind speed.
# get_airport_data() and get_weather_data() are defined in an earlier cell.

# Call the functions to get the dataframes
airport_data = get_airport_data()
weather_data = get_weather_data()

# Reduce the weather readings to one mean wind speed per day before merging,
# so each flight is matched with exactly one weather row.
daily_wind = weather_data.groupby('date', as_index=False)['wspd'].mean()

# Merge the dataframes on the common date
merged_data = pd.merge(airport_data, daily_wind, left_on='flight_date', right_on='date')

# Filter the merged data to include only the specified airports
selected_airports = ['JFK', 'LGA', 'EWR', 'PHL', 'BOS', 'DCA', 'IAD', 'BWI']
filtered_data = merged_data[(merged_data['origin'].isin(selected_airports)) | (merged_data['dest'].isin(selected_airports))]

# Overall share of cancelled flights across the whole period (a fraction, not a percentage)
average_cancellations = filtered_data['cancelled'].mean()

# Share of cancelled flights per day, in percent. This is the series the
# title and y label describe; previously only the wind speed was drawn.
daily_cancellations = filtered_data.groupby('date')['cancelled'].mean().mul(100)

fig, ax_cancel = plt.subplots(figsize=(14, 7))
cancel_line, = ax_cancel.plot(daily_cancellations.index, daily_cancellations.values,
                              marker='o', label='Cancelled flights (%)')
plt.title('Average Cancellations for JFK, LGA, EWR, PHL, BOS, DCA, IAD, BWI combined')
plt.xlabel('Date')
plt.ylabel('Average Cancellation Percentage')
dates = weather_data['date'].unique()
plt.xticks(dates[::2], rotation=45)

# Wind speed keeps its own scale on a secondary y axis
ax_wind = ax_cancel.twinx()
wind_line, = ax_wind.plot(daily_wind['date'], daily_wind['wspd'],
                          marker='o', color='tab:orange', label='Avg wind speed (km/h)')
ax_wind.set_ylabel('Average Wind Speed (km/h)')

# Explicit handles give the legend entries from both axes
ax_cancel.legend(handles=[cancel_line, wind_line],
                 title='Average Airport Cancellations', fontsize='small', loc='right')
plt.show()