In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load dataset
flights_data = pd.read_csv(r"C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\flights_airport_iata.csv", low_memory=False)
weather_data = pd.read_csv(r"C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\weather_iata.csv", low_memory=False)

In [3]:
print(flights_data.columns)

Index(['day_of_week', 'date', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_iata', 'origin_city', 'dest_iata',
       'dest_city', 'crs_dep_time', 'dep_time', 'taxi_out', 'wheels_off',
       'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'cancelled',
       'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time',
       'flights', 'distance', 'distance_group', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'origin_state', 'dest_state', 'iata', 'latitude', 'longitude',
       'iata_dest', 'airport_name_dest', 'latitude_dest', 'longitude_dest',
       'state_abbr_dest', 'airport_ref', 'airport_ident', 'type_of_airport',
       'airport_name', 'elevation_ft', 'origin_state.1', 'municipality',
       'scheduled_service', 'unique_id', 'id', 'length_ft', 'width_ft',
       'surface', 'lighted', 'closed', 'le_ident', 'le_displaced_threshold_ft',
       'he_ident', 'he_displaced_threshold_ft'],
      

In [4]:
print(weather_data.columns)

Index(['latitude', 'longitude', 'elevation', 'date', 'prcp', 'snow', 'snwd',
       'tmax', 'tmin', 'tobs', 'city', 'state_abbr_x', 'iata', 'airport_name',
       'state_abbr_y'],
      dtype='object')


In [5]:
# Rename columns in flights_data
flights_data = flights_data.rename(columns={
    'latitude': 'origin_latitude',
    'longitude': 'origin_longitude',
    'latitude_dest': 'dest_latitude',
    'longitude_dest': 'dest_longitude'
})

# Check the updated columns to confirm the changes
print(flights_data.columns)


Index(['day_of_week', 'date', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_iata', 'origin_city', 'dest_iata',
       'dest_city', 'crs_dep_time', 'dep_time', 'taxi_out', 'wheels_off',
       'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'cancelled',
       'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time',
       'flights', 'distance', 'distance_group', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'origin_state', 'dest_state', 'iata', 'origin_latitude',
       'origin_longitude', 'iata_dest', 'airport_name_dest', 'dest_latitude',
       'dest_longitude', 'state_abbr_dest', 'airport_ref', 'airport_ident',
       'type_of_airport', 'airport_name', 'elevation_ft', 'origin_state.1',
       'municipality', 'scheduled_service', 'unique_id', 'id', 'length_ft',
       'width_ft', 'surface', 'lighted', 'closed', 'le_ident',
       'le_displaced_threshold_ft', 'he_ident', 'he_displaced_threshol

In [90]:
# flights_data should contain at least 'origin_iata', 'dest_iata', and 'date'
# weather_data should contain 'iata', 'date', and weather columns
# Merge for origin weather data
origin_data = flights_data.merge(
    weather_data,
    left_on=['date', 'origin_iata'],
    right_on=['date', 'iata'],
    how='left',
    suffixes=('', '_origin')
)

# Check the columns after the merge
print("After merging with origin weather:")
print(combined_data.columns)

After merging with origin weather:
Index(['day_of_week', 'date', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_iata', 'origin_city', 'dest_iata',
       'dest_city', 'crs_dep_time', 'dep_time', 'taxi_out', 'wheels_off',
       'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'cancelled',
       'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time',
       'flights', 'distance', 'distance_group', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'origin_state', 'dest_state', 'iata', 'origin_latitude',
       'origin_longitude', 'iata_dest', 'airport_name_dest', 'dest_latitude',
       'dest_longitude', 'state_abbr_dest', 'airport_ref', 'airport_ident',
       'type_of_airport', 'airport_name', 'elevation_ft', 'origin_state.1',
       'municipality', 'scheduled_service', 'unique_id', 'id', 'length_ft',
       'width_ft', 'surface', 'lighted', 'closed', 'le_ident',
       'le_displaced_threshold_ft',

In [91]:
# Merge for destination weather data
combined_data = origin_data.merge(
    weather_data,
    left_on=['date', 'dest_iata'],
    right_on=['date', 'iata'],
    how='left',
    suffixes=('', '_dest')
)

# Check the columns after the second merge
print("After merging with destination weather:")
print(combined_data.columns)


After merging with destination weather:
Index(['day_of_week', 'date', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_iata', 'origin_city', 'dest_iata',
       'dest_city', 'crs_dep_time', 'dep_time', 'taxi_out', 'wheels_off',
       'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'cancelled',
       'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time',
       'flights', 'distance', 'distance_group', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'origin_state', 'dest_state', 'iata', 'origin_latitude',
       'origin_longitude', 'iata_dest', 'airport_name_dest', 'dest_latitude',
       'dest_longitude', 'state_abbr_dest', 'airport_ref', 'airport_ident',
       'type_of_airport', 'airport_name', 'elevation_ft', 'origin_state.1',
       'municipality', 'scheduled_service', 'unique_id', 'id', 'length_ft',
       'width_ft', 'surface', 'lighted', 'closed', 'le_ident',
       'le_displaced_threshold

In [92]:

# Check for NaN rows if necessary
nan_rows = combined_data[combined_data['iata'].isnull()]
print(nan_rows[['date', 'origin_iata', 'dest_iata']])


Empty DataFrame
Columns: [date, origin_iata, dest_iata]
Index: []


In [93]:
unique_columns = []

# Set to track seen columns
seen = set()

# Process columns in chunks
chunk_size = 100  # Adjust this size based on your memory and performance
for start in range(0, combined_data.shape[1], chunk_size):
    end = min(start + chunk_size, combined_data.shape[1])
    chunk = combined_data.iloc[:, start:end]
    
    # Identify duplicates in the current chunk
    for col in chunk.columns:
        if col not in seen:
            unique_columns.append(col)
            seen.add(col)

# Create a new DataFrame with only unique columns
combined_data_cleaned = combined_data[unique_columns]

# Display the cleaned DataFrame's columns
print("Columns after removing duplicate columns:")
print(combined_data_cleaned.columns)


MemoryError: Unable to allocate 1.03 GiB for an array with shape (9, 15332060) and data type float64

In [None]:

# Keep only the unique columns
combined_data_cleaned = combined_data.loc[:, ~duplicate_columns]

# Display the cleaned DataFrame's columns
print("Columns after removing duplicate columns:")
print(combined_data_cleaned.columns)

In [None]:
# Remove duplicate rows from the DataFrame
combined_data_cleaned = combined_data_cleaned.drop_duplicates()

# Display the shape of the cleaned DataFrame to check for changes
print("Shape of DataFrame after removing duplicate rows:")
print(combined_data_cleaned.shape)

# Optionally, display the first few rows to verify
print("First few rows of the cleaned DataFrame:")
print(combined_data_cleaned.head())

In [None]:
combined_data = combined_data_cleaned

In [None]:
# Inspect the data

In [None]:
print(combined_data.shape)
print(weather_data.shape())
print(flight_data.shape())

In [None]:
print(combined_data.head())

In [None]:
# Check for missing values
print(combined_data.isnull().sum())

Flights Data:
Carrier Delay:
Imputation: If you believe the missing values might be due to missing reports rather than actual absence of delay, consider imputing these values with the average delay for that airline or flight.
Drop: If the missing values are too many and could skew your analysis, consider dropping the carrier_delay column if it’s not critical for your analysis.


Weather Data:
Snow and Snow Depth:
Imputation: You can fill in missing values with 0 (assuming no snow) or use interpolation or forward/backward filling methods based on surrounding data.
Drop: If a significant portion of your analysis requires snow data and the missing values are large, consider dropping those rows or the columns if they don’t contribute significantly to your analysis.

In [None]:
# Calculate the percentage of null values for each column
null_percentage_weather = weather_data.isnull().mean() * 100

# Filter to show only columns with null values
null_percentage_weather = null_percentage_weather[null_percentage_weather > 0]

# Display the result
print(null_percentage_weather)

In [None]:
# Calculate the percentage of null values for each column
null_percentage_flights = combined_data.isnull().mean() * 100

# Filter to show only columns with null values
null_percentage_flights = null_percentage_flights[null_percentage_flights > 0]

# Display the result
print(null_percentage_flights)

In [None]:
# Check for duplicates
print("Number of combined_data duplicates: " + str(combined_data.duplicated().sum()))


Investigating duplicate rows: By conducting these analyses, I am trying to identify whether there are any underlying issues with the data that could be contributing to the duplicates and gain a clearer understanding of the variability present in the dataset.

In [None]:
# Identifying outliers

Q1 = flights_weather_df['total_delay_time'].quantile(0.25)
Q3 = flights_weather_df['total_delay_time'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers
outliers = flights_weather_df[(flights_weather_df['total_delay_time'] < lower_bound) | (flights_weather_df['total_delay_time'] > upper_bound)]

plt.figure(figsize=(12, 6))
sns.boxplot(x=flights_data['total_delay_time'])
plt.title('Box Plot of Total Delay Time')
plt.axvline(0, color='red', linestyle='--')  # Line at zero for reference
plt.show()

print("Outliers:")
print(outliers)


In [None]:
# Feature engineering

# Create new features
combined_data['total_delay_time'] = combined_data['actual_elapsed_time'] - combined_data['crs_elapsed_time']
combined_data['is_weekend'] = combined_data['day_of_week'].isin([5, 6]).astype(int)  # Saturday and Sunday

# Assuming crs_arr_time and arr_time are in HHMM format, convert them to minutes
combined_data['crs_arr_time'] = combined_data['crs_arr_time'] // 100 * 60 + combined_data['crs_arr_time'] % 100
combined_data['arr_time'] = combined_data['arr_time'] // 100 * 60 + combined_data['arr_time'] % 100

# Calculate arrival delay
combined_data['arrival_delay'] = combined_data['arr_time'] - combined_data['crs_arr_time']

# Assuming crs_dep_time and dep_time are also in HHMM format, convert them similarly
combined_data['crs_dep_time'] = combined_data['crs_dep_time'] // 100 * 60 + combined_data['crs_dep_time'] % 100
combined_data['dep_time'] = combined_data['dep_time'] // 100 * 60 + combined_data['dep_time'] % 100

# Calculate departure delay
combined_data['departure_delay'] = combined_data['dep_time'] - combined_data['crs_dep_time']

# Set the plot style
sns.set(style='whitegrid')

# Create subplots for arrival and departure delays
fig, axes = plt.subplots(2, 1, figsize=(12, 12))

# Plotting Arrival Delays
sns.histplot(combined_data['arrival_delay'], bins=50, kde=True, color='blue', ax=axes[0])
axes[0].set_title('Distribution of Arrival Delays', fontsize=16)
axes[0].set_xlabel('Arrival Delay (minutes)', fontsize=14)
axes[0].set_ylabel('Frequency', fontsize=14)
axes[0].axvline(x=0, color='red', linestyle='--', label='On-Time')
# Set the limits for the x-axis
plt.xlim(-25, 25)  # Adjust based on the distribution
axes[0].legend()

# Plotting Departure Delays
sns.histplot(combined_data['departure_delay'], bins=50, kde=True, color='orange', ax=axes[1])
axes[1].set_title('Distribution of Departure Delays', fontsize=16)
axes[1].set_xlabel('Departure Delay (minutes)', fontsize=14)
axes[1].set_ylabel('Frequency', fontsize=14)
axes[1].axvline(x=0, color='red', linestyle='--', label='On-Time')
# Set the limits for the x-axis
plt.xlim(-25, 25)  # Adjust based on the distribution
axes[1].legend()

# Show the plots
plt.tight_layout()
plt.show()

In [None]:
# View all duplicate rows
duplicate_rows = weather_data[weather_data.duplicated(keep=False)]
print(duplicate_rows)

In [None]:
# Count duplicates and sort by count
duplicate_counts = weather_data[weather_data.duplicated(keep=False)].groupby(weather_data.columns.tolist()).size().reset_index(name='count')
duplicate_counts = duplicate_counts.sort_values(by='count', ascending=False)
print(duplicate_counts)

In [None]:
print(duplicate_rows[['date', 'iata', 'prcp', 'tmax', 'tmin']].head(20))


In [None]:
print(weather_data.dtypes)

In [None]:
print(duplicate_rows.describe())

In [None]:
sns.boxplot(data=duplicate_rows, x='tmax')
plt.title('Distribution of Max Temperature in Duplicates')
plt.show()

sns.histplot(duplicate_rows['prcp'], bins=30)
plt.title('Histogram of Precipitation in Duplicates')
plt.show()

In [None]:

# Group the duplicates by the key columns (e.g., date, iata) and aggregate to see how values differ.
grouped_duplicates = duplicate_rows.groupby(['date', 'iata']).agg({'prcp': 'mean', 'tmax': 'mean', 'tmin': 'mean', 'snow': 'mean'}).reset_index()
print(grouped_duplicates)

In [None]:
# Check how many unique values exist for certain columns within the duplicates to see if there’s variability.
print(duplicate_rows.nunique())

In [None]:
# View all duplicate rows
duplicate_rows = flights_data[flights_data.duplicated(keep=False)]
print(duplicate_rows)

In [None]:
# # Identify which iata codes have the most variability in latitude and longitude.

# # Check the columns in duplicate_rows
# print(duplicate_rows.columns)

# # Ensure 'latitude' and 'longitude' are present
# if 'latitude' in duplicate_rows.columns and 'longitude' in duplicate_rows.columns:
#     # Group by IATA code and count unique latitude and longitude values
#     lat_lon_variability = duplicate_rows.groupby('origin_iata').agg({
#         'latitude': 'nunique',
#         'longitude': 'nunique'
#     }).reset_index()

#     # Rename columns for clarity
#     lat_lon_variability.columns = ['iata', 'unique_latitudes', 'unique_longitudes']

#     # Filter for IATA codes with more than one unique latitude or longitude
#     variability_filter = lat_lon_variability[(lat_lon_variability['unique_latitudes'] > 1) | 
#                                              (lat_lon_variability['unique_longitudes'] > 1)]

#     print("IATA codes with variability in latitude or longitude:")
#     print(variability_filter)
# else:
#     print("Columns 'latitude' or 'longitude' do not exist in the DataFrame.")


In [None]:
# Check if certain dates have more duplicates and how they vary in terms of weather conditions.
date_variability = duplicate_rows.groupby('date').size().reset_index(name='count')
print(date_variability[date_variability['count'] > 1])

In [None]:
duplicate_rows.columns


In [None]:
# # Create plots to visualize how tmax, tmin, and prcp vary over the dates for the same iata code.

# # Ensure 'date' is in datetime format
# duplicate_rows.loc[:, 'date'] = pd.to_datetime(duplicate_rows['date'])

# # Plot Max Temperature (tmax)
# plt.figure(figsize=(12, 6))
# sns.lineplot(data=duplicate_rows, x='date', y='tmax', hue='origin_iata', marker='o')
# plt.title('Max Temperature Over Time for Duplicate Entries')
# plt.xlabel('Date')
# plt.ylabel('Max Temperature (°F)')
# plt.xticks(rotation=45)
# plt.legend(title='IATA Code')
# plt.tight_layout()
# plt.show()



In [None]:

# # Plot Min Temperature (tmin)
# plt.figure(figsize=(12, 6))
# sns.lineplot(data=duplicate_rows, x='date', y='tmin', hue='origin_iata', marker='o')
# plt.title('Min Temperature Over Time for Duplicate Entries')
# plt.xlabel('Date')
# plt.ylabel('Min Temperature (°F)')
# plt.xticks(rotation=45)
# plt.legend(title='IATA Code')
# plt.tight_layout()
# plt.show()

# # Plot Precipitation (prcp)
# plt.figure(figsize=(12, 6))
# sns.lineplot(data=duplicate_rows, x='date', y='prcp', hue='origin_iata', marker='o')
# plt.title('Precipitation Over Time for Duplicate Entries')
# plt.xlabel('Date')
# plt.ylabel('Precipitation (inches)')
# plt.xticks(rotation=45)
# plt.legend(title='IATA Code')
# plt.tight_layout()
# plt.show()


In [None]:
# # Ensure that each iata code consistently points to the same airport name, latitude, longitude, and elevation.
# consistency_check = duplicate_rows.groupby('iata')[['latitude', 'longitude', 'elevation']].nunique()
# print(consistency_check)


In [None]:
# Handle Missing Values: fill or drop missing values (e.g., mean, median, mode, or dropping rows/columns)

# Drop duplicate rows, keeping the first occurrence
weather_data = weather_data.drop_duplicates(keep='first')

# Reset the index after dropping duplicates
weather_data.reset_index(drop=True, inplace=True)

# Display the cleaned DataFrame
print(weather_data.head())

In [None]:
# Drop duplicate rows, keeping the first occurrence
flights_data = flights_data.drop_duplicates(keep='first')

# Rreset the index after dropping duplicates
flights_data.reset_index(drop=True, inplace=True)

# Display the cleaned DataFrame
print(flights_data.head())

In [None]:
# Check for duplicates
print("Number of flights_data duplicates: " + str(flights_data.duplicated().sum()))
print("Number of weather_data duplicates: " + str(weather_data.duplicated().sum()))

Investigating missing values

In [None]:
# flight_data
# High missing values for carrier_delay, weather_delay, nas_delay, 
# security_delay, and late_aircraft_delay, but we are keeping 
# due to nature of information.
# Moderate Missing Values: tail_num, dep_time, taxi_out, etc.: 
# These have around 1-3% missing values. Filling these with 
# the mean or median, as they are likely to still provide valuable 
# information.

In [None]:
# Impute missing values for numerical variables using median
for col in ['dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 
            'arr_time', 'actual_elapsed_time', 'air_time']:
    flights_data[col] = flights_data[col].fillna(flights_data[col].median())

# Impute missing values for the categorical variable
flights_data['tail_num'] = flights_data['tail_num'].fillna(flights_data['tail_num'].mode()[0])

In [None]:
# Calculate the percentage of null values for each column
null_percentage_flights = flights_data.isnull().mean() * 100

# Filter to show only columns with null values
null_percentage_flights = null_percentage_flights[null_percentage_flights > 0]

# Display the result
print(null_percentage_flights)

In [None]:
# Look at how these delays correlate with each other
correlation_matrix = flights_data[['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].corr()
print(correlation_matrix)


Since the correlations are low, it implies that these types of delays do not influence each other significantly. For instance, an increase in carrier_delay does not correlate with an increase or decrease in weather_delay, nas_delay, etc.

Investigate Further: If you’re looking for potential factors influencing these delays, consider examining:

Time of year (seasonality)
Day of the week
Specific routes or airlines
Visual Analysis: Visualizing these relationships can also provide insights:

Scatter plots could help visualize the relationship between two delay types, even if correlations are low.
Boxplots can show the distribution of delays.

In [None]:
print(flights_data.columns)


In [None]:
# Check unique airports reporting any delays
delay_columns = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
airports_with_delays = flights_data[delay_columns].notnull().any(axis=1)
unique_airports = flights_data[airports_with_delays]['origin_iata'].unique() #origin airport
print(f"Unique airports reporting delays: {len(unique_airports)}")
print(unique_airports)


In [None]:
# Summarize null value counts for each delay type
null_summary = flights_data[delay_columns].isnull().sum()
print(null_summary)

# Group by the correct airport column and calculate the percentage of null values
null_percentage_by_airport = flights_data.groupby('origin_iata')[delay_columns].apply(lambda x: x.isnull().mean())
print(null_percentage_by_airport)


Summary of Null Values:

Each delay type has a total of 3,211,353 null values, which indicates that these delays are missing for a large portion of the dataset.

Percentage of Null Values by Airport:

The percentages for each delay type across various airports (identified by origin_iata) are consistent. This means that for each airport, the proportion of missing values for the delay types is very similar.

The consistent high percentage of null values for all delay types across different airports suggests a few possibilities:

Data Reporting Issues:

It’s possible that the dataset does not consistently report certain types of delays. If many flights are missing delay data, it could be that those types of delays are not applicable or not recorded for some flights.
Flight Types:

Certain types of flights (e.g., regional vs. international, or different airlines) may not report delay reasons uniformly. If a significant number of flights are not delayed for reasons recorded in these columns, the null values would be high.
Consistent Data Collection Methods:

If the data collection methods are consistent across all airports, this could also lead to similar null value patterns.

In [None]:
airline_mapping = {
    '9e': 'Envoy Air',
    'aa': 'American Airlines',
    'as': 'Alaska Airlines',
    'b6': 'JetBlue Airways',
    'dl': 'Delta Air Lines',
    'f9': 'Frontier Airlines',
    'g4': 'Allegiant Air',
    'ha': 'Hawaiian Airlines',
    'mq': 'Envoy Air',
    'nk': 'Spirit Airlines',
    'oh': 'Piedmont Airlines',
    'oo': 'SkyWest Airlines',
    'ua': 'United Airlines',
    'wn': 'Southwest Airlines',
    'yx': 'Republic Airways'
}


In [None]:
# Create a function to map codes to airport names
def map_airline_codes(codes):
    return airline_mapping.get(codes, 'Unknown Airline')

# apply function
flights_data['airline_name'] = flights_data['op_unique_carrier'].apply(map_airline_codes)

# View the updated DataFrame
print(flights_data[['op_unique_carrier', 'airline_name']].head())



In [None]:
# Group by airline, origin, and destination to count missing data
missing_data_routes = flights_data[flights_data[delay_columns].isnull().any(axis=1)].groupby(['airline_name', 'origin_iata', 'dest_iata']).size()

# Print the result
print(missing_data_routes)

In [None]:
# Sort results
missing_data_routes_sorted = missing_data_routes.sort_values(ascending=False)
print(missing_data_routes_sorted)

In [None]:
missing_data_routes_sorted.head(10).plot(kind='bar', figsize=(12, 6))
plt.title('Top 10 Routes with Missing Delay Data')
plt.xlabel('Airline, Origin, Destination')
plt.ylabel('Number of Missing Entries')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Total flights per airline
total_flights_per_airline = flights_data.groupby('airline_name').size()

# Missing data analysis
missing_data_analysis = flights_data[flights_data[delay_columns].isnull().any(axis=1)].groupby('airline_name').size()

# Calculate the percentage of missing data
missing_percentage = (missing_data_analysis / total_flights_per_airline) * 100

# Combine the results into a DataFrame for easier viewing
missing_data_summary = pd.DataFrame({
    'Total Flights': total_flights_per_airline,
    'Missing Data': missing_data_analysis,
    'Missing Percentage': missing_percentage
}).fillna(0)  # Fill NaN with 0 for airlines with no missing data

# Print the summary
print(missing_data_summary)

In [None]:
# Investigate relationships further
missing_data_routes = flights_data[flights_data[delay_columns].isnull().any(axis=1)].groupby(['airline_name', 'origin_iata', 'dest_iata']).size()
print(missing_data_routes)


In [None]:
missing_data_analysis.plot(kind='bar', figsize=(12, 6))
plt.title('Missing Delay Data by Airline')
plt.xlabel('Airlines')
plt.ylabel('Number of Missing Entries')
plt.xticks(rotation=45)
plt.show()

Summary of Missing Delay Data by Airline

United Airlines (ua) and Southwest Airlines (wn) have the highest counts of missing delay data, with 339,040 and 638,415 entries missing, respectively.
American Airlines (aa) and Delta Airlines (dl) also show significant numbers of missing data, with 492,133 and 478,777 entries missing.
Other airlines like Alaska Airlines (as) and JetBlue Airways (b6) have comparatively fewer missing entries.

Possible Implications:

Airline Reporting Practices:
Airlines with a high number of missing delay entries may have different reporting practices or may be less consistent in reporting specific delay reasons.

Flight Type Differences:
The airlines with fewer missing entries may have flight routes or operational practices that more consistently record delays.

Potential Data Gaps:
A large number of missing values could indicate that certain flights or routes (especially those operated by the airlines with high missing data) may have delays that are not being reported for some reason.



In [None]:
# investigate if there are specific routes or types of flights associated with these airlines that might also show a pattern in the missing data.

In [None]:
missing_data_routes = flights_data[flights_data[delay_columns].isnull().any(axis=1)].groupby(['airline_name', 'origin_iata', 'dest_iata']).size()
print(missing_data_routes)

In [None]:
# Aggregate missing data by airline and route
missing_data_summary = missing_data_routes.reset_index(name='missing_count')

# Sort by missing count to find the most affected routes
missing_data_summary = missing_data_summary.sort_values(by='missing_count', ascending=False)

# Display the top routes with missing data
print(missing_data_summary.head(20))

weather_data
High Missing Values: snow (64.83%) and snwd (67.21%): Since these 
represent snow-related data, they are considered critical to analysis.
tobs (94.10%): This column has very high missing values, but we are
leaving for now because it represents temperatures observed.
Moderate Missing Values: elevation, prcp, tmax, tmin:
These have around 4-5% missing values. Filling these with the mean or median.


In [None]:
# Identify whether there are specific routes consistently showing
# missing data and see if they correlate with certain characteristics 
# (like flight frequency, carrier performance, etc.).
# Plot top 10 routes with missing data
plt.figure(figsize=(12, 6))
sns.barplot(data=missing_data_summary.head(10), 
            x='missing_count', 
            y='origin_iata', 
            hue='airline_name')
plt.title('Top 10 Routes with Missing Delay Data by Airline')
plt.xlabel('Count of Missing Data')
plt.ylabel('Origin IATA')
plt.legend(title='Airline Name')
plt.show()

In [None]:
# Group by airline and calculate total missing data
airline_missing_counts = missing_data_summary.groupby('airline_name')['missing_count'].sum().reset_index()

# Sort and visualize airline performance regarding missing data
plt.figure(figsize=(12, 6))
sns.barplot(data=airline_missing_counts.sort_values(by='missing_count', ascending=False), 
            x='missing_count', 
            y='airline_name')
plt.title('Total Missing Delay Data by Airline')
plt.xlabel('Total Missing Count')
plt.ylabel('Airline Name')
plt.show()


In [None]:
# Investigate correlations between the missing data and other features, 
# like flight distance or time of year. 

# Merge missing data with other relevant features if available
merged_missing_data = flights_data.merge(missing_data_summary, 
                                          on=['airline_name', 'origin_iata', 'dest_iata'], 
                                          how='left')

# Analyze correlation with flight distance, for instance
plt.figure(figsize=(10, 6))
sns.scatterplot(data=merged_missing_data, 
                x='distance', 
                y='missing_count')
plt.title('Missing Data Count vs. Flight Distance')
plt.xlabel('Flight Distance')
plt.ylabel('Count of Missing Data')
plt.show()


In [None]:
# Calculate correlation
correlation = merged_missing_data['missing_count'].corr(merged_missing_data['distance'])

print(f'Correlation between missing data count and flight distance: {correlation}')


In [None]:
weather_data.columns

Dive deeper into the specific characteristics of the flights that are missing data, such as:

Flight times: Are these flights mostly at specific times of day?
Days of the week: Are there certain days that show higher rates of missing data?
Airline performance: How do these routes compare to others in terms of delays and cancellations?
This analysis will help you determine whether specific airlines or routes are associated with the missing data and identify any patterns that could inform further investigation or operational improvements.

In [None]:
# Compare data for tmax, tmin, and tobs in weather_data

# Check how many records exist for tmax, tmin, and 
# tobs and their missing values.
print(weather_data[['tmax', 'tmin', 'tobs']].isnull().sum())

# Count non-missing records
non_missing_counts = weather_data[['tmax', 'tmin', 'tobs']].count()
print(non_missing_counts)

In [None]:
# Identify Stations Reporting Patterns

# Group by station (iata) and count reports
station_counts = weather_data.groupby('iata')[['tmax', 'tmin', 'tobs']].count()
print(station_counts)


In [None]:
only_tmax_tmin = station_counts[station_counts['tobs'] == 0]
print(f"Stations reporting only tmax and tmin: {only_tmax_tmin.shape[0]}")

In [None]:
only_tobs = station_counts[station_counts[['tmax', 'tmin']].isnull().any(axis=1)]
print(f"Stations reporting only tobs: {only_tobs.shape[0]}")


In [None]:
# Analyze recording patterns
def recording_pattern(row):
    if pd.notnull(row['tmax']) and pd.notnull(row['tmin']) and pd.isnull(row['tobs']):
        return 'tmax_tmin_only'
    elif pd.isnull(row['tmax']) and pd.isnull(row['tmin']) and pd.notnull(row['tobs']):
        return 'tobs_only'
    elif pd.notnull(row['tmax']) and pd.notnull(row['tmin']) and pd.notnull(row['tobs']):
        return 'all_recorded'
    else:
        return 'other'

weather_data['recording_pattern'] = weather_data[['tmax', 'tmin', 'tobs']].apply(recording_pattern, axis=1)


In [None]:

# count recording patterns
pattern_counts = weather_data['recording_pattern'].value_counts()
print(pattern_counts)


In [None]:
#  bar chart to visualize the distribution of recording patterns

pattern_counts.plot(kind='bar', color='skyblue')
plt.title('Recording Patterns of Weather Stations')
plt.xlabel('Recording Pattern')
plt.ylabel('Number of Stations')
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.boxplot(data=weather_data[weather_data['recording_pattern'] != 'tobs_only'], x='recording_pattern', y='tmax')
plt.title('Comparison of tmax Across Recording Patterns')
plt.show()

sns.boxplot(data=weather_data[weather_data['recording_pattern'] != 'tobs_only'], x='recording_pattern', y='tmin')
plt.title('Comparison of tmin Across Recording Patterns')
plt.show()


In [None]:
# Drop tobs column
weather_data.drop(columns=['tobs'], inplace=True)
print(weather_data.head())

In [None]:
# Standardize column names
flights_data.columns = flights_data.columns.str.lower().str.replace(' ', '_')
weather_data.columns = weather_data.columns.str.lower().str.replace(' ', '_')


In [None]:
# Check how many records exist and their missing values.
print(weather_data.isnull().sum())

# Count non-missing records
non_missing_counts = weather_data.count()
print(non_missing_counts)

In [None]:
# Check how many records exist and their missing values.
print(flights_data.isnull().sum())

# Count non-missing records
non_missing_counts = flights_data.count()
print(non_missing_counts)

In [None]:
print(flights_data.head())

In [None]:
print(weather_data.head())

In [None]:
# Mapping dictionary
day_mapping = {
    1: 'Monday',
    2: 'Tuesday',
    3: 'Wednesday',
    4: 'Thursday',
    5: 'Friday',
    6: 'Saturday',
    7: 'Sunday'
}

# Convert numeric days to day names
flights_data['day_of_week'] = flights_data['day_of_week'].replace(day_mapping)

# Display the updated DataFrame
print(flights_data.head(3))

In [None]:
# Flight Delays: Analyze the distribution of flight delays.
sns.histplot(flights_data['actual_elapsed_time'] - flights_data['crs_elapsed_time'], bins=50)
plt.title('Distribution of Flight Delays')

# Set the limits for the x-axis
plt.xlim(-100, 100)  # Adjust based on the distribution

plt.xlabel('Delay (minutes)')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Weather Variables: Analyze temperature and precipitation.

sns.boxplot(x='tmax', data=weather_data)
plt.title('Max Temperature Distribution')
plt.show()

sns.boxplot(x='prcp', data=weather_data)
plt.title('Precipitation Distribution')
plt.show()


In [None]:
# Correlation between delays and weather:
combined_data = flights_data.merge(weather_data, left_on=['date', 'origin_iata'], right_on=['date', 'iata'], how='left')

# Calculate the delay
combined_data['delay'] = combined_data['actual_elapsed_time'] - combined_data['crs_elapsed_time'] 

# Create the scatterplot
sns.scatterplot(x='tmax', y='delay', data=combined_data)
plt.title('Flight Delay vs Max Temperature')
plt.xlabel('Max Temperature (°F)')
plt.ylabel('Delay (minutes)')
plt.show()




In [None]:
# Merge for flight data for origin
origin_data = flights_data.merge(
    weather_data,
    left_on=['date', 'origin_iata'],
    right_on=['date', 'iata'],
    how='left',
    suffixes=('', '_origin')
)

# Merge for flight data for destination
dest_data = flights_data.merge(
    weather_data,
    left_on=['date', 'dest_iata'],
    right_on=['date', 'iata'],
    how='left',
    suffixes=('', '_dest')
)

In [None]:
# Correlation between delays and weather:
combined_data = flights_data.merge(weather_data, left_on=['date', 'dest_iata'], right_on=['date', 'iata'], how='left')

# Calculate the delay
combined_data['delay'] = combined_data['actual_elapsed_time'] - combined_data['crs_elapsed_time'] 

# Create the scatterplot
sns.scatterplot(x='tmax', y='delay', data=combined_data)
plt.title('Flight Delay vs Max Temperature')
plt.xlabel('Max Temperature (°F)')
plt.ylabel('Delay (minutes)')
plt.show()

In [None]:
origin_data.columns

In [None]:
dest_data.columns

In [None]:

# can't merge dest_data and arrival_data due to memory errors
# Function to process and merge data for a range of dates


In [None]:
# Print common columns between origin_data and dest_data
common_columns = set(origin_data.columns) & set(dest_data.columns)
print(common_columns)

In [None]:
print(origin_data.duplicated().sum())
print(dest_data.duplicated().sum())

In [None]:
print(origin_data.columns)
print(origin_data.shape)

In [None]:
print(dest_data.columns)
print(dest_data.shape)

In [None]:
# Identify duplicate columns

# Create a function to identify columns with identical content
def get_duplicate_columns(df1, df2):
    duplicates = []
    for col1 in df1.columns:
        for col2 in df2.columns:
            if df1[col1].equals(df2[col2]):
                duplicates.append((col1, col2))
    return duplicates

# Get the list of duplicate columns
duplicate_columns = get_duplicate_columns(origin_data, dest_data)

# Step 2: Print duplicate columns for verification
print("Duplicate columns based on content:")
for orig_col, dest_col in duplicate_columns:
    print(f"Origin Column: {orig_col} | Destination Column: {dest_col}")


In [None]:
# Calculate delay for origin data
origin_data['delay'] = origin_data['actual_elapsed_time'] - origin_data['crs_elapsed_time']

# Keep only necessary columns: 'date', 'iata', and 'delay'
origin_delay = origin_data[['date', 'iata', 'delay']]

print(origin_delay)


In [None]:

# Calculate delay for destination data
dest_data['delay'] = dest_data['actual_elapsed_time_dest'] - dest_data['crs_elapsed_time_dest']

# Keep only necessary columns: 'date', 'iata', and 'delay'
dest_delay = dest_data[['date', 'iata', 'delay']]


In [None]:
print(dest_delay)


In [None]:
print(f"Length of origin_delay: {len(origin_data['delay'])}")
print(f"Length of dest_delay: {len(dest_data['delay'])}")


In [None]:
# Assuming origin_delay and dest_delay are Series with corresponding indexes
# Create a DataFrame with both delays
delay_data = pd.DataFrame({
    'origin_delay': origin_delay,
    'dest_delay': dest_delay
})

# Now filter the rows where the delays are not equal
mismatched_delays = delay_data[delay_data['origin_delay'] != delay_data['dest_delay']]

# Display the mismatched rows
print(mismatched_delays)


In [None]:
print(flights_data.columns)
print(flights_data.shape)

In [None]:
print(weather_data.columns)
print(weather_data.shape)

In [None]:
# Specify the columns to check for null values
columns_to_fill = ['prcp', 'tmax', 'tmin', 'elevation']

# Fill missing values with the median for each specified column
for col in columns_to_fill:
    median_value = weather_data[col].median()
    weather_data[col] = weather_data[col].fillna(median_value)

# Interpolate remaining null values (if any)
weather_data[columns_to_fill] = weather_data[columns_to_fill].interpolate()

# Print the count of null values in the specified columns
null_counts = weather_data[columns_to_fill].isnull().sum()

print("Null values in specified columns:")
print(null_counts)


In [None]:
weather_data.isnull().sum()

Missing Data in weather_data: If weather_data does not have entries for specific dates and corresponding iata codes, any flight from flights_data that matches those criteria will not find a match, leading to NaN values.

In [None]:
nan_rows = combined_data.isnull()
print(nan_rows)


In [None]:
print(flights_data['origin_iata'].unique())
print(weather_data['iata'].unique())
print(flights_data.shape)
print(weather_data.shape)


It seems there are iata codes in weather_data that do not match the origin_iata codes in flights_data. For instance, if flights_data has 'cvg' as origin_iata but weather_data does not have an entry for 'cvg' on that date, the merge will result in NaN for the iata column in the combined_data for those flights. The shapes indicate that flights_data has significantly more rows (4,009,949) than weather_data (139,012). This implies that many flights may not have corresponding weather data.

In [None]:
nan_rows = combined_data[combined_data['iata'].isnull()]
print(nan_rows[['date', 'origin_iata','dest_iata']])


In [None]:
# Select the specific columns to view
columns_of_interest = ['iata', 'origin_iata', 'dest_iata']
selected_data = combined_data[columns_of_interest]

# Display the selected columns
print(selected_data)

In [None]:
unmatched_iata = set(flights_data['origin_iata'].unique()) - set(weather_data['iata'].unique())
print(unmatched_iata)

In [None]:
missing_dates = set(flights_data['date']) - set(weather_data['date'])
print(missing_dates)

In [None]:
combined_data.head()

In [None]:
# Specify the columns to remove
columns_to_remove = ['iata', 'recording_pattern', 'latitude', 'longitude']

# Create a new DataFrame without the specified columns
cleaned_combined_data = combined_data.drop(columns=columns_to_remove)

# Optionally, check the first few rows of the new DataFrame
print(cleaned_combined_data.head())

In [None]:
cleaned_combined_data.columns

In [None]:
# Save the DataFrame to a CSV file
file_path = r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\cleaned_combined_data_v1.csv'

cleaned_combined_data.to_csv(file_path, index=False)

print(f"Data saved to {file_path}")


In [None]:
print(cleaned_combined_data.columns)

In [None]:
cleaned_combined_data.shape

In [None]:
# Rename to flights_weather_df
flights_weather_df = cleaned_combined_data

In [None]:
# Delays by days of week
# Calculate the delay
flights_weather_df['delay'] = flights_weather_df['actual_elapsed_time'] - flights_weather_df['crs_elapsed_time']

# Delays by days of week
sns.boxplot(x='day_of_week', y='delay', data=flights_weather_df)
plt.title('Flight Delays by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Delay (minutes)')
plt.show()


In [None]:
# Select only numeric columns for correlation
numeric_data = flights_weather_df.select_dtypes(include=[np.number])

# Compute the correlation matrix
corr = numeric_data.corr()

# Heatmap of correlation
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=False, cmap='coolwarm', cbar_kws={"shrink": .8})
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Flight delays by state


# Flight delays by state
avg_delay_by_state = flights_weather_df.groupby('origin_state')['delay'].mean().reset_index()

# Create the bar plot
sns.barplot(x='origin_state', y='delay', data=avg_delay_by_state)
plt.title('Average Flight Delay by State')
plt.xticks(rotation=45)
plt.ylabel('Average Delay (minutes)')
plt.xlabel('Origin State')
plt.show()



In [None]:
# # Flight delays by latitudes and longitudes

# flights_data['date'] = pd.to_datetime(flights_data['date'])

# # Create a season column
# def get_season(month):
#     if month in [12, 1, 2]:
#         return 'Winter'
#     elif month in [3, 4, 5]:
#         return 'Spring'
#     elif month in [6, 7, 8]:
#         return 'Summer'
#     else:
#         return 'Fall'

# flights_data['season'] = flights_data['date'].dt.month.apply(get_season)

# # Combine relevant data
# data_for_correlation = flights_data[['latitude', 'longitude', 'arrival_delay', 'departure_delay', 'season']]

# # Correlation Analysis
# # Calculate correlation coefficients by season
# correlation_results = data_for_correlation.groupby('season').corr().reset_index()

# # Extracting the relevant correlation data
# arrival_corr = correlation_results[correlation_results['level_1'].isin(['arrival_delay', 'latitude', 'longitude'])]
# departure_corr = correlation_results[correlation_results['level_1'].isin(['departure_delay', 'latitude', 'longitude'])]

# # Visualization
# plt.figure(figsize=(14, 6))

# # Scatter plot for Arrival Delay vs Latitude
# plt.subplot(1, 2, 1)
# sns.scatterplot(data=flights_data, x='latitude', y='arrival_delay', hue='season', alpha=0.7)
# plt.title('Arrival Delay vs Latitude by Season')
# plt.axhline(0, color='red', linestyle='--')
# plt.xlabel('Latitude')
# plt.ylabel('Arrival Delay (minutes)')

# # Scatter plot for Departure Delay vs Latitude
# plt.subplot(1, 2, 2)
# sns.scatterplot(data=flights_data, x='latitude', y='departure_delay', hue='season', alpha=0.7)
# plt.title('Departure Delay vs Latitude by Season')
# plt.axhline(0, color='red', linestyle='--')
# plt.xlabel('Latitude')
# plt.ylabel('Departure Delay (minutes)')

# plt.tight_layout()
# plt.show()

# # Print correlation coefficients
# # calculate correlations between delays and latitude/longitude. 
# # The results are grouped by season to see how the relationships change.
# print("Arrival Delay Correlation with Latitude:")
# print(arrival_corr)
# print("\nDeparture Delay Correlation with Latitude:")
# print(departure_corr)

Some seasonal correlations are slightly positive (e.g., Summer Arrival Delay: 0.042, Winter Departure Delay: 0.009668), but these values are still weak.

Similar to latitude, longitude also shows low correlation with delays. Most values are close to zero, indicating no significant relationship.

Explore other factors that might impact delays, such as weather conditions (precipitation, snow), airport traffic, or operational factors (e.g., carrier delays).

Consider using statistical models (like linear regression) to analyze the influence of various factors, including latitude, longitude, and weather, on flight delays.

In [None]:
# Assuming 'date' is in your DataFrame and is in string format
flights_weather_df['date'] = pd.to_datetime(flights_weather_df['date'])

# Define a function to get the season based on the month
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'winter'
    elif month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    else:
        return 'fall'

# Apply the function to create the season column
flights_weather_df['season'] = flights_weather_df['date'].apply(get_season)

# Now sample the data again
sampled_data = flights_weather_df.sample(frac=0.1, random_state=1)

# Scatter plot for Arrival Delay vs Latitude (using sampled data)
plt.subplot(1, 2, 1)
sns.scatterplot(data=sampled_data, x='dest_latitude', y='arrival_delay', hue='season', alpha=0.7)
plt.title('Arrival Delay vs Latitude by Season (Sampled)')
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Latitude')
plt.ylabel('Arrival Delay (minutes)')

# Remove legend
plt.legend([], [], frameon=False)

# Scatter plot for Departure Delay vs Latitude (using sampled data)
plt.subplot(1, 2, 2)
sns.scatterplot(data=sampled_data, x='origin_latitude', y='departure_delay', hue='season', alpha=0.7)
plt.title('Departure Delay vs Latitude by Season (Sampled)')
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Latitude')
plt.ylabel('Departure Delay (minutes)')

# Remove legend
plt.legend([], [], frameon=False)

plt.tight_layout()
plt.show()

In [None]:
# Drop non-numeric columns that aren't needed for correlation
flights_weather_df_numeric = flights_weather_df.select_dtypes(include=[np.number])


In [None]:
correlation_matrix = flights_weather_df_numeric.corr()
print(correlation_matrix[['arrival_delay', 'departure_delay']])


Wheels On & Arrival Delay: The strongest correlation with arrival_delay is with wheels_on (0.276). This indicates that as the wheels-on time increases, arrival delays may also increase.
Departure Delay & Departure Time: There’s a notable positive correlation (0.245) between dep_time and departure_delay, suggesting that later departure times are associated with greater delays.
Arrival Delay and Departure Delay: There’s a moderate correlation (0.156) between arrival_delay and departure_delay, which is expected since delays often propagate through a flight schedule.
Weather Variables: The correlations with weather variables (prcp, snow, snwd, tmax, tmin) are quite low, indicating that weather may not have a significant impact on delays in this dataset.
Other Factors: Variables like crs_dep_time, crs_arr_time, and total_delay_time have negative correlations with delays, which might suggest timing discrepancies play a role.Visualize Relationships: Create scatter plots or pair plots to visualize relationships between key variables, especially those with stronger correlations. This can help identify any non-linear patterns or clusters.

Feature Engineering: Consider creating new features based on the existing ones. For example, you could create interaction terms between departure and arrival times, or consider categorizing times into "early," "on-time," and "late."

Modeling: If you're interested in predicting delays, consider building regression models using arrival_delay and departure_delay as your target variables. Use features with significant correlations as predictors.

Explore Seasonal Effects: Given that delays might vary by season, consider analyzing delays over time, or create a model that includes seasonal effects if you haven’t already.

Handle Missing Values: Check if any of the features with NaN values (like flights) might need imputation or removal from the analysis, as they can affect model performance.

Group Analysis: Consider grouping by other categorical variables like origin_state, dest_state, or carrier to see if certain airlines or routes consistently perform better or worse in terms of delays.

Evaluate Multicollinearity: Since some features might be correlated with each other, it may be worth investigating multicollinearity, especially if you proceed to modeling.

In [None]:
flights_weather_df['season'] = flights_weather_df['date'].dt.month % 12 // 3 + 1
# Mapping months to seasons
season_mapping = {
    1: 'winter',
    2: 'spring',
    3: 'summer',
    4: 'fall'
}
flights_weather_df['season'] = flights_weather_df['season'].map(season_mapping)

In [None]:
seasonal_delays = flights_weather_df.groupby('season')[['arrival_delay', 'departure_delay']].mean()
print(seasonal_delays)


Analysis of Seasonal Delays

Fall: Average arrival delay of about -12.40 minutes and a departure delay of about 2.84 minutes.
Spring: Average arrival delay of about -17.37 minutes, with a slightly higher departure delay.
Summer: The highest average arrival delay of about -24.16 minutes, but the departure delay is lower than in Spring.
Winter: Similar to Fall, with an average arrival delay of about -12.01 minutes and a higher departure delay than Fall.

In [None]:
seasonal_delays.plot(kind='bar', figsize=(10, 6))
plt.title('Average Arrival and Departure Delays by Season')
plt.ylabel('Delay (minutes)')
plt.xticks(rotation=45)
plt.axhline(0, color='red', linestyle='--')
plt.legend(title='Delay Type')
plt.show()

Compare the seasonal delays with other factors such as weather conditions or day of the week to identify any correlations.
Investigate if the delays are statistically significant between seasons using ANOVA or similar statistical tests.
Look into specific flights or routes that tend to have more significant delays in certain seasons.
Consider analyzing potential reasons for the delays, like weather conditions or operational issues during specific times.

In [None]:
# # Print correlation coefficients
# # calculate correlations between delays and latitude/longitude. 
# # The results are grouped by season to see how the relationships change.


In [None]:
flights_weather_df.columns

In [None]:
# Scatter plots show the relationship between elevation and both arrival 
# and departure delays, colored by season. This helps visualize any patterns.

In [None]:
# Function to plot delays vs elevation
def plot_delay_vs_elevation(data):
    plt.figure(figsize=(14, 6))

    # Arrival Delay
    plt.subplot(1, 2, 1)
    sns.scatterplot(data=data, x='elevation', y='arrival_delay', hue='season', alpha=0.7)
    plt.title('Arrival Delay vs Elevation by Season')
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Elevation (feet)')
    plt.ylabel('Arrival Delay (minutes)')

    # Departure Delay
    plt.subplot(1, 2, 2)
    sns.scatterplot(data=data, x='elevation', y='departure_delay', hue='season', alpha=0.7)
    plt.title('Departure Delay vs Elevation by Season')
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Elevation (feet)')
    plt.ylabel('Departure Delay (minutes)')

    plt.tight_layout()
    plt.show()

# Call the plotting function with the correct DataFrame
plot_delay_vs_elevation(flights_weather_df)

# Calculate Correlation Coefficients
elevation_delay_corr = flights_weather_df.groupby('season')[['elevation', 'arrival_delay', 'departure_delay']].corr()
elevation_delay_corr = elevation_delay_corr.reset_index()

# Extracting relevant correlation data
arrival_elevation_corr = elevation_delay_corr[elevation_delay_corr['level_1'] == 'arrival_delay']
departure_elevation_corr = elevation_delay_corr[elevation_delay_corr['level_1'] == 'departure_delay']

# Print correlation coefficients
print("Arrival Delay Correlation with Elevation:")
print(arrival_elevation_corr[['season', 'elevation', 'arrival_delay', 'level_1']])
print("\nDeparture Delay Correlation with Elevation:")
print(departure_elevation_corr[['season', 'elevation', 'departure_delay', 'level_1']])


In [None]:
# Flight delays vs elevation analysis
# Calculate Correlation Coefficients
# Function to plot delays vs elevation
def plot_delay_vs_elevation(data):
    plt.figure(figsize=(14, 6))

    # Arrival Delay
    plt.subplot(1, 2, 1)
    sns.scatterplot(data=data, x='elevation', y='arrival_delay', hue='season', alpha=0.7)
    plt.title('Arrival Delay vs Elevation by Season')
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Elevation (feet)')
    plt.ylabel('Arrival Delay (minutes)')

    # Departure Delay
    plt.subplot(1, 2, 2)
    sns.scatterplot(data=data, x='elevation', y='departure_delay', hue='season', alpha=0.7)
    plt.title('Departure Delay vs Elevation by Season')
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Elevation (feet)')
    plt.ylabel('Departure Delay (minutes)')

    plt.tight_layout()
    plt.show()

# Call the plotting function with the correct DataFrame
plot_delay_vs_elevation(flights_weather_df)

# Calculate Correlation Coefficients
elevation_delay_corr = flights_weather_df.groupby('season')[['elevation', 'arrival_delay', 'departure_delay']].corr()
elevation_delay_corr = elevation_delay_corr.reset_index()

# Extracting relevant correlation data
arrival_elevation_corr = elevation_delay_corr[elevation_delay_corr['level_1'] == 'arrival_delay']
departure_elevation_corr = elevation_delay_corr[elevation_delay_corr['level_1'] == 'departure_delay']

# Print correlation coefficients
print("Arrival Delay Correlation with Elevation:")
print(arrival_elevation_corr[['season', 'elevation', 'arrival_delay', 'level_1']])
print("\nDeparture Delay Correlation with Elevation:")
print(departure_elevation_corr[['season', 'elevation', 'departure_delay', 'level_1']])


In [None]:
# 
# Ensure 'date' column is in datetime format
flights_weather_df['date'] = pd.to_datetime(flights_weather_df['date'], errors='coerce')


In [None]:

# Now, check if the columns exist and rename if necessary
# Extract relevant columns after merging
# Assuming the destination weather columns are suffixed with '_dest'

# Print columns
print("Columns in flights_weather_df:")
print(flights_weather_df.columns)

In [None]:
# Identifying outliers

Q1 = flights_weather_df['total_delay_time'].quantile(0.25)
Q3 = flights_weather_df['total_delay_time'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers
outliers = flights_weather_df[(flights_weather_df['total_delay_time'] < lower_bound) | (flights_weather_df['total_delay_time'] > upper_bound)]

plt.figure(figsize=(12, 6))
sns.boxplot(x=flights_data['total_delay_time'])
plt.title('Box Plot of Total Delay Time')
plt.axvline(0, color='red', linestyle='--')  # Line at zero for reference
plt.show()

print("Outliers:")
print(outliers)


In [None]:
# Check the data types of the columns
print(flights_weather_df.dtypes)

For All Flights

Total Entries: Approximately 4 million records.

Total Delay Time: The mean total delay time is negative (-5.32 minutes), indicating that, on average, flights arrived earlier than scheduled.
Arrival and Departure Delays: Both have negative means (-16.59 for arrival delay), showing that many flights may have arrived early.
Delay Variability: The standard deviations for delays are high, particularly for arrival_delay (191.32) and departure_delay (97.96), suggesting significant variability in delays.


For Outliers

Total Entries: Around 190,000 records identified as outliers.
Total Delay Time: The mean total delay time is positive (22.83 minutes), indicating that these flights were delayed on average.
Arrival Delay: The mean arrival delay is negative (-14.49 minutes), suggesting these flights still arrived early despite being categorized as outliers.
Departure Delay: The mean departure delay is positive (13.57 minutes), indicating that these flights were delayed in their takeoff.


Analyzing the Differences

Flight Performance: The fact that outliers have a higher mean total delay time while having early arrival times indicates that these flights likely had significant delays during departure.
Delay Types: The outliers have more pronounced variability in departure_delay, suggesting some flights experienced extreme delays compared to the rest.

Cancellation and Diversion:

Both cancelled and diverted columns show only 0.0, indicating that there are no cancellations or diversions in your dataset. Remove from Dataset.

In [None]:
# Drop the 'cancelled' and 'diverted' columns from the DataFrame
flights_weather_df.drop(columns=['cancelled', 'diverted'], inplace=True)

# Verify that the columns have been removed
print(flights_weather_df.columns)

In [None]:
plt.figure(figsize=(12, 6))
sns.scatterplot(x='arrival_delay', y='total_delay_time', data=flights_weather_df, alpha=0.6)
plt.scatter(outliers['arrival_delay'], outliers['total_delay_time'], color='red', label='Outliers', alpha=0.7)
plt.axhline(0, color='black', linestyle='--')
plt.title('Total Delay Time vs Arrival Delay (Outliers in Red)')
plt.xlabel('Arrival Delay (minutes)')
plt.ylabel('Total Delay Time (minutes)')
plt.legend()
plt.show()


In [None]:
print("Summary Statistics for All Flights:")
print(flights_weather_df.describe())

print("\nSummary Statistics for Outliers:")
print(outliers.describe())

The Bureau of Transportation Statistics (BTS) typically provides flight data in a standardized format, and the times are generally recorded in Coordinated Universal Time (UTC). This means that all departure and arrival times are expressed in a single time zone, allowing for consistent comparison across different flights and time zones.

There should not be very many negative departure delays.

In [None]:
# Define a function to remove outliers based on IQR
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Columns to check for outliers
delay_columns = ['total_delay_time', 'arrival_delay', 'departure_delay']

# Remove outliers for each column
for column in delay_columns:
    flights_weather_df = remove_outliers(flights_weather_df, column)

# Optionally, check the number of remaining rows
print(f"Remaining rows after outlier removal: {len(flights_weather_df)}")

In [None]:
print("Summary Statistics for All Flights after removal:")
print(flights_weather_df.describe())

print("\nSummary Statistics for Outliers:")
print(outliers.describe())

Summary Statistics for All Flights (Post-Outlier Removal)

Arrival Delay: The mean is approximately -9.43 minutes, indicating that flights tend to arrive slightly early on average.
Departure Delay: The mean departure delay is -2.75 minutes, also suggesting early departures on average.

Weather Variables:
Average precipitation (prcp) is low (0.01298), and snow (snow) is minimal (mean 0.0187).
Maximum temperatures (tmax) and minimum temperatures (tmin) are reasonable, with maximums reaching around 112°F.

Summary Statistics for Outliers

Arrival Delay: The mean arrival delay for outliers is -14.45 minutes, which indicates that these flights tend to arrive even earlier than the overall dataset.
Departure Delay: Outliers show a significant positive mean departure delay of 13.58 minutes, indicating that these flights are more likely to be delayed when departing.

Weather Variables:
Outliers have higher average precipitation (0.02234) compared to the non-outlier flights, which may suggest that adverse weather conditions are affecting these flights.
Notable snow values (mean 0.1297) indicate that these flights might be more impacted by winter weather.

Key Comparisons
Delays: The presence of outliers is associated with higher departure delays, suggesting that these flights may face different operational challenges compared to the majority.
Weather Impact: Weather conditions appear to have a more significant effect on outlier flights, with greater precipitation and snow accumulation.


Further Analysis: Conduct additional analyses to understand the factors leading to delays in outlier flights—particularly looking into operational issues, weather patterns, and airline performance.
Data Visualization: Visualizing these statistics with box plots or histograms can help to better understand the distribution of delays and the impact of weather on flight performance.

In [None]:

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

Key Observations
Arrival Delay Correlations: There are very low correlations with elevation and precipitation. The strongest correlation (0.173481) is with departure delay. The temperature (tmax and tmin) shows a slight negative correlation, indicating that as temperatures rise, delays might slightly decrease.
Departure Delay Correlations: Similar to arrival delays, departure delays have low correlations with most weather-related variables.  The strongest correlation with other factors is also with arrival delays.
Snow and Precipitation: Snow (both at the origin and destination) shows some correlation with delays, particularly in the destination context. This suggests that snow could impact delays, even if the correlation is not very strong.
Elevation: Elevation has a very low correlation with both arrival and departure delays, suggesting that it may not be a significant factor in delay analysis for the dataset.

In [None]:
# Assuming 'flights_weather_df' is your DataFrame and it includes a column for airport identifiers
# Example: 'airport_code' could be the column that identifies airports

# Filter for non-zero snow or snwd values
reported_snow = flights_weather_df[flights_weather_df['snow'] > 0]
reported_snwd = flights_weather_df[flights_weather_df['snwd'] > 0]

# Get unique airport identifiers for both cases
unique_airports_snow = reported_snow['airport_code'].unique()
unique_airports_snwd = reported_snwd['airport_code'].unique()

# Combine unique airports from both lists
all_reported_airports = set(unique_airports_snow).union(set(unique_airports_snwd))

# Count the number of unique airports
num_reported_airports = len(all_reported_airports)

print(f"Number of airports reporting snow or snwd: {num_reported_airports}")


In [None]:
# Descriptive statistics for snow and snwd
snow_stats = flights_weather_df['snow_origin, snow'].describe()
snwd_stats = flights_weather_df['snwd'].describe()

print("Snow statistics:")
print(snow_stats)
print("Snow depth statistics (snwd):")
print(snwd_stats)



In [None]:
plt.figure(figsize=(14, 10))
heatmap = sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})
heatmap.set_title('Correlation Matrix of Flight Delays and Weather Variables', fontdict={'fontsize':18}, pad=12)
plt.show()

In [None]:
# Investigate whether a positive departure delay indicates a positive arrival delay
# Visualize the relationship between departure delay and arrival delay
plt.figure(figsize=(10, 6))
sns.scatterplot(data=flights_data, x='departure_delay', y='arrival_delay', alpha=0.6)
plt.title('Departure Delay vs Arrival Delay')
plt.axhline(0, color='red', linestyle='--', label='Zero Arrival Delay')  # Reference line for zero arrival delay
plt.axvline(0, color='blue', linestyle='--', label='Zero Departure Delay')  # Reference line for zero departure delay
plt.xlabel('Departure Delay (minutes)')
plt.ylabel('Arrival Delay (minutes)')
plt.legend()
plt.show()

# Calculate the correlation coefficient
correlation = flights_data['departure_delay'].corr(flights_data['arrival_delay'])
print(f"Correlation between Departure Delay and Arrival Delay: {correlation:.2f}")


While there is a slight tendency for flights that leave late to arrive late, other factors likely play a significant role in determining delays. This could include weather conditions, air traffic, or operational issues. 

Further Analysis:
Time of Day: Analyze if departure and arrival delays vary by time of day. Morning flights might behave differently than evening flights.
Airline Performance: Some airlines may have more consistent schedules than others. Analyze delays by airline.
Route Analysis: Look into specific routes (origin-destination pairs) to see if certain routes experience more delays.
Consider segmenting  analysis by different factors (e.g., season, weather conditions).
Investigate Outliers: Look for flights that have significantly higher or lower delays to identify patterns or anomalies.

A scatter plot of departure delay versus arrival delay could help visualize the relationship. Adding a regression line might clarify the trend.

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(data=full_data, x='departure_delay', y='arrival_delay', alpha=0.6)
plt.title('Departure Delay vs Arrival Delay')
plt.xlabel('Departure Delay (minutes)')
plt.ylabel('Arrival Delay (minutes)')
plt.axhline(0, color='red', linestyle='--')  # Reference line for zero arrival delay
plt.axvline(0, color='blue', linestyle='--')  # Reference line for zero departure delay
plt.show()


In [None]:
# Cluster Analysis:

from sklearn.cluster import KMeans
import numpy as np

# Prepare the data for clustering
X = full_data[['departure_delay', 'arrival_delay']].dropna()

# Fit K-Means with a specified number of clusters (e.g., 3)
kmeans = KMeans(n_clusters=3)
full_data['cluster'] = kmeans.fit_predict(X)

# Visualize the clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(data=full_data, x='departure_delay', y='arrival_delay', hue='cluster', palette='viridis', alpha=0.6)
plt.title('Clusters of Departure vs Arrival Delays')
plt.xlabel('Departure Delay (minutes)')
plt.ylabel('Arrival Delay (minutes)')
plt.axhline(0, color='red', linestyle='--')
plt.axvline(0, color='blue', linestyle='--')
plt.show()


In [None]:
# Calculate average delays for each cluster
average_delays = full_data.groupby('cluster').agg({
    'departure_delay': 'mean',
    'arrival_delay': 'mean',
    'elevation_origin': 'mean',  # You can add more columns if needed
    'prcp_origin': 'mean',
    'snow_origin': 'mean'
}).reset_index()

print("Average Delays by Cluster:")
print(average_delays)


Interpretation of Results
Cluster 0:

Departure Delay: Approximately 4.8 minutes
Arrival Delay: Approximately 2.9 minutes
Elevation: 176.7 feet
Precipitation: 0.098 inches
Snow: 0.037 inches
This cluster has relatively low average delays for both departure and arrival.

Cluster 1:

Departure Delay: Approximately -87.8 minutes (indicating flights are departing early on average)
Arrival Delay: Approximately -1314.4 minutes (indicating substantial early arrivals, which might need verification or further analysis)
Elevation: 113.4 feet
Precipitation: 0.174 inches
Snow: 0.053 inches
The negative values suggest that flights in this cluster are consistently arriving and departing significantly earlier than expected.

Cluster 2:

Departure Delay: Approximately -69.0 minutes
Arrival Delay: Approximately 1296.9 minutes (also indicating significant early arrivals)
Elevation: 98.0 feet
Precipitation: 0.104 inches
Snow: 0.078 inches
Similar to Cluster 1, this cluster shows a pattern of early departures but also high arrival delays, suggesting a potential outlier situation that warrants closer examination.

Next Steps for Analysis
Investigate Negative Delays:

Check if there are any data entry errors or outliers leading to these extreme negative values. Review the raw data for clusters 1 and 2.
Analyze Common Characteristics:

Continue with the common characteristics analysis to see if certain airlines, routes, or times of day are predominant in these clusters. This could help explain the delays observed.
Visualize Data:

Create visualizations (e.g., box plots or scatter plots) to better understand the distributions of delays within each cluster and identify any patterns.
Correlation Analysis:

Analyze how factors such as weather conditions (precipitation and snow) correlate with delays in each cluster.
Further Segment Analysis:

If any interesting trends emerge from the above analyses, consider further segmenting the data based on these insights (e.g., by specific routes or airlines) to explore delays in more detail.

In [None]:


# Look for common characteristics
common_characteristics = full_data.groupby('cluster').agg({
    'op_unique_carrier': lambda x: x.mode()[0],  # Most common airline in the cluster
    'op_carrier_fl_num': lambda x: x.mode()[0],    # Most common flight number in the cluster
    'dep_time': lambda x: x.mode()[0],  # Most common departure time
    'arr_time': lambda x: x.mode()[0],  # Most common arrival time
    'crs_dep_time': lambda x: x.mode()[0],  # Most common scheduled departure time
    'crs_arr_time': lambda x: x.mode()[0],  # Most common scheduled arrival time
    'actual_elapsed_time': lambda x: x.mode()[0]  # Most common length of elapsed flight times
}).reset_index()

print("\nCommon Characteristics by Cluster:")
print(common_characteristics)

# Merge average delays and common characteristics
cluster_analysis = average_delays.merge(common_characteristics, on='cluster')
print("\nCluster Analysis Summary:")
print(cluster_analysis)



In [None]:

# Merge the average delays and common characteristics
cluster_analysis = average_delays.merge(common_characteristics, on='cluster')
print("\nCluster Analysis Summary:")
print(cluster_analysis)


In [None]:

# Visualize the average delays per cluster
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
sns.barplot(data=average_delays, x='cluster', y='arrival_delay', color='blue', alpha=0.6, label='Arrival Delay')
sns.barplot(data=average_delays, x='cluster', y='departure_delay', color='orange', alpha=0.6, label='Departure Delay')
plt.title('Average Arrival and Departure Delays by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Average Delay (minutes)')
plt.legend()
plt.show()

In [None]:
# can we use the available data to get a look at air traffic in each airport and how that may impact delays? 

In [None]:
# Identify features to use for modeling, considering their correlation with delays.

In [None]:
# arrival_delay
# departure_delay


In [None]:
#Model preparation

#Train-test split
# Assuming full_data is your DataFrame with features and target
X = full_data.drop(columns=['arrival_delay'])  # Features
y = full_data['arrival_delay']  # Target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Assuming full_data is your DataFrame with features and target
X = full_data.drop(columns=['arrival_delay'])  # Features
y = full_data['arrival_delay']  # Target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [None]:
# Normalize/standardize features as needed
# "Standardization (Z-score Normalization): Centers the feature 
# around 0 with a standard deviation of 1. This is often preferred
# for algorithms like SVM, k-means, and PCA."

# Standardization
scaler = StandardScaler()

# Fit the scaler only on the training data and transform both train and test
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Save scaled data into DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)

# Save to CSV if needed
X_train_scaled_df.to_csv('X_train_scaled.csv', index=False)
X_test_scaled_df.to_csv('X_test_scaled.csv', index=False)

In [None]:
# Mean, Sum, Count: For time-based features, calculate statistics over certain time windows.
df['daily_mean_delay'] = df.groupby('date')['arrival_delay'].transform('mean')


In [None]:
### Investigate outliers

# Set up the matplotlib figure
plt.figure(figsize=(12, 6))

# Boxplot for Arrival Delay
plt.subplot(1, 2, 1)
sns.boxplot(x=flights_data['arrival_delay'])
plt.title('Boxplot of Arrival Delay')
plt.xlabel('Arrival Delay (minutes)')

# Boxplot for Departure Delay
plt.subplot(1, 2, 2)
sns.boxplot(x=flights_data['departure_delay'])
plt.title('Boxplot of Departure Delay')
plt.xlabel('Departure Delay (minutes)')

# Boxplot for total time delayed
plt.subplot(1, 2, 2)
sns.boxplot(x=flights_data['total_delay_time'])
plt.title('Boxplot of Total Time Delay')
plt.xlabel('Departure Delay (minutes)')

plt.tight_layout()
plt.show()


In [None]:
# calculate the IQR (Interquartile Range) to find outliers

def detect_outliers(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

arrival_bounds = detect_outliers(flights_data['arrival_delay'])
departure_bounds = detect_outliers(flights_data['departure_delay'])

# Filter outliers
arrival_outliers = flights_data[(flights_data['arrival_delay'] < arrival_bounds[0]) | 
                                 (flights_data['arrival_delay'] > arrival_bounds[1])]

departure_outliers = flights_data[(flights_data['departure_delay'] < departure_bounds[0]) | 
                                   (flights_data['departure_delay'] > departure_bounds[1])]

print("Number of arrival delay outliers:", len(arrival_outliers))
print("Number of departure delay outliers:", len(departure_outliers))


In [None]:
#review weather data
weather_data.columns  # Check available columns


In [None]:

# Visualize potential outliers
plt.figure(figsize=(12, 6))
sns.boxplot(x=weather_data['temperature'])  # Replace with relevant column
plt.title('Boxplot of Temperature')
plt.xlabel('Temperature')

plt.show()

Investigate Delay Causes: Further analysis can identify reasons for significant delays, focusing on factors like weather conditions, airport congestion, and airline performance.
Comparative Analysis: Compare delays across different airlines, airports, or time periods to identify patterns and areas for improvement.