In [2]:
#import relevant packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the airline delay records from the CSV file into a DataFrame
csv_path = 'airlines_delay.csv'
df = pd.read_csv(csv_path)

In [4]:
# Show the loaded DataFrame as this cell's output
df

In [5]:
# No. of flights per class (Delayed = 1, No Delay = 0)
delay = df['Class'].value_counts()
# A notebook cell only echoes its LAST expression, so the per-class counts
# are printed explicitly instead of being left as a bare mid-cell expression.
print(delay)

# Delayed flights are coded as 1, so summing the column counts them
delay_count = df['Class'].sum()
delay_count

In [6]:
# Finding the percentage of delayed flights (on a 0-100 scale)
total_flights = len(df)
# Printed explicitly: a bare mid-cell expression would not be displayed
print(total_flights)

# delay_count / total_flights is a fraction in [0, 1]; scale it by 100 so the
# value matches its name and the other "Delay Percentage" figures in this notebook.
percentage_delayed_flights = delay_count / total_flights * 100
percentage_delayed_flights

In [8]:
# Compare the number of delayed flights with the number of non-delayed flights
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=delay.index, y=delay.values, palette='viridis', ax=ax)
ax.set_title('No. of Delayed vs Non-Delayed Flights')
ax.set_xlabel('Delayed or Not')
ax.set_ylabel('Count')
# Replace the numeric class ticks with readable names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Delayed', 'Delayed'])
plt.show()

In [11]:
# 'Time' holds minutes past midnight; floor-dividing by 60 keeps only the whole hour
departure_hour = df['Time'].floordiv(60)
df['Hour of Departure'] = departure_hour.astype(int)
df

In [14]:
# Flights per departure hour, split into delayed and non-delayed bars
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Hour of Departure', hue='Class', palette='viridis', ax=ax)
ax.set_title('Count of Delays by Hour of Departure')
ax.set_xlabel('Hour of Departure')
ax.set_ylabel('Count of Departures')
# Tilt the hour labels
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.legend(title='Delay', labels=['Not Delayed (0)', 'Delayed (1)'])
plt.show()


In [17]:
# Plot each flight's length against its delay class
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='Length', y='Class', alpha=0.6, ax=ax)
ax.set_title('Relationship Between Length of Flight and Delays')
ax.set_xlabel('Length of Flight (minutes)')
ax.set_ylabel('Delayed (1) or Not Delayed (0)')
# Name the two class values on the y axis
ax.set_yticks([0, 1])
ax.set_yticklabels(['Not Delayed', 'Delayed'])
# Dashed reference line halfway between the two classes
ax.axhline(y=0.5, color='r', linestyle='--', label='Threshold')
ax.legend()
ax.grid(True)
plt.show()

In [20]:
# Create bins for Length of Flight (edges in minutes).
# The final open-ended edge keeps flights longer than 360 minutes in a category;
# with a last edge of 360 pd.cut would label them NaN and they would silently
# vanish from the plot below.
bins = [0, 60, 120, 180, 240, 300, 360, float('inf')]  # Adjust the bins as needed
labels = ['0-60', '60-120', '120-180', '180-240', '240-300', '300-360', '360+']

# Create the Flight Length Category column.
# pd.cut builds right-closed intervals by default, i.e. (0, 60], (60, 120], ...;
# include_lowest=True closes the first interval on the left so that a Length of
# exactly 0 falls into '0-60' instead of becoming NaN.
df['Flight Length Category'] = pd.cut(
    df['Length'], bins=bins, labels=labels, include_lowest=True
)

# Boxplot for length of flight categories and delays
plt.figure(figsize=(12, 6))
sns.boxplot(x='Flight Length Category', y='Class', data=df, palette='viridis')
plt.title('Delay Distribution by Flight Length Category')
plt.xlabel('Length of Flight Category (minutes)')
plt.ylabel('Delayed (1) or Not Delayed (0)')
plt.yticks([0, 1], ['Not Delayed', 'Delayed'])
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()



In [22]:
# Tally how many flights each airline operated
airline_column = df['Airline']
flight_counts = airline_column.value_counts()
print(flight_counts)  # One row per airline with its flight count


In [23]:
# Bar chart of the flight count for each airline
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=flight_counts.index, y=flight_counts.values, palette='viridis', ax=ax)
ax.set_title('Number of Flights by Airline')
ax.set_xlabel('Airline')
ax.set_ylabel('Number of Flights')
# Tilt the airline labels for better visibility
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()


In [24]:
# Per-airline count of each Class value: one column per class, with
# airline/class pairs that never occur filled in as 0
delay_counts = df.groupby('Airline')['Class'].value_counts().unstack(fill_value=0)

# Share of delayed flights (Class == 1) among each airline's flights, in percent,
# ordered from the highest share to the lowest
airline_totals = delay_counts[0] + delay_counts[1]
delay_percentage = (delay_counts[1] / airline_totals * 100).sort_values(ascending=False)

print(delay_percentage)  # Delay percentage for every airline


In [25]:
# Bar chart of the delay percentage for each airline
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=delay_percentage.index, y=delay_percentage.values, palette='viridis', ax=ax)
ax.set_title('Percentage of Delays by Airline')
ax.set_xlabel('Airline')
ax.set_ylabel('Percentage of Delays (%)')
# Tilt the airline labels for better visibility
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.grid(axis='y', linestyle='--', alpha=0.6)
ax.set_ylim(0, 100)  # Fix the y axis to the full 0-100% range
plt.show()

In [26]:
# Tally how many flights departed from each airport
departure_airports = df['AirportFrom']
airport_counts = departure_airports.value_counts()
print(airport_counts)  # One row per departure airport with its flight count


In [27]:
# Create a bar plot for the number of flights from each airport.
# One bar per airport is only legible for a limited number of airports, so when
# the data has more than TOP_N_AIRPORTS distinct airports only the busiest ones
# are drawn; with fewer airports the chart is the same as plotting all of them.
TOP_N_AIRPORTS = 20

# airport_counts comes from value_counts(), which orders by descending count,
# so head() keeps the airports with the most flights.
top_airport_counts = airport_counts.head(TOP_N_AIRPORTS)

plot_title = 'Number of Flights by Departure Airport'
if len(airport_counts) > TOP_N_AIRPORTS:
    # Make it explicit in the figure that the chart is truncated
    plot_title += f' (Top {TOP_N_AIRPORTS})'

plt.figure(figsize=(12, 6))
sns.barplot(x=top_airport_counts.index, y=top_airport_counts.values, palette='viridis')
plt.title(plot_title)
plt.xlabel('Airport')
plt.ylabel('Number of Flights')
plt.xticks(rotation=45)  # Rotate x labels for better visibility
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()


In [28]:
# Per-airport count of each Class value: one column per class, with
# airport/class pairs that never occur filled in as 0
airport_class_counts = df.groupby('AirportFrom')['Class'].value_counts().unstack(fill_value=0)

# Flights per airport = non-delayed (0) + delayed (1)
airport_totals = airport_class_counts[0] + airport_class_counts[1]

# Attach the totals and the delayed share (in percent), then order the airports
# from the highest delay percentage to the lowest
airport_delay_counts = airport_class_counts.assign(**{
    'Total Flights': airport_totals,
    'Delay Percentage': airport_class_counts[1] / airport_totals * 100,
}).sort_values('Delay Percentage', ascending=False)

# Show total flights, delayed flights and delay percentage per airport
print(airport_delay_counts[['Total Flights', 1, 'Delay Percentage']])


In [29]:
# Create a bar plot for the delay percentage per airport.
# One bar per airport is only legible for a limited number of airports, so when
# the data has more than TOP_N_AIRPORTS distinct airports the chart is restricted
# to the airports with the most flights; with fewer airports every airport is drawn.
TOP_N_AIRPORTS = 20

airports_to_plot = airport_delay_counts
plot_title = 'Percentage of Delays by Departure Airport'
if len(airport_delay_counts) > TOP_N_AIRPORTS:
    # Select by traffic volume rather than by delay percentage: the highest
    # percentages can belong to airports with very few flights.
    airports_to_plot = (
        airport_delay_counts
        .nlargest(TOP_N_AIRPORTS, 'Total Flights')
        .sort_values('Delay Percentage', ascending=False)
    )
    # Make it explicit in the figure that the chart is truncated
    plot_title += f' ({TOP_N_AIRPORTS} Busiest Airports)'

plt.figure(figsize=(12, 6))
sns.barplot(x=airports_to_plot.index, y=airports_to_plot['Delay Percentage'], palette='viridis')
plt.title(plot_title)
plt.xlabel('Airport')
plt.ylabel('Percentage of Delays (%)')
plt.xticks(rotation=45)  # Rotate x labels for better visibility
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.ylim(0, 100)  # Set y-axis limits from 0 to 100%
plt.show()


In [30]:
# Tally flights per day of the week, then order the result by the day value
# instead of by count
flights_per_day = df['DayOfWeek'].value_counts()
day_counts = flights_per_day.sort_index()

# Show the counts for verification
print(day_counts)

In [31]:
# Bar chart of the flight count for each day of the week
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=day_counts.index, y=day_counts.values, palette='viridis', ax=ax)
ax.set_title('Number of Flights by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Number of Flights')
# Tilt the day labels for better visibility
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()


In [32]:
# Per-day count of each Class value: one column per class, with
# day/class pairs that never occur filled in as 0
day_class_counts = df.groupby('DayOfWeek')['Class'].value_counts().unstack(fill_value=0)

# Flights per day = non-delayed (0) + delayed (1)
day_totals = day_class_counts[0] + day_class_counts[1]

# Attach the totals and the delayed share (in percent), keeping the rows in
# day-of-week order
day_delay_counts = day_class_counts.assign(**{
    'Total Flights': day_totals,
    'Delay Percentage': day_class_counts[1] / day_totals * 100,
}).sort_index()

# Show total flights, delayed flights and delay percentage per day for verification
print(day_delay_counts[['Total Flights', 1, 'Delay Percentage']])

In [33]:
# Bar chart of the delay percentage for each day of the week
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=day_delay_counts.index, y=day_delay_counts['Delay Percentage'], palette='viridis', ax=ax)
ax.set_title('Percentage of Delays by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Percentage of Delays (%)')
# Tilt the day labels for better visibility
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.set_ylim(0, 100)  # Fix the y axis to the full 0-100% range
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()