In [None]:
# Self-paced learning: load the flight-delay CSV and take a first look at it.
import pandas as pd


# Path is relative to the notebook's working directory.
df = pd.read_csv('../data/Flight_Delays_2024.csv')

print("first 5 rows of this dataset: ")
print(df.head())

print("\n This is the shape of this dataset: "+ "="*20)
print(df.shape)
print("\n This is the distribution of this dataset: "+ "="*20)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
df.info()
print("\n Some descriptive statistics that could be used: " + "+"*20)
print(df.describe())


# Per-column count of missing entries.
print("\n checking for null or missing values: ")
print(df.isnull().sum())

# Frequency of each value in the two categorical columns inspected here.
print("\n checking for categorical values: ")
print(df['Airline'].value_counts())
print(df['DelayCategory'].value_counts())

In [None]:
# Handling missing values
# Step 1: count the missing values in every column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Two ways to deal with missing values:
#   1. remove the affected rows, or
#   2. impute them (fill with the mean, median or mode of the column).
# The original author notes this dataset has none; the drop below shows what
# option 1 would look like if it did.

# Option 1: drop rows with any missing value, then check the resulting shape.
df_cleaned = df.dropna()

print("\nnew shape of dataframe after dropping missing values: ")
print(df_cleaned.shape)

# Skewness and kurtosis of the delay column (computed on the original frame).
delay_minutes = df['DelayMinutes']
print(delay_minutes.skew())
print(delay_minutes.kurtosis())


In [None]:
# Visualisation and plotting: three univariate views of the delay data.
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Histogram of delay minutes with a KDE curve overlaid.
sns.set_style('ticks')
hist_ax = sns.histplot(x='DelayMinutes', data=df, kde=True)
hist_ax.set_title('Distribution of Flight Delay Minutes')
plt.show()

# 2) Box plot of delay minutes.
sns.set_style('ticks')
box_ax = sns.boxplot(x='DelayMinutes', data=df)
box_ax.set_title('Boxplot of Flight Delay Minutes')
plt.show()

# 3) Number of rows per delay category, with tick labels rotated.
sns.set_style('darkgrid')
count_ax = sns.countplot(x='DelayCategory', data=df)
plt.xticks(rotation=45)
count_ax.set_title('Count of Delay Categories')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot comparing the delay distribution of each airline.
# The figure is made wide so the airline names fit along the x axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Airline', y='DelayMinutes', data=df, ax=ax)
ax.set_title('Flight Delay Distribution by Airline')
plt.xticks(rotation=45)  # rotate the names in case they overlap
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar plot of delay minutes per day of the week, with the bars forced into
# calendar order instead of the order the days appear in the data.
weekday_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='DayOfWeek', y='DelayMinutes', data=df, order=weekday_order, ax=ax)
ax.set_title('Average Flight Delay by Day of the Week')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot: flight distance on x, delay minutes on y, to look for a
# relationship between the two.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='FlightDistanceKM', y='DelayMinutes', data=df, ax=ax)
ax.set_title('Flight Delay vs. Flight Distance')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal box plots: one box per value of 'Origin', delay minutes on x.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='DelayMinutes', y='Origin', data=df, ax=ax)
ax.set_title('Boxplot of Flight Delay Minutes by Origin Airport')
plt.xticks(rotation=45)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: build a crosstab, which counts how often each
# (Airline, DelayCategory) combination occurs.
airline_category_crosstab = pd.crosstab(index=df['Airline'], columns=df['DelayCategory'])
print("the occurrence of each combination is: ")
print(airline_category_crosstab)

# Step 2: draw the counts as an annotated heatmap (fmt='d' prints integers).
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(airline_category_crosstab, annot=True, fmt='d', cmap='YlGnBu', ax=ax)
ax.set_title('Heatmap of Airline vs Delay Category')
ax.set_xlabel('Delay Category')
ax.set_ylabel('Airline')
plt.show()


In [None]:
# Feature engineering: add a 0/1 weekend indicator derived from 'DayOfWeek'.
# (No plot is produced in this cell; the previous header comment described a
# bar plot with hue='Airline' that the code never drew.)
import pandas as pd

# Vectorised membership test instead of a per-row Python lambda: rows whose
# day is Saturday or Sunday get 1, every other row (including missing) gets 0.
# NOTE(review): the following cell builds the same indicator under the name
# 'IsWeekend'; the lower-case 'isWeekend' name is kept here in case other
# cells read it — consider keeping only one of the two columns.
df['isWeekend'] = df['DayOfWeek'].isin(['Saturday', 'Sunday']).astype(int)
print(df[['DayOfWeek', 'isWeekend']].head())
print(df['isWeekend'].value_counts())

In [None]:
import pandas as pd

# Weekend indicator: 1 when 'DayOfWeek' is Saturday or Sunday, otherwise 0.
weekend_days = ['Saturday', 'Sunday']
is_weekend_mask = df['DayOfWeek'].isin(weekend_days)
df['IsWeekend'] = is_weekend_mask.astype(int)

# Quick check: a few rows side by side, then how many rows fall in each class.
print(df[['DayOfWeek', 'IsWeekend']].head())
print(df['IsWeekend'].value_counts())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Delay distribution per airline, with each airline split into side-by-side
# boxes by the 'IsWeekend' flag (dodge=True).
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    x='Airline',
    y='DelayMinutes',
    hue='IsWeekend',
    data=df,
    dodge=True,
    ax=ax,
)
ax.set_title('Boxplot of Delay Minutes by Airline and Weekend Status')
plt.xticks(rotation=45)
plt.show()