# 1. Read in the Data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

In [3]:
# Load the merged airlines dataset. The first CSV column becomes the row index,
# and creation_time is parsed into real datetimes at load time so that
# time-based operations (per-day grouping, .dt accessors) work on timestamps
# rather than on plain strings.
data = pd.read_csv(
    "data/merged_airlines.csv",
    index_col=0,
    parse_dates=["creation_time"],
)

# 2. Understanding the Data

In [4]:
# Peek at the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,creation_time,id,airline_code,flight_number,flight_date,departure_airport,user_name,action_name,START_WI weight,START_WI index,...,ESTIMATED_TRAFFIC_LOAD,ESTIMATED_ZFW,DELTA_ZFW,ZFW_TOLERANCE_EXCEEDED,Total bag weight,airport_name,city,region,country_name,continent
0,2024-04-30 04:01:00,33766922,AB,2373,30,IXB,service-acco,CheckinMsgProcessor,,,...,,,,,,Bagdogra Airport,Siliguri,IN-WB,India,AS
1,2024-04-30 04:01:00,33766923,AB,2373,30,IXB,service-acco,StorePaxDataAction,,,...,,,,,,Bagdogra Airport,Siliguri,IN-WB,India,AS
2,2024-04-30 04:01:00,33766924,AB,2373,30,IXB,service-acco,CalculateWeightAndTrimAction,,,...,,,,,,Bagdogra Airport,Siliguri,IN-WB,India,AS
3,2024-04-30 04:01:00,33766925,AB,2373,30,IXB,service-acco,CalculateWeightAndTrimAction,44296.0,38.53,...,15915.44,60710.44,,,,Bagdogra Airport,Siliguri,IN-WB,India,AS
4,2024-04-30 04:01:00,33766926,AB,2373,30,IXB,service-acco,StorePaxDataAction,,,...,,,,,1571.0,Bagdogra Airport,Siliguri,IN-WB,India,AS


In [5]:
# Peek at the last five rows of the loaded frame
data.tail(n=5)

Unnamed: 0,creation_time,id,airline_code,flight_number,flight_date,departure_airport,user_name,action_name,START_WI weight,START_WI index,...,ESTIMATED_TRAFFIC_LOAD,ESTIMATED_ZFW,DELTA_ZFW,ZFW_TOLERANCE_EXCEEDED,Total bag weight,airport_name,city,region,country_name,continent
429750,2024-05-07 09:20:24,375567187,ZY,3616,7,VCP,service-acco,CalculateWeightAndTrimAction,29275.0,64.59,...,,30216.0,0.0,,,Viracopos International Airport,Campinas,BR-SP,Brazil,SA
429751,2024-05-07 09:20:24,375567248,ZY,3616,7,VCP,service-acco,CreateBaggageLoadItemsAction,,,...,,,,,,Viracopos International Airport,Campinas,BR-SP,Brazil,SA
429752,2024-05-07 09:20:24,375567249,ZY,3616,7,VCP,service-acco,CalculateWeightAndTrimAction,29275.0,64.59,...,,30180.0,0.0,,,Viracopos International Airport,Campinas,BR-SP,Brazil,SA
429753,2024-05-07 09:20:24,375567250,ZY,3616,7,VCP,service-acco,StorePaxDataAction,,,...,,,,,0.0,Viracopos International Airport,Campinas,BR-SP,Brazil,SA
429754,2024-05-07 09:20:24,375567251,ZY,3616,7,VCP,service-acco,PAXBOOKINGINMsgProcessor,,,...,,,,,,Viracopos International Airport,Campinas,BR-SP,Brazil,SA


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
# Inspect the dtype pandas assigned to each column
data.dtypes

In [None]:
# Distinct airline codes present in the dataset
pd.unique(data["airline_code"])

In [None]:
# Distinct flight numbers present in the dataset
pd.unique(data["flight_number"])

In [None]:
# Distinct departure airport codes present in the dataset
pd.unique(data["departure_airport"])

In [None]:
# Distinct action names present in the dataset
pd.unique(data["action_name"])

In [None]:
# Distinct country names present in the dataset
pd.unique(data["country_name"])

In [None]:
# Distinct cities present in the dataset
pd.unique(data["city"])

In [None]:
# Distinct airport names present in the dataset
pd.unique(data["airport_name"])

## Data Visualizations

In [None]:
# Horizontal bar chart of the number of logged actions per departure airport,
# with airports ordered by their frequency in the data.
airport_order = data['departure_airport'].value_counts().index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, y='departure_airport', order=airport_order, ax=ax)
ax.set_title('Distribution of Actions by Departure Airport')
ax.set_ylabel('Departure Airport')
ax.set_xlabel('Number of Actions')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

In [None]:
# Identify the five most frequent actions in the dataset
top_actions = data['action_name'].value_counts().nlargest(5).index

# Keep only the rows that belong to those top 5 actions
filtered_data = data[data['action_name'].isin(top_actions)]

# Derive the calendar day of every action from the creation_time column.
# The frame is indexed by an integer row id (read_csv(..., index_col=0)), so
# the index has no `.date` attribute and `filtered_data.index.date` raises an
# AttributeError. pd.to_datetime also covers the case where creation_time was
# loaded as plain strings.
action_days = pd.to_datetime(filtered_data['creation_time']).dt.date

# Count actions per day and action name: one row per day, one column per action
action_counts = (
    filtered_data
    .groupby([action_days, 'action_name'])
    .size()
    .unstack(fill_value=0)
)

# Plot number of actions per day for the top 5 actions.
# Draw on an explicit Axes: calling plt.figure() and then DataFrame.plot()
# with its own figsize leaves an extra, empty figure in the cell output.
fig, ax = plt.subplots(figsize=(12, 6))
action_counts.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Number of Top 5 Actions Per Day')
ax.set_xlabel('Day')
ax.set_ylabel('Count')
ax.legend(title='Action Name')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Tally how many logged actions each user_name accounts for
user_counts = data['user_name'].value_counts()

# Show each user's share of the actions as a pie chart
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    user_counts,
    labels=user_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.Paired.colors,
)
ax.set_title('Distribution of Actions by User Name')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Find the five action names that occur most often
action_frequency = data['action_name'].value_counts()
top_actions = action_frequency.nlargest(5).index

# Restrict the data to rows whose action is one of those five
is_top_action = data['action_name'].isin(top_actions)
filtered_data = data[is_top_action]

# One bar per (user, action) pair showing how often that user triggered it
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=filtered_data, y='user_name', hue='action_name', ax=ax)
ax.set_title('Actions Per User (Top 5 Actions)')
ax.set_xlabel('Count')
ax.set_ylabel('User Name')
ax.legend(title='Action Name')
plt.show()

In [None]:
# Restrict the analysis to the two departure airports of interest
filtered_airports = ['DUS', 'JFK']
is_selected_airport = data['departure_airport'].isin(filtered_airports)
df_filtered = data[is_selected_airport]

# Number of rows for every (departure_airport, action_name) combination
action_counts = (
    df_filtered
    .groupby(['departure_airport', 'action_name'])
    .size()
    .rename('count')
    .reset_index()
)

# Within each airport, order actions by descending count and keep the first 5
ranked_counts = action_counts.sort_values(
    by=['departure_airport', 'count'],
    ascending=[True, False],
)
top_actions = ranked_counts.groupby('departure_airport').head(5)

# Grouped horizontal bars: one bar per action and airport
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=top_actions,
    x='count',
    y='action_name',
    hue='departure_airport',
    dodge=True,
    ax=ax,
)
ax.set_title('Top 5 Actions per Departure Airport')
ax.set_xlabel('Number of Actions')
ax.set_ylabel('Action Name')
ax.legend(title='Departure Airport', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [6]:
# Build a ydata-profiling report object over the whole `data` frame.
# NOTE(review): this import sits mid-notebook; consider moving it into the
# import cell at the top so all dependencies are visible in one place.
from ydata_profiling import ProfileReport
profile = ProfileReport(data, title="Profiling Report")

In [7]:
# Display the report as the cell output. The "Summarize dataset" progress bar
# in the output suggests the report is computed at this point — TODO confirm.
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]