In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [0]:
# Load the flights CSV; its first column is used as the row index.
df = pd.read_csv(
    "https://raw.githubusercontent.com/JackyP/testing/master/datasets/nycflights.csv",
    index_col=0,
)
# Build one datetime column out of the separate year / month / day columns.
df["data"] = pd.to_datetime(df[["year", "month", "day"]])

# Flag column: 1 when the flight arrived late (arr_delay > 0), 0 otherwise.
# NOTE(review): a missing arr_delay compares as False, so such rows are flagged 0
# (counted as "not delayed") — confirm that is the intended treatment.
df["isdelay"] = df["arr_delay"].gt(0).astype(int)


Adiciona a coluna de estação do ano


In [0]:
def get_season(date):
    """Return the season name for the month of ``date``.

    Parameters
    ----------
    date : object with a ``month`` attribute
        For example a ``datetime.date`` or a ``pandas.Timestamp``.

    Returns
    -------
    str
        'Winter' for months 12, 1, 2; 'Spring' for 3-5; 'Summer' for 6-8;
        'Autumn' for any other month value.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    # Anything not listed above falls through to 'Autumn'.
    return season_by_month.get(date.month, 'Autumn')

# Label every flight date with its season name.
df['season'] = df['data'].map(get_season)

Adiciona a coluna de rota

In [0]:
# Route label in the form "<origin>-<dest>".
df['route'] = df['origin'].str.cat(df['dest'], sep='-')

In [0]:
# Preview the first five rows, including the derived columns added so far.
df.head(n=5)

In [0]:
# One-row overview: number of flights, and how many carry each isdelay value.
summary = dict(
    total=len(df),
    isdelay_1=df['isdelay'].eq(1).sum(),
    isdelay_0=df['isdelay'].eq(0).sum(),
)
display(pd.DataFrame([summary]))

In [0]:
# Per-route table: delayed flights, non-delayed flights and total flights,
# ordered by the number of delayed flights (largest first).
# The 0/1 comparisons are turned into boolean columns up front so the groupby
# can use the built-in 'sum' / 'count' reducers.
grouped = (
    df[['route', 'isdelay']]
    .assign(
        delay=lambda flights: flights['isdelay'].eq(1),
        no_delay=lambda flights: flights['isdelay'].eq(0),
    )
    .groupby('route')
    .agg(
        delay=('delay', 'sum'),
        no_delay=('no_delay', 'sum'),
        total=('isdelay', 'count'),
    )
    .reset_index()
    .sort_values(by='delay', ascending=False)
)

display(grouped)

In [0]:
# Per-carrier table: delayed flights, non-delayed flights and total flights,
# ordered by the number of delayed flights (largest first).
# NOTE: this reuses the name `grouped`, replacing whatever it held before.
grouped = (
    df[['carrier', 'isdelay']]
    .assign(
        delay=lambda flights: flights['isdelay'].eq(1),
        no_delay=lambda flights: flights['isdelay'].eq(0),
    )
    .groupby('carrier')
    .agg(
        delay=('delay', 'sum'),
        no_delay=('no_delay', 'sum'),
        total=('isdelay', 'count'),
    )
    .reset_index()
    .sort_values(by='delay', ascending=False)
)

display(grouped)

Voos e atrasos por estações


In [0]:
# Flights per season split by the delay flag: column 0 = not delayed, column 1 = delayed.
df_season_delay = df.groupby(['season', 'isdelay']).size().unstack(fill_value=0)

# Total flights per season, taken from the two count columns only.
season_totals = df_season_delay[[0, 1]].sum(axis=1)
df_season_delay['% atraso'] = (df_season_delay[1] / season_totals * 100).round(2)

# Plot ONLY the two count columns. DataFrame.plot draws every numeric column, so
# plotting the whole frame would stack '% atraso' on the bars as a third,
# unlabelled series.
df_season_delay[[0, 1]].plot(
    kind='bar',
    stacked=True,
    figsize=(10,6),
    color=sns.color_palette("Set2", 2)
)

# Write the delay percentage just above the top of each stacked bar.
for idx, (bar_top, row) in enumerate(zip(season_totals, df_season_delay['% atraso'])):
    plt.text(
        idx,
        bar_top + 2,
        f"{row}%",
        ha='center',
        fontsize=10
    )
plt.xlabel('Season')
plt.ylabel('Quantidade de voos')
plt.title('Quantidade de voos por estação (Atrasados e Não Atrasados)')
plt.legend(['Não atrasado', 'Atrasado'])
plt.tight_layout()
plt.show()

Gráfico de aeronaves que fazem mais voos


In [0]:
# Flights per aircraft (tailnum) split by the delay flag.
df_delay_counts = df.groupby(['tailnum', 'isdelay']).size().unstack(fill_value=0)

# Keep the 20 aircraft with the most flights; 'total' is only a temporary sort key.
df_delay_counts = (
    df_delay_counts
    .assign(total=df_delay_counts.sum(axis=1))
    .sort_values(by='total', ascending=False)
    .head(20)
    .drop(columns='total')
)

# NOTE: sns.set changes the global plotting style, so it also affects later figures.
sns.set(style="whitegrid")
df_delay_counts.plot(
    kind='bar',
    stacked=True,
    figsize=(12,6),
    color=sns.color_palette("Set2", 2),
).set(
    xlabel='tailnum',
    ylabel='Quantidade de voos',
    title='Quantidade de voos por tailnum (Atrasados e Não Atrasados)',
)
plt.legend(['Não atrasado', 'Atrasado'])
plt.tight_layout()
plt.show()

Voos atrasados por destinos

In [0]:
# Number of delayed flights per destination, largest first.
df_dest = (
    df.loc[df['isdelay'].eq(1)]
    .groupby('dest')
    .size()
    .sort_values(ascending=False)
)

# Keep the 30 destinations with the most delayed flights and fold every
# remaining destination into a single 'Other' entry.
top30 = df_dest.iloc[:30]
other = df_dest.iloc[30:].sum()
df_dest_top = top30.copy()
df_dest_top.loc['Other'] = other

ax = df_dest_top.plot.bar(
    figsize=(12,6),
    color=sns.color_palette("Set2", 1),
)
ax.set_xlabel('Destino')
ax.set_ylabel('Quantidade de voos atrasados')
ax.set_title('Quantidade de voos atrasados por destino (Top 30 + Outros)')
ax.legend(['Atrasado'])
plt.tight_layout()
plt.show()

In [0]:
# Show the dtype of every column currently in df.
display(df.dtypes)

Voos e atrasos por meses

In [0]:
# Preview the first five rows of df again.
df.head(n=5)

In [0]:
# The ten destinations with the most delayed flights, largest first.
df_dest_delay = (
    df.loc[df['isdelay'].eq(1)]
    .groupby('dest')
    .size()
    .sort_values(ascending=False)
    .iloc[:10]
)

display(df_dest_delay)

In [0]:
# Restrict the analysis to five carriers.
filtered_carriers = ['EV', 'B6', 'UA', 'DL', 'MQ']
df_filtered = df.loc[df['carrier'].isin(filtered_carriers)]

# The 15 destinations with the most delayed flights *within those carriers*.
top15_dest = (
    df_filtered.loc[df_filtered['isdelay'].eq(1)]
    .groupby('dest')
    .size()
    .sort_values(ascending=False)
    .index[:15]
    .tolist()
)
df_top15 = df_filtered.loc[df_filtered['dest'].isin(top15_dest)]

# Delayed-flight counts with destinations as rows and seasons as columns.
heatmap_data = (
    df_top15
    .groupby(['dest', 'season'])['isdelay']
    .sum()
    .unstack(fill_value=0)
)

plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlOrRd').set(
    title='Heatmap de atrasos por destino e estação do ano (Top 15 destinos) - Carriers EV, B6, UA, DL, MQ',
    xlabel='Estação do Ano',
    ylabel='Destino',
)
plt.tight_layout()
plt.show()

In [0]:
filtered_carriers = ['EV', 'B6', 'UA', 'DL', 'MQ']

# The 20 destinations with the most delayed flights across ALL carriers;
# the carrier filter is applied only afterwards, when selecting the rows to plot.
top20_dest = (
    df.loc[df['isdelay'].eq(1)]
    .groupby('dest')
    .size()
    .sort_values(ascending=False)
    .index[:20]
    .tolist()
)
in_top_dest = df['dest'].isin(top20_dest)
in_carriers = df['carrier'].isin(filtered_carriers)
df_top20 = df[in_top_dest & in_carriers]

# Delayed-flight counts with destinations as rows and carriers as columns.
heatmap_carrier_data = (
    df_top20
    .groupby(['dest', 'carrier'])['isdelay']
    .sum()
    .unstack(fill_value=0)
)

plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_carrier_data, annot=True, fmt='d', cmap='YlOrRd').set(
    title='Heatmap de atrasos por destino e empresa (Top 20 destinos, carriers EV, B6, UA, DL, MQ)',
    xlabel='Empresa (Carrier)',
    ylabel='Destino',
)
plt.tight_layout()
plt.show()