# In [None]:
### Part I

import psycopg2
from psycopg2 import connect

from sqlalchemy import create_engine
from sqlalchemy.engine import URL

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Connection settings, defined once and shared by the raw psycopg2 connection
# and the SQLAlchemy engine so the two cannot drift apart.
DB_USER = 'username'
DB_PASSWORD = 'password'
DB_HOST = 'localhost'
DB_NAME = 'airlines'
DB_PORT = '5432'

# Raw DB-API connection and cursor.
cnx = psycopg2.connect(
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    dbname=DB_NAME,
    port=DB_PORT,
)

cursor = cnx.cursor()

# NOTE: this pg8000 URL is kept as-is, but `engine` below does not use it;
# the engine connects through the psycopg2 driver.
url = URL.create(
    "postgresql+pg8000",
    username=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    database=DB_NAME,
)

# Build the engine URL with URL.create rather than string formatting so that
# special characters in the credentials (e.g. '@' or '/') are escaped correctly.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        database=DB_NAME,
    )
)

### downloading data from the database

def read_sql_table(table_name, engine):
    """Read a whole database table into a pandas DataFrame.

    Parameters
    ----------
    table_name : name of the table to read.
    engine : connectable passed on to ``pd.read_sql_table``.

    Returns
    -------
    The DataFrame produced by ``pd.read_sql_table``.
    """
    return pd.read_sql_table(table_name, con=engine)

### reading the working data frame

flight_df_clear = read_sql_table('flight', engine)

flight_df_raw = flight_df_clear

flight_df_raw.info()

### deleting flights from 2020 and canceled ones

# Keep only flights from 2019 that were not cancelled.
is_2019 = flight_df_raw['year'] == 2019
not_cancelled = flight_df_raw['cancelled'] == 0
flight_df_raw = flight_df_raw[is_2019 & not_cancelled]

### rename the column 'dep_delay_new' to 'dep_delay'

flight_df_raw = flight_df_raw.rename(columns={'dep_delay_new': 'dep_delay'})

# Working frame used by all later analysis steps. It is an explicit copy so that
# the columns added later (is_delayed, is_weekend, ...) do not end up in
# 'flight_df_raw'.
flight_df = flight_df_raw.copy()

### descriptive statistics for the 'dep_delay' column and initialize the dataframe 'dep_delay_statistics_df'

# Percentiles reported next to the basic describe() statistics.
dep_delay_percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

# Read every figure from the same frame so the table is internally consistent.
dep_delay = flight_df_raw['dep_delay']
dep_stats = dep_delay.describe()
dep_delay_quantiles = dep_delay.quantile(dep_delay_percentiles)

# Not used in the table below; kept in case later cells rely on it.
additional_percentiles = dep_delay.quantile([0.1, 0.9, 0.95, 0.99])

dep_delay_statistics_df = pd.DataFrame({
    # round() instead of int() so float noise (e.g. 0.29 * 100) cannot shift a label.
    'Statistic': ['count', 'mean', 'median', 'std', 'min', 'max']
                 + [f'{round(p * 100)}%' for p in dep_delay_percentiles],
    # describe() reports the median under the key '50%'.
    'Value': [dep_stats[key] for key in ('count', 'mean', '50%', 'std', 'min', 'max')]
             + dep_delay_quantiles.tolist(),
})

dep_delay_statistics_df['Value'] = dep_delay_statistics_df['Value'].round(2)

dep_delay_statistics_df

### column analysis 'dep_delay'

def _plot_dep_delay_histogram(data, title):
    """Draw and show a histogram of the 'dep_delay' column of *data*.

    Bins are 10 wide and the x axis starts at 0.

    Parameters
    ----------
    data : DataFrame containing a 'dep_delay' column.
    title : title shown above the plot.

    Returns
    -------
    The matplotlib Axes returned by ``sns.histplot``.
    """
    plt.figure(figsize=(12, 6))
    ax = sns.histplot(
        data=data,
        x='dep_delay',
        binwidth=10,
        color='skyblue',
        edgecolor='black',
    )

    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Departure delay [min]', fontsize=14)
    ax.set_ylabel('Frequency', fontsize=14)

    plt.grid(False)
    plt.xlim(left=0)
    plt.show()
    return ax


sns.set(style="whitegrid")

# 1) all flights
hist_plot = _plot_dep_delay_histogram(flight_df, 'Histogram of departure delays')

# 2) only flights with a positive delay
delay_df = flight_df[flight_df['dep_delay'] > 0]
hist_plot = _plot_dep_delay_histogram(delay_df, 'Histogram of departure delays > 0')

# 3) positive delays below the 95th percentile (cuts off the long right tail)
dep_delay_p95 = flight_df['dep_delay'].quantile(0.95)
delay_df = flight_df[(flight_df['dep_delay'] > 0) & (flight_df['dep_delay'] < dep_delay_p95)]
hist_plot = _plot_dep_delay_histogram(
    delay_df,
    'Histogram of departure delays > 0 and up to 95th percentile',
)

### delay analysis

# A flight is flagged as delayed when dep_delay exceeds 15 minutes.
flight_df['is_delayed'] = flight_df['dep_delay'].gt(15)

# Rows of the delayed flights only.
count_delayed = flight_df.loc[flight_df['is_delayed']]

# Delayed flights divided by all flights, rounded to two decimals.
n_delayed = count_delayed['is_delayed'].count()
n_flights = flight_df['is_delayed'].count()
delayed_ratio = (n_delayed / n_flights).round(2)

### delays vs calendar month

# Share of delayed flights per month. 'is_delayed' is boolean, so its mean is the
# fraction of delayed flights; count() would return the number of all flights.
flight_delays_by_month_df = pd.DataFrame(flight_df.groupby('month')['is_delayed'].mean())

# relplot is a figure-level function and creates its own figure, so no
# plt.figure() call is needed (it would only leave an empty extra figure).
line_graph = sns.relplot(
    data=flight_delays_by_month_df.reset_index(),
    x='month',
    y='is_delayed',
    kind='line',
    height=6,
    aspect=2,
    marker='o',
    dashes=False,
    color='skyblue',
)

line_graph.set_axis_labels('', 'Share of delayed flights', fontsize=14)
line_graph.fig.suptitle('Monthly flight delays', fontsize=16, weight='bold')

# Horizontal guide lines only.
plt.grid(False)
plt.grid(axis='y', linestyle='--', linewidth=0.5)

# Leave a margin on both sides of months 1..12 and label the ticks by name.
plt.xlim(left=0, right=13)
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.xticks(ticks=range(1, 13), labels=months)


plt.show()

### delays vs day of the week # 1

# Share of delayed flights per day of the week. 'is_delayed' is boolean, so its
# mean is the fraction of delayed flights; count() would return the number of
# all flights.
flight_delays_by_weekday_df = pd.DataFrame(flight_df.groupby('day_of_week')['is_delayed'].mean())

# relplot is a figure-level function and creates its own figure, so no
# plt.figure() call is needed (it would only leave an empty extra figure).
line_graph = sns.relplot(
    data=flight_delays_by_weekday_df.reset_index(),
    x='day_of_week',
    y='is_delayed',
    kind='line',
    height=6,
    aspect=2,
    marker='o',
    dashes=False,
    color='skyblue',
)

line_graph.set_axis_labels('', 'Share of delayed flights', fontsize=14)
line_graph.fig.suptitle('Flight delays by day of the week', fontsize=16, weight='bold')

# Horizontal guide lines only.
plt.grid(False)
plt.grid(axis='y', linestyle='--', linewidth=0.5)

# Leave a margin on both sides of days 1..7 and label the ticks by name.
plt.xlim(left=0, right=8)
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
plt.xticks(ticks=range(1, 8), labels=days)


plt.show()

### delays vs day of the week vs weekend # 2

# 1 when day_of_week is 6 or 7, otherwise 0.
flight_df['is_weekend'] = flight_df['day_of_week'].isin([6, 7]).astype(int)

# Mean of the delay flag for each is_weekend value, rounded to two decimals.
weekend_delay_share = flight_df.groupby('is_weekend')['is_delayed'].mean()
flight_delays_by_weekend_df = weekend_delay_share.reset_index().round(2)

plt.figure(figsize=(8, 6))

# One bar per is_weekend value.
plt.bar(
    x=flight_delays_by_weekend_df['is_weekend'],
    height=flight_delays_by_weekend_df['is_delayed'],
    color=['blue', 'green'],
)

plt.title('Percentage of flight delays by weekend')
plt.xlabel('')
plt.ylabel('')

plt.xticks(flight_delays_by_weekend_df['is_weekend'], ['Not_week', 'Week'])

plt.grid(False)

plt.show()

### delays vs flight distance

# Percentiles reported next to the basic describe() statistics.
distance_percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

# Read every figure from the same frame so the table is internally consistent.
distance = flight_df['distance']
distance_stats = distance.describe()
distance_quantiles = distance.quantile(distance_percentiles)

flight_distance_analysis_df = pd.DataFrame({
    # round() instead of int() so float noise (e.g. 0.29 * 100) cannot shift a label.
    'Statistic': ['count', 'mean', 'median', 'std', 'min', 'max']
                 + [f'{round(p * 100)}%' for p in distance_percentiles],
    # describe() reports the median under the key '50%'.
    'Value': [distance_stats[key] for key in ('count', 'mean', '50%', 'std', 'min', 'max')]
             + distance_quantiles.tolist(),
}).round(2)

### plot for (up to) 10,000 random rows from the 'flight_df' table

# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling without replacement), so cap the sample size.
sample_size = min(10000, len(flight_df))
df = flight_df.sample(n=sample_size)

# Scatter of departure delay against distance; alpha makes overlapping points visible.
plt.figure(figsize=(10, 6))
plt.scatter(df['distance'], df['dep_delay'], alpha=0.5)

plt.title('Distance by delay')
plt.xlabel('Distance')
plt.ylabel('Delay')

plt.grid(True)

plt.show()

# Keep flights up to the 95th percentile of distance. The explicit copy lets
# the new column below be assigned without a SettingWithCopyWarning.
distance_p95 = flight_df['distance'].quantile(0.95)
flight_df = flight_df[flight_df['distance'] <= distance_p95].copy()

### determining delay percentages

# Bucket distances into bins of width 100, labelled by the lower bound.
flight_df['distance_agg'] = (flight_df['distance'] // 100) * 100

# Per bucket: number of delayed flights and the fraction of flights that were
# delayed. The fraction is the mean of the boolean flag, i.e. delayed flights in
# the bucket divided by all flights in the bucket; dividing by the total number
# of delays would mostly reflect how many flights each bucket contains.
grouped = (
    flight_df.groupby('distance_agg')['is_delayed']
    .agg(total_delays='sum', delay_percentage='mean')
    .reset_index()
)
grouped['delay_percentage'] = grouped['delay_percentage'].round(2)

# Overall number of delayed flights; kept in case later cells rely on it.
total_delays = grouped['total_delays'].sum()

flight_delays_by_distance_agg_df = grouped[['distance_agg', 'delay_percentage']]

plt.figure(figsize=(15, 8))
sns.barplot(x='distance_agg', y='delay_percentage', data=flight_delays_by_distance_agg_df, palette='viridis')


plt.xlabel('Distance [miles]', fontsize=14)
plt.ylabel('Percentage of delayed flights [%]', fontsize=14)
plt.title('Percentage of delayed flights by distance', fontsize=16)
plt.xticks(rotation=90)


plt.show()

### delays vs distance group

# Largest and smallest 'distance' value within every distance_group.
distance_by_group = flight_df.groupby('distance_group')['distance']
distance_grouped_max = distance_by_group.max().reset_index()
distance_grouped_min = distance_by_group.min().reset_index()

# Join both on the group; the merge suffixes the two 'distance' columns with
# _x (max) and _y (min), which are then given readable names.
flight_distance_by_distance_group = distance_grouped_max.merge(
    distance_grouped_min,
    on='distance_group',
    how='left',
).rename(columns={'distance_x': 'max', 'distance_y': 'min'})
flight_distance_by_distance_group

### distance_group by delay_percentage

# Per distance group: number of delayed flights and the fraction of flights that
# were delayed. The fraction is the mean of the boolean flag, i.e. delayed
# flights in the group divided by all flights in the group; dividing by the
# total number of delays would mostly reflect how many flights each group has.
distance_grouped = (
    flight_df.groupby('distance_group')['is_delayed']
    .agg(total_delays='sum', delay_percentage='mean')
    .reset_index()
)
distance_grouped['delay_percentage'] = distance_grouped['delay_percentage'].round(2)

# Overall number of delayed flights; kept in case later cells rely on it.
total_distance_delays = distance_grouped['total_delays'].sum()

flight_delays_by_distance_group_df = distance_grouped[['distance_group', 'delay_percentage']]

plt.figure(figsize=(10, 8))
sns.barplot(x='distance_group', y='delay_percentage', data=flight_delays_by_distance_group_df, palette='viridis')


plt.xlabel('Distance group', fontsize=14)
plt.ylabel('Percentage of delayed flights [%]', fontsize=14)
plt.title('Percentage of delayed flights by distance group', fontsize=16)
plt.xticks(rotation=90)


plt.show()


# Save the processed frame to CSV without the index column.
output_path = r"file_place\flight_df_01.csv"
flight_df.to_csv(output_path, index=False)