In [1]:
import os.path
import pathlib
from collections import namedtuple
from datetime import datetime, timedelta
from textwrap import wrap

import dask
import dask.dataframe as dd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter



# Record for one event's series: the event description, the event date,
# a label naming the measured variable, and the computed values.
Result = namedtuple('Result', ['desc', 'date', 'var', 'values'])

In [4]:
# Flight-records CSV to analyse ('some.csv' is the alternative input that was
# used here before - presumably a smaller sample; confirm before switching).
file = 'full.csv'

# Columns to read from the flight CSV.
required_cols = [
    'FL_DATE',
    'DISTANCE',
    'DEP_DELAY',
    'ARR_DELAY',
    'CANCELLED',
]

In [11]:
# Header-less, semicolon-separated event list: "<dd.mm.yyyy>;<description>".
events_file = 'events_stripped.csv'  # alternative input: 'event.csv'
EVENT_DATE_FORMAT = '%d.%m.%Y'


def dateparse(x):
    """Parse a single 'dd.mm.yyyy' string into a ``datetime``.

    No longer used by the load below; kept so the name stays defined in case
    another cell still references it.
    """
    return datetime.strptime(x, EVENT_DATE_FORMAT)


# Read the date column as plain text and convert it in one vectorised call.
# ``read_csv(date_parser=...)`` is deprecated in pandas 2.x and invoked the
# Python-level parser row by row.
events = pd.read_csv(
    events_file,
    sep=';',
    names=['date', 'description'],
    dtype={'date': str},
)
events['date'] = pd.to_datetime(events['date'], format=EVENT_DATE_FORMAT)

In [6]:
# Half-width of the time window inspected around each event date (one week).
delta = timedelta(weeks=1)

In [7]:
# Dask dataframe over the flight CSV, restricted to the required columns,
# with FL_DATE parsed as a datetime column.
df = dd.read_csv(
    file,
    usecols=required_cols,
    parse_dates=['FL_DATE'],
)

In [8]:
def human_format(num, pos):
    """Format a number with a thousands suffix ('', K, M, G, T, P).

    The signature matches what ``FuncFormatter`` expects for a tick callback.

    Args:
        num: The numeric value to format.
        pos: Tick position supplied by the formatter; not used.

    Returns:
        The value scaled down by powers of 1000, rendered with no decimals
        and followed by the matching suffix, e.g. ``2000 -> '2K'``.
        Values beyond the largest suffix stay expressed in 'P'
        (e.g. ``1e18 -> '1000P'``).
    """
    suffixes = ['', 'K', 'M', 'G', 'T', 'P']
    magnitude = 0
    # Stop at the last suffix: without this cap, very large values index past
    # the end of the list and infinite values never leave the loop.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '%.0f%s' % (num, suffixes[magnitude])


def get_plot(setHumanFormat = False):
    """Create a figure with one axes and x tick labels rotated by 45 degrees.

    Args:
        setHumanFormat: When true, y-axis tick labels are rendered through
            ``human_format``.

    Returns:
        The ``(figure, axes)`` pair produced by ``plt.subplots()``.
    """
    figure, axes = plt.subplots()
    # Applies to the current axes.
    plt.xticks(rotation=45)

    if setHumanFormat:
        y_formatter = FuncFormatter(human_format)
        axes.yaxis.set_major_formatter(y_formatter)

    return figure, axes

In [None]:
# Mean departure delay per day, over delayed flights only (DEP_DELAY > 0),
# in the window [event date - delta, event date + delta] around each event.
lazy_delays = []
for event in events.itertuples(index=False):
    window_start, window_end = event.date - delta, event.date + delta
    in_window = df[
        (df.FL_DATE >= window_start)
        & (df.FL_DATE <= window_end)
        & (df.DEP_DELAY > 0)
    ]
    # mean() is the sum of delays divided by the number of delayed flights.
    lazy_delays.append(in_window.groupby('FL_DATE').DEP_DELAY.mean())

# Evaluate every window in a single dask pass so the CSV is not re-read for
# each event (and again each time a lazy result is iterated while plotting).
# The stored values are therefore pandas Series, sorted by day.
delay_results = [
    Result(desc=event.description, date=event.date, var='delay', values=delays.sort_index())
    for event, delays in zip(events.itertuples(index=False), dask.compute(*lazy_delays))
]



In [None]:
# savefig does not create missing directories, so make sure the target exists.
pathlib.Path('plots').mkdir(parents=True, exist_ok=True)

for result in delay_results:
    # Materialise a lazy (dask) series once; iterating it and its index
    # separately would run the computation twice.
    series = result.values.compute() if hasattr(result.values, 'compute') else result.values
    event_day = result.date.date()

    fig, ax = get_plot(True)

    ax.plot(list(series.index), list(series))
    # Vertical marker at the event date.
    ax.axvline(result.date, color='red', label='data wydarzenia')

    # Wrap long event descriptions so the title fits the figure width.
    ax.set_title('\n'.join(wrap(result.desc, 60)))
    ax.set_xlabel('Dzień')
    ax.set_ylabel('Suma minut opóźnień na lot')
    ax.legend()

    plt.savefig(f'plots/{event_day}_delay_sum.png', bbox_inches='tight')
    plt.show()
    print(event_day)
    # Close this figure explicitly so figures do not accumulate over the loop.
    plt.close(fig)

In [12]:
# Cancellation rate per day in the window [event date - delta, event date + delta]
# around each event.
lazy_rates = []
for event in events.itertuples(index=False):
    window_start, window_end = event.date - delta, event.date + delta
    in_window = df[(df.FL_DATE >= window_start) & (df.FL_DATE <= window_end)]
    # Rate = sum(CANCELLED) / count(CANCELLED), i.e. the mean of the flag.
    # The denominator must not be DEP_DELAY.count(): count() skips missing
    # values, and DEP_DELAY is presumably missing for cancelled flights
    # (TODO confirm against the data), which would shrink the denominator.
    lazy_rates.append(in_window.groupby('FL_DATE').CANCELLED.mean())

# Evaluate every window in a single dask pass so the CSV is not re-read for
# each event; the stored values are pandas Series, sorted by day.
cancelled_results = [
    Result(desc=event.description, date=event.date, var='cancells', values=rates.sort_index())
    for event, rates in zip(events.itertuples(index=False), dask.compute(*lazy_rates))
]

In [None]:
# savefig does not create missing directories, so make sure the target exists.
pathlib.Path('plots/cancells').mkdir(parents=True, exist_ok=True)

for result in cancelled_results:
    # Materialise a lazy (dask) series once; iterating it and its index
    # separately would run the computation twice.
    series = result.values.compute() if hasattr(result.values, 'compute') else result.values
    event_day = result.date.date()

    fig, ax = get_plot()

    ax.plot(list(series.index), list(series))
    # Vertical marker at the event date.
    ax.axvline(result.date, color='red', label='data wydarzenia')

    # Wrap long event descriptions so the title fits the figure width.
    ax.set_title('\n'.join(wrap(result.desc, 60)))
    ax.set_xlabel('Dzień')
    ax.set_ylabel('Współczynnik odwołań')
    ax.legend()
    plt.savefig(f'plots/cancells/{event_day}_cancells.png', bbox_inches='tight')
    print(event_day)
    # Close this figure explicitly so figures do not accumulate over the loop.
    plt.close(fig)