# Top 5s
## Top 5 aerolíneas

In [None]:
# Assumes the data lives in ../data/<year>.csv.bz2
# FIXME: point this at the data directory inside the repo once it has been pushed

import seaborn as sns
import matplotlib.pyplot as plt

import fnmatch

import pandas as pd
import time
import os
import shutil

import math
import numpy as np

from sklearn.metrics import mean_squared_error

#sns.set(color_codes=True)

# Scratch directory for intermediate CSV files, and the year labels
# (strings "1994" through "2008") used when building input file names.
tmp_dir = "./tmp_csv/"
years = list(map(str, range(1994, 2009)))

# CML (Cuadrados Mínimos Lineales)

In [None]:
# Hyper-parameters handed to lsqPredictor further down:
#   max_grade     - highest power of the polynomial part
#   monthly_freqs - values f used as sin(2*pi/f * t + p), i.e. they act as periods
#   phases        - phase offsets p: 0, pi/2, pi and 3*pi/2
# NOTE(review): sin(x + pi) == -sin(x), so the last two phases span the same
# functions as the first two - confirm whether all four are intended.
max_grade = 1
monthly_freqs = [3, 4, 6, 12]
phases = [quarter * math.pi / 2 for quarter in range(4)]

class lsqPredictor:
    """Linear least-squares predictor: polynomial trend plus sinusoids.

    Samples are indexed by their position ``t = 0, 1, 2, ...``; the model is a
    linear combination of ``t**0 .. t**max_grade`` and
    ``sin(2*pi/f * t + p)`` for every ``f`` in ``freqs`` and ``p`` in
    ``phases``. Because ``f`` enters as ``2*pi/f`` it acts as a period
    measured in samples.

    Attributes:
        phases: phase offsets (radians) of the sinusoidal features.
        freqs: values ``f`` of the sinusoidal features (see above).
        max_grade: highest power of the polynomial part.
        coefs: fitted coefficients, ``None`` until ``fit`` has been called.
    """

    def __init__(self, phases, freqs, max_grade):
        self.phases = phases
        self.freqs = freqs
        self.max_grade = max_grade
        self.coefs = None

    def _check_fitted(self):
        """Raise a clear error when coefficients are requested before fit()."""
        if self.coefs is None:
            raise RuntimeError("lsqPredictor has no coefficients yet; call fit() first")

    def trig_vals(self, t):
        """Return the sinusoidal features at position ``t`` (freq-major order)."""
        return [math.sin(2 * math.pi / f * t + p) for f in self.freqs for p in self.phases]

    def get_x_vals(self, x):
        """Return one feature row (float ndarray) per element of ``x``.

        Only ``len(x)`` is used: row ``i`` holds the features of position
        ``i``, independently of the values stored in ``x``.
        """
        return [
            np.array(
                [i**p for p in range(self.max_grade + 1)] + self.trig_vals(i),
                dtype='float',
            )
            for i in range(len(x))
        ]

    def print_coefs(self):
        """Print every fitted coefficient next to the feature it multiplies."""
        self._check_fitted()
        remaining = iter(self.coefs)
        for i in range(self.max_grade + 1):
            print("Coef de x^{}:\n{}".format(i, next(remaining)))
        for i in self.freqs:
            for j in self.phases:
                print("Coef de sinusoide con frecuencia {} y fase {}:\n{}".format(i, j, next(remaining)))

    def fit(self, train_set):
        """Fit the coefficients to ``train_set`` in the least-squares sense.

        Args:
            train_set: sequence of observed values; element ``i`` is the
                observation at position ``i``.

        Raises:
            ValueError: if ``train_set`` is empty.
        """
        rows = self.get_x_vals(train_set)
        if not rows:
            raise ValueError("train_set must contain at least one sample")
        A = np.stack(rows)
        y = np.asarray(train_set, dtype=float)
        # Phase-shifted sinusoids can be linearly dependent (sin(x + pi) is
        # -sin(x)), which makes A.T @ A singular and the normal equations
        # unsolvable or wildly inaccurate. lstsq works on A directly and
        # returns the minimum-norm solution for rank-deficient systems.
        self.coefs, *_ = np.linalg.lstsq(A, y, rcond=None)

    def pred(self, pred_set):
        """Return the model values at positions ``0 .. len(pred_set) - 1``.

        Only the length of ``pred_set`` is used; its values are ignored.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        self._check_fitted()
        full_A = np.stack(self.get_x_vals(pred_set))
        return full_A @ self.coefs

## Procesar datos

In [None]:
# Restrict processing to 1994-1999 (replaces the wider range defined earlier).
years = list(map(str, range(1994, 2000)))
def get_top_carriers():
    """Return the five carriers with the most flight records.

    Reads ``../data/<year>.csv.bz2`` for every entry of the module-level
    ``years`` list, counts the non-null ``FlightNum`` values per
    ``UniqueCarrier`` and adds the counts up across years.

    Returns:
        DataFrame indexed by ``UniqueCarrier`` with a single ``FlightNum``
        column holding the total count, limited to the 5 largest totals
        (empty when ``years`` is empty).
    """
    yearly_counts = []
    for year in years:
        print("Leyendo csv: {}".format(year))
        flights = pd.read_csv("../data/" + year + ".csv.bz2", compression="bz2",
                              usecols=["FlightNum", "UniqueCarrier"],
                              encoding="ISO 8859-1")
        # Keep only the small per-carrier count table; the raw frame is
        # released as soon as `flights` is rebound on the next iteration.
        yearly_counts.append(flights.groupby(by="UniqueCarrier").count())

    if not yearly_counts:
        return pd.DataFrame(columns=["FlightNum", "UniqueCarrier"]).set_index("UniqueCarrier")

    # Concatenate once instead of growing an (initially empty, object-dtype)
    # accumulator: this keeps FlightNum numeric so nlargest works, and avoids
    # re-grouping the running total on every year.
    totals = pd.concat(yearly_counts).groupby("UniqueCarrier").sum()
    return totals.nlargest(5, 'FlightNum')

# Compute the top-5 carriers table, cache it on disk and display it.
df = get_top_carriers()
# to_csv does not create missing directories, and nothing has created
# tmp_dir/carriers/ at this point, so make sure it exists first.
os.makedirs(tmp_dir + "carriers/", exist_ok=True)
df.to_csv(tmp_dir+"carriers/top_carriers.csv")
df

In [None]:
# Write the top-carriers table to the cache file (same path the cell that
# computes `df` already writes to, so this is only needed after changing `df`).
df.to_csv(tmp_dir+"carriers/top_carriers.csv")

In [None]:
# Reload the cached top-carriers table instead of recomputing it. No index
# column is specified, so UniqueCarrier comes back as an ordinary column.
df = pd.read_csv(tmp_dir+"carriers/top_carriers.csv")

In [None]:
# Extract the carrier codes as a Series. reset_index() makes this work both
# when UniqueCarrier is the index (freshly computed table) and when it is
# already a column (table reloaded from CSV).
top_carriers = df.reset_index()["UniqueCarrier"]
top_carriers

### Predecir

In [None]:
# Build the directory holding per-carrier monthly delay aggregates.

def _trimmed_mean(values, low=0.1, high=0.9):
    """Return the mean of the entries of `values` within its [low, high] quantiles."""
    lower = values.quantile(low)
    upper = values.quantile(high)
    return values[(values >= lower) & (values <= upper)].mean()


def _month_carrier_keys():
    """Return fresh grouping keys: calendar month of `date` and `UniqueCarrier`."""
    # NOTE(review): recent pandas releases prefer the 'ME' alias for month-end
    # frequency; 'M' is kept for the pandas version this notebook targets.
    return [pd.Grouper(key='date', freq='M'), pd.Grouper(key='UniqueCarrier')]


def create_grouped_files(subdir, years, carriers):
    """Aggregate ArrDelay per month and carrier and cache the result as CSV.

    For every year, ``../data/<year>.csv.bz2`` is read, restricted to the
    given carriers and grouped by (month, UniqueCarrier). Two files are
    written under ``tmp_dir + subdir``:

    * ``full.csv`` - plain mean of ArrDelay per group.
    * ``cut.csv``  - mean of ArrDelay after discarding, within each group,
      the values below the 10% quantile or above the 90% quantile.

    Both files have the columns ``date``, ``UniqueCarrier`` and ``ArrDelay``.

    Args:
        subdir: sub-directory of ``tmp_dir`` (with trailing slash) to write to.
        years: iterable of years used to build the input file names.
        carriers: carrier codes to keep (anything accepted by ``Series.isin``).
    """
    # Creates tmp_dir as well when it does not exist yet.
    os.makedirs(tmp_dir + subdir, exist_ok=True)

    columns = ["date", "UniqueCarrier", "ArrDelay"]
    cut_frames = []
    full_frames = []
    for year in years:
        print("Leyendo csv: {}".format(year))
        df = pd.read_csv("../data/" + str(year) + ".csv.bz2", compression="bz2",
                         usecols=["Month", "Year", "DayofMonth", "ArrDelay", "UniqueCarrier"],
                         encoding="ISO 8859-1")

        # Keep only the selected carriers; copy so that adding a column below
        # does not write into a view of the original frame.
        df = df.loc[df["UniqueCarrier"].isin(carriers)].copy()

        # Collapse Year/Month/DayofMonth into a single datetime column.
        df["date"] = pd.to_datetime(df.Year * 10000 + df.Month * 100 + df.DayofMonth,
                                    format='%Y%m%d')
        df = df[["date", "ArrDelay", "UniqueCarrier"]]

        # Plain monthly means, computed before any outlier removal.
        full_frames.append(df.groupby(_month_carrier_keys()).mean().reset_index())

        # Monthly means with outliers removed. The aggregation is restricted
        # to ArrDelay so the date and carrier columns never reach mean().
        cut_frames.append(
            df.groupby(_month_carrier_keys())["ArrDelay"]
              .apply(_trimmed_mean)
              .reset_index()
        )

    # Concatenate once at the end; fall back to an empty, correctly labelled
    # frame when no year was processed.
    df_cut = pd.concat(cut_frames, sort=False) if cut_frames else pd.DataFrame(columns=columns)
    df_full = pd.concat(full_frames, sort=False) if full_frames else pd.DataFrame(columns=columns)

    df_cut.to_csv(tmp_dir+subdir+"cut"+".csv")
    df_full.to_csv(tmp_dir+subdir+"full"+".csv")

# Build the monthly per-carrier aggregates (cut.csv / full.csv) for 1994-1999
# under tmp_dir/carriers/, restricted to the top carriers selected above.
years = [str(year) for year in range(1994, 2000)]
create_grouped_files("carriers/", years, top_carriers)

In [None]:
def plot_df(df1, df2, train_limit_axis, unit_str):
    """Plot predicted against real delay and mark where the training span ends.

    Args:
        df1: frame whose ``dl_pred`` column holds the predicted delay.
        df2: frame whose ``ArrDelay`` column holds the real delay.
        train_limit_axis: x position at which a vertical green line is drawn.
        unit_str: name of the time unit, inserted into the x-axis label.
    """
    _, ax = plt.subplots(figsize=(16, 5))

    df1.dl_pred.plot(color='orange', grid=True, label='pred delay', ax=ax)
    df2.ArrDelay.plot(color='blue', grid=True, label='real delay', ax=ax)

    # Set the label after plotting so nothing drawn later can replace it.
    ax.set_xlabel('Predicción para el i-ésimo {}'.format(unit_str))
    ax.axvline(x=train_limit_axis, color="green")
    # The line labels are only visible once a legend is drawn.
    ax.legend()
    plt.show()

# Fit one least-squares model per top carrier on the outlier-trimmed monthly
# means, plot the prediction against the untrimmed means and print the RMSE
# over the months that were not used for training.

# The cached aggregates do not depend on the carrier: read them once.
cut_all = pd.read_csv(tmp_dir+"carriers/cut.csv")
full_all = pd.read_csv(tmp_dir+"carriers/full.csv")

# Training uses lower_year_train_limit < year < year_train_limit.
year_train_limit = 1997
lower_year_train_limit = 1993

for carrier in top_carriers:
    # Copy the filtered rows so the column assignments below do not write
    # into a view of the shared frames.
    df = cut_all[cut_all["UniqueCarrier"] == carrier].copy()
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    df_full = full_all[full_all["UniqueCarrier"] == carrier].copy()
    df_full['date'] = pd.to_datetime(df_full['date'], errors='coerce')

    # The predictor indexes time by row position, so both frames must be in
    # chronological order (sort_values returns a new frame).
    df = df.sort_values(by="date")
    df_full = df_full.sort_values(by="date")

    train_mask = (df["date"].dt.year < year_train_limit) & \
                 (df["date"].dt.year > lower_year_train_limit)
    train_delays = df.loc[train_mask, "ArrDelay"]

    lpr = lsqPredictor(phases, monthly_freqs, max_grade)
    lpr.fit(train_delays)
    #lpr.print_coefs()

    # pred() only uses the length of its argument: it returns the model value
    # for every row position of the carrier, training span included.
    df["dl_pred"] = lpr.pred(df["ArrDelay"])
    plot_df(df.reset_index(drop=True), df_full.reset_index(drop=True), len(train_delays), "mes")

    # Evaluate on everything from year_train_limit onwards. Using ">" here
    # would leave that year out of both the training and the test span.
    error = math.sqrt(
        mean_squared_error(
            df_full.loc[df_full["date"].dt.year >= year_train_limit].ArrDelay,
            df.loc[df["date"].dt.year >= year_train_limit].dl_pred
        )
    )
    print(error)