<a href="https://colab.research.google.com/github/lgolab/Fine-tuning-data-dependencies/blob/main/codes/MFD_Tableaux_FlightDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Data Preparation

### Importing Libraries

In [27]:
import csv
from datetime import datetime
from tqdm import tqdm
from itertools import chain, combinations

### Moving data to colab

In [28]:
%cd /content/
!cp -r /content/drive/MyDrive/FD-project/flight/ /content/flight
%cd /content/flight/

/content
/content/flight


### Reading the data and Creating the dataset

In [29]:
dataset_address = 'August2018-Nationwide-flights.csv'

# Columns kept from the raw CSV; every tuple in `dataset` has exactly these keys.
attrs = ['FL_DATE', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM',
         'ORIGIN', 'DEST',
         'ORIGIN_CITY_MARKET_ID', 'DEST_CITY_MARKET_ID',
         'DEP_DELAY', 'ARR_DELAY']
attrIndex = []

dataset = []
# newline='' lets the csv module do its own newline handling (needed for
# quoted fields that contain line breaks).
with open(dataset_address, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')

    # The first row is the header: locate the column of each wanted attribute.
    # A missing column raises ValueError here.
    header = next(reader, None)
    if header is not None:
        attrIndex = [header.index(attr) for attr in attrs]

    # Every remaining row becomes a dict of raw strings (no type conversion).
    for row in reader:
        dataset.append({attr: row[index] for attr, index in zip(attrs, attrIndex)})


print('Number of tuples in dataset: ', len(dataset), '\nA sample tuple:\n', dataset[5])


Number of tuples in dataset:  701352 
A sample tuple:
 {'FL_DATE': '2018-08-01', 'OP_CARRIER_AIRLINE_ID': '19805', 'OP_CARRIER_FL_NUM': '1594', 'ORIGIN': 'DTW', 'DEST': 'ORD', 'ORIGIN_CITY_MARKET_ID': '31295', 'DEST_CITY_MARKET_ID': '30977', 'DEP_DELAY': '-10.00', 'ARR_DELAY': '-7.00'}


### Filtering the dataset

In [30]:
US_major_airports = ['ATL', 'BOS', 'DEN', 'ORD', 'LAX', 'CLT', 'LAS', 'PHX', 'JFK', 'SEA']

# Allowed values per attribute. A tuple survives the filter only when each
# listed attribute holds one of its allowed values.
filter_dic = {
        # 'FL_DATE': ['2018-08-01', '2018-08-02', '2018-08-03', '2018-08-04', '2018-08-05',
        #             '2018-08-06', '2018-08-07', '2018-08-08', '2018-08-09', '2018-08-10'],     # 10/31
        'ORIGIN': US_major_airports,    # 10/many!
        'DEST': US_major_airports    # 10/many!
}

# The filtered list shares its tuple dicts with `dataset` (no copies are made).
filtered_dataset = [
    tupple
    for tupple in dataset
    if all(tupple[key] in allowed for key, allowed in filter_dic.items())
]

print('Dataset size before filtering: ', len(dataset))
# dataset = filtered_dataset
print('Dataset size after filtering: ', len(filtered_dataset))

# for tupple in dataset:
#     print(tupple['ORIGIN'], tupple['DEST'])

Dataset size before filtering:  701352
Dataset size after filtering:  43266


### Making fields human-readable

In [31]:
def _load_lookup(path):
    """Read a two-column CSV (key, value) with one header row into a dict."""
    # newline='' lets the csv module do its own newline handling.
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header row
        return {row[0]: row[1] for row in reader}


AirlineMap = _load_lookup('AirCarriers')
AirportMap = _load_lookup('Airports')

# Replace the carrier id of every tuple by its readable name, in place.
# `filtered_dataset` holds the same dict objects, so it sees the change too.
# Values that are already a carrier name are skipped, which makes the cell safe
# to re-run; an id that is genuinely unknown still raises KeyError.
airline_names = set(AirlineMap.values())
for tupple in dataset:
    carrier = tupple['OP_CARRIER_AIRLINE_ID']
    if carrier in airline_names:
        continue
    tupple['OP_CARRIER_AIRLINE_ID'] = AirlineMap[carrier]
    # tupple['ORIGIN'] = tupple['ORIGIN'] + ': ' + AirportMap[tupple['ORIGIN']]
    # tupple['DEST'] = tupple['DEST'] + ': ' + AirportMap[tupple['DEST']]

# Test:
print(len(dataset))
print(dataset[0])

701352
{'FL_DATE': '2018-08-01', 'OP_CARRIER_AIRLINE_ID': 'American Airlines Inc.: AA', 'OP_CARRIER_FL_NUM': '1587', 'ORIGIN': 'JFK', 'DEST': 'PHX', 'ORIGIN_CITY_MARKET_ID': '31703', 'DEST_CITY_MARKET_ID': '30466', 'DEP_DELAY': '9.00', 'ARR_DELAY': '44.00'}


# 2. Finding the best MFDs

In [6]:
def calculate_MFD_delta(arr, confidence = 0.9):
    """Return the smallest value range left after trimming outliers.

    Up to ``n - round(n * confidence)`` values may be dropped, split in any way
    between the low and the high end of ``arr``; the result is the smallest
    ``max - min`` of what remains.

    Args:
        arr: numeric values sorted in ascending order.
        confidence: fraction of the values that must be kept.

    Returns:
        The minimal range (delta) of the kept values.

    Raises:
        IndexError: if ``arr`` is empty.
    """
    n = len(arr)
    removals = n - round(n * confidence)
    # Always keep at least one value: without this cap a low confidence makes
    # removals == n and the window indexing below runs off the list.
    removals = min(removals, n - 1)
    best_delta = arr[-1] - arr[0]

    # Drop i values from the low end and (removals - i) from the high end,
    # keeping the split that minimises the remaining range.
    for i in range(removals+1):
        best_delta = min(best_delta, arr[-(removals-i+1)] - arr[i])

    return best_delta

In [7]:
def calculate_MFD_overall_delta(MFD_LHS_attrs, MFD_target, confidence1 = 0.9, confidence2 = 1):
    """Return the delta of an MFD over the global ``dataset``.

    Tuples are grouped by their values on ``MFD_LHS_attrs``; each group gets
    the outlier-trimmed delta of its ``MFD_target`` values, and the result is
    the ``confidence2`` quantile of those group deltas.

    Args:
        MFD_LHS_attrs: attribute names forming the left-hand side.
        MFD_target: name of the numeric right-hand-side attribute.
        confidence1: fraction of values kept inside each group.
        confidence2: fraction of groups whose delta must not exceed the result.

    Returns:
        The selected group delta.

    Raises:
        IndexError: if no tuple has a numeric target value.
    """
    # LHS value tuple -> list of numeric target values.
    group_values = {}

    for tupple in dataset:
        # Cheap check for a signed decimal string; non-numeric values such as
        # empty strings are skipped.
        if tupple[MFD_target].replace('.','',1).replace('-','',1).isnumeric():
            pattern_tuple = tuple(tupple[attr] for attr in MFD_LHS_attrs)
            group_values.setdefault(pattern_tuple, []).append(float(tupple[MFD_target]))

    deltas = sorted(
        calculate_MFD_delta(sorted(values), confidence = confidence1)
        for values in group_values.values()
    )
    n = len(deltas)
    # Clamp the quantile position: for a small confidence2 the raw index
    # round(n * confidence2) - 1 becomes -1, which would silently select the
    # LARGEST delta. With n == 0 the lookup still raises IndexError.
    index = min(max(round(n * confidence2) - 1, 0), n - 1)

    return deltas[index]



In [8]:
# NOTE(review): this rebinds the global `attrs` that the data-loading cell
# uses as its list of CSV columns.
attrs = ['FL_DATE', 'OP_CARRIER_AIRLINE_ID', 'ORIGIN', 'DEST']

# Print the DEP_DELAY delta for every subset of `attrs` of size 0..len(attrs)-1.
# NOTE(review): the range stops before len(attrs), so the full four-attribute
# set is not evaluated (other cells use len(...)+1) - confirm this is intended.
for x in chain.from_iterable(combinations(attrs, r) for r in range(0, len(attrs))):
    general_pattern = list(x)
    print(general_pattern, calculate_MFD_overall_delta(general_pattern, 'DEP_DELAY'))

[] 70.0
['FL_DATE'] 134.0
['OP_CARRIER_AIRLINE_ID'] 195.0
['ORIGIN'] 92.0
['DEST'] 98.0
['FL_DATE', 'OP_CARRIER_AIRLINE_ID'] 312.0
['FL_DATE', 'ORIGIN'] 211.0
['FL_DATE', 'DEST'] 319.0
['OP_CARRIER_AIRLINE_ID', 'ORIGIN'] 244.0
['OP_CARRIER_AIRLINE_ID', 'DEST'] 244.0
['ORIGIN', 'DEST'] 158.0
['FL_DATE', 'OP_CARRIER_AIRLINE_ID', 'ORIGIN'] 1278.0
['FL_DATE', 'OP_CARRIER_AIRLINE_ID', 'DEST'] 872.0
['FL_DATE', 'ORIGIN', 'DEST'] 708.0
['OP_CARRIER_AIRLINE_ID', 'ORIGIN', 'DEST'] 244.0


# 3. Implementing CWSC algorithm

### Function for adding important patterns

In [32]:
def add_patterns(general_patterns):
    """Register every distinct value combination of the given attributes.

    For each tuple of the global ``dataset`` the values on ``general_patterns``
    form a pattern. New patterns are appended to the global ``patterns`` with
    an empty member list in ``pattern_members``; the tuple index is then added
    to the pattern's members and the pattern index to ``entry_patterns``.

    Args:
        general_patterns: list of attribute names defining the pattern shape.
    """
    global patterns, pattern_members, entry_patterns

    # Value combination -> index in `patterns`, for this call only.
    index_of = {}

    for row_number, row in enumerate(dataset):
        values = tuple(row[attr] for attr in general_patterns)

        if values not in index_of:
            index_of[values] = len(patterns)
            patterns.append({attr: row[attr] for attr in general_patterns})
            pattern_members.append([])

        slot = index_of[values]
        pattern_members[slot].append(row_number)
        entry_patterns[row_number].append(slot)


### Functions for setting cost of patterns

In [33]:
def calculate_delta(arr, confidence = 0.9):
    """Return the smallest value range left after trimming outliers.

    Up to ``n - round(n * confidence)`` values may be dropped, split in any way
    between the low and the high end of ``arr``; the result is the smallest
    ``max - min`` of what remains.

    Args:
        arr: numeric values sorted in ascending order.
        confidence: fraction of the values that must be kept.

    Returns:
        The minimal range (delta) of the kept values.

    Raises:
        IndexError: if ``arr`` is empty.
    """
    n = len(arr)
    removals = n - round(n * confidence)
    # Always keep at least one value: without this cap a low confidence makes
    # removals == n and the window indexing below runs off the list.
    removals = min(removals, n - 1)
    best_delta = arr[-1] - arr[0]

    # Drop i values from the low end and (removals - i) from the high end,
    # keeping the split that minimises the remaining range.
    for i in range(removals+1):
        best_delta = min(best_delta, arr[-(removals-i+1)] - arr[i])

    return best_delta

In [34]:
def calculate_costs(MFD_LHS_attrs, MFD_target, base = 2, confidence1 = 0.9, confidence2 = 1, fail_tableaux = False):
    """Compute the delta and the cost of every registered pattern.

    Results are written to the globals ``pattern_deltas`` and
    ``pattern_costs`` (indexed like ``patterns``). As a by-product the globals
    ``group_members`` (full-LHS value tuple -> sorted numeric target values)
    and ``group_deltas`` (full-LHS value tuple -> trimmed delta) are rebuilt.

    Args:
        MFD_LHS_attrs: attribute names forming the left-hand side.
        MFD_target: name of the numeric right-hand-side attribute.
        base: base of the exponential cost function.
        confidence1: fraction of values kept inside each group.
        confidence2: fraction of a pattern's groups whose delta must not
            exceed the pattern delta.
        fail_tableaux: when true the exponent is negated, so patterns with a
            large delta become cheap.
    """
    global pattern_costs, pattern_deltas, group_deltas, group_members
    pattern_costs = [0 for _ in range(len(patterns))]
    pattern_deltas = [0 for _ in range(len(patterns))]

    inf = 1e8          # cost of a pattern that has no numeric target values
    delta_scale = 15   # hard-coded divisor applied to the delta in the exponent

    # Calculating fixed-antecedent group deltas.
    # The group key of every entry is computed once and reused below
    # (None marks entries whose target value is not numeric).
    entry_groups = []
    group_members = {}

    for entry in dataset:
        value = entry[MFD_target]
        if not value.replace('.','',1).replace('-','',1).isnumeric():
            entry_groups.append(None)
            continue

        group_tuple = tuple(entry[attr] for attr in MFD_LHS_attrs)
        entry_groups.append(group_tuple)
        group_members.setdefault(group_tuple, []).append(float(value))

    group_deltas = {}
    for group_tuple, arr in group_members.items():
        arr.sort()
        group_deltas[group_tuple] = calculate_delta(arr, confidence=confidence1)

    # Calculating pattern deltas:
    for i in range(len(patterns)):
        pattern_groups = {
            entry_groups[entry_index]
            for entry_index in pattern_members[i]
            if entry_groups[entry_index] is not None
        }

        if len(pattern_groups) == 0:
            pattern_costs[i] = inf
            continue

        deltas = sorted(group_deltas[group_tuple] for group_tuple in pattern_groups)
        n = len(deltas)
        # Clamp the quantile position: for a small confidence2 the raw index
        # round(n * confidence2) - 1 becomes -1 and would pick the largest delta.
        index = min(max(round(n * confidence2) - 1, 0), n - 1)
        overall_delta = deltas[index]

        pattern_deltas[i] = overall_delta

        exponent = overall_delta / delta_scale
        if fail_tableaux:
            pattern_costs[i] = base ** -exponent
        else:
            pattern_costs[i] = base ** exponent


### CWSC function

In [35]:
# In this function, we use dataset, patterns, pattern_members, entry_patterns, and pattern_costs as global variables

def CWSC(k, coverage, fail_tableaux = False, max_coverage = 0.2):
    """Greedily pick at most ``k`` patterns that cover enough of the dataset.

    Reads the globals ``dataset``, ``patterns``, ``pattern_members``,
    ``entry_patterns`` and ``pattern_costs``. In each of the ``k`` rounds the
    unselected pattern with the best ``newly covered tuples / cost`` ratio is
    chosen among those that cover at least ``remaining / rounds_left`` tuples.

    Args:
        k: maximum number of patterns in the tableau.
        coverage: fraction of the dataset that must be covered.
        fail_tableaux: when true, a pattern may cover at most the remaining
            ``max_coverage`` budget.
        max_coverage: fraction of the dataset used as that budget.

    Returns:
        ``(solution_patterns, total_cost)``: the chosen pattern indices in
        selection order and the sum of their costs. When no admissible
        tableau exists, 'Impossible!' is printed and ``([], inf)`` is
        returned, so callers that unpack two values keep working.
    """
    total_cost = 0

    # Uncovered members of every pattern (marginal benefit = set size).
    members = [set(pattern_members[i]) for i in range(len(patterns))]

    selected = [0 for _ in range(len(patterns))]
    solution_patterns = []
    rem = int(len(dataset) * coverage)
    max_rem = int(len(dataset) * max_coverage)

    for i in range(k, 0, -1):

        max_gain = 0.0
        max_pattern_index = -1

        for index in range(len(patterns)):
            if selected[index] != 0:
                continue

            marginal_benefit = len(members[index])
            # With i rounds left, a pattern must cover its share of `rem`.
            if marginal_benefit < rem/i:
                continue
            if fail_tableaux and marginal_benefit > max_rem:
                continue

            marginal_gain = marginal_benefit / pattern_costs[index]
            if marginal_gain > max_gain:
                max_gain = marginal_gain
                max_pattern_index = index

        if max_pattern_index == -1:
            print('Impossible!')
            return [], float('inf')

        # selecting the next pattern
        selected[max_pattern_index] = 1
        total_cost += pattern_costs[max_pattern_index]
        solution_patterns.append(max_pattern_index)
        rem -= len(members[max_pattern_index])
        max_rem -= len(members[max_pattern_index])
        if rem <= 0:
            print('\nTotal cost is ', total_cost, '\n')
            return solution_patterns, total_cost

        # updating the marginal benefits: newly covered tuples no longer
        # count for any other pattern
        for entry_index in members[max_pattern_index]:
            for pattern_index in entry_patterns[entry_index]:
                if pattern_index != max_pattern_index:
                    members[pattern_index].remove(entry_index)

    # Only reached when k <= 0: no round was run at all.
    print('Impossible!')
    return [], float('inf')


### Running the algorithm on the dataset

In [36]:
import timeit

def generate_MFD_tableaux(MFD_LHS_attrs, MFD_target, k = 20, confidence = 0.9, coverage = 0.5, fail_tableaux = False, max_coverage = 0.15):
    """Build a tableau for an MFD and report timing for each stage.

    Resets the globals ``patterns``, ``pattern_members`` and
    ``entry_patterns``, registers the patterns of every subset of
    ``MFD_LHS_attrs``, computes pattern costs and runs ``CWSC``.

    Args:
        MFD_LHS_attrs: attribute names forming the left-hand side.
        MFD_target: name of the numeric right-hand-side attribute.
        k: maximum tableau size passed to ``CWSC``.
        confidence: per-group confidence passed to ``calculate_costs``.
        coverage: required coverage passed to ``CWSC``.
        fail_tableaux: forwarded to ``calculate_costs`` and ``CWSC``.
        max_coverage: forwarded to ``CWSC``.

    Returns:
        ``(solution_patterns, total_cost)`` as returned by ``CWSC``.
    """
    global patterns, pattern_members, entry_patterns

    # Stage 1: adding patterns for every subset of the LHS attributes.
    print('\nAdding all the patterns... \n')
    started = timeit.default_timer()

    patterns = []
    pattern_members = []
    entry_patterns = [[] for _ in dataset]
    for subset_size in range(len(MFD_LHS_attrs) + 1):
        for subset in combinations(MFD_LHS_attrs, subset_size):
            general_pattern = list(subset)
            print(general_pattern)
            add_patterns(general_pattern)

    patterns_done = timeit.default_timer()
    print('\nAdding patterns finished!\n')
    print('Time elapsed: ', patterns_done-started)

    print('Number of patterns: ', len(patterns))

    # Stage 2: calculating pattern costs.
    print('\nCalculating pattern costs...\n')
    calculate_costs(MFD_LHS_attrs, MFD_target, confidence1=confidence, fail_tableaux=fail_tableaux)

    costs_done = timeit.default_timer()
    print('\nCalculating pattern costs finished!\n')
    print('Time elapsed: ', costs_done-patterns_done)

    # Stage 3: running the CWSC algorithm.
    print('\nRunning CWSC algorithm...\n')
    solution_patterns, total_cost = CWSC(k, coverage, fail_tableaux=fail_tableaux, max_coverage=max_coverage)

    finished = timeit.default_timer()
    print('Time elapsed: ', finished-costs_done, '\n')
    print('\nTotal elapsed time: ', finished-started, '\n')

    # Report the chosen patterns together with their deltas.
    print('\nTableaux:', '(size = ' + str(len(solution_patterns)) + ')\n')
    for chosen in solution_patterns:
        print(patterns[chosen], 'delta:', pattern_deltas[chosen])

    return solution_patterns, total_cost

In [37]:
# Shallow copy: a new list that holds the same tuple dicts as `dataset`.
copy_dataset = dataset[:]
print(len(filtered_dataset), len(dataset), len(copy_dataset))

# Uncomment one of the lines below to choose the data the next cells run on.
# dataset = filtered_dataset
# dataset = copy_dataset
# dataset = copy_dataset[0:100000]

43266 701352 701352


In [17]:
# features = ['location', 'day', 'hour', 'source']

# Actual MFD tableaux: 
# Runs on whatever the global `dataset` currently is; a, b receive the chosen
# pattern indices and the total cost.
a, b = generate_MFD_tableaux(
    MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'],
    MFD_target = 'DEP_DELAY', 
    k = 50, coverage = 0.6,
    fail_tableaux = False)

# Fail tableaux: 
# NOTE(review): the attribute names below are not columns of the flight data
# loaded in this notebook - presumably left over from another dataset.
# generate_MFD_tableaux(
#     MFD_LHS_attrs = ['location', 'day', 'hour'],
#     MFD_target = 'temperature', 
#     k = 10,
#     coverage = 0.05, max_coverage =0.10,
#     fail_tableaux = True)


Adding all the patterns... 

[]
['OP_CARRIER_AIRLINE_ID']
['OP_CARRIER_FL_NUM']
['ORIGIN']
['DEST']
['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM']
['OP_CARRIER_AIRLINE_ID', 'ORIGIN']
['OP_CARRIER_AIRLINE_ID', 'DEST']
['OP_CARRIER_FL_NUM', 'ORIGIN']
['OP_CARRIER_FL_NUM', 'DEST']
['ORIGIN', 'DEST']
['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN']
['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'DEST']
['OP_CARRIER_AIRLINE_ID', 'ORIGIN', 'DEST']
['OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST']
['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST']

Adding patterns finished!

Time elapsed:  2.5281443789999685
Number of patterns:  21775

Calculating pattern costs...


Calculating pattern costs finished!

Time elapsed:  2.493409298000188

Running CWSC algorithm...


Total cost is  10680.360445154929 

Time elapsed:  1.0029474479997589 


Total elapsed time:  6.024501124999915 


Tableaux: (size = 50)

{'ORIGIN': 'LAX', 'DEST': 'SEA'} delta: 93.0
{'ORIGIN': 'LAS', 'DEST': 'LAX'} delt

In [18]:
# Spot check of the last run. Pattern 0 is the empty pattern (registered first),
# so its member count is the number of tuples processed; 3000 is an arbitrary index.
print(pattern_deltas[0], pattern_deltas[3000], len(pattern_members[0]))

893.0 147.0 43266


In [38]:
import numpy as np 

# Summary of the most recent calculate_costs run.
print('Dataset size: ', len(dataset))
print('Number of patterns: ', len(pattern_deltas))

print('Average pattern delta: ', np.mean(pattern_deltas))

print('Number of fixed-antecedent groups: ', len(group_deltas.keys()))
# Deltas of all fixed-antecedent groups, in the dict's insertion order.
all_deltas = list(group_deltas.values())

print('Average group delta: ', np.mean(all_deltas))
print('Variance of group deltas: ', np.var(all_deltas))
print('STD of group deltas: ', np.std(all_deltas))

# Distribution of the group deltas.
import plotly.express as px
fig = px.histogram(x = all_deltas)
fig.show()

Dataset size:  701352
Number of patterns:  21775
Average pattern delta:  46.53363949483352
Number of fixed-antecedent groups:  2787
Average group delta:  42.194833153928954
Variance of group deltas:  2472.406604089493
STD of group deltas:  49.7233004142876


In [39]:
%%capture

import timeit

# Running time of the full pipeline as a function of the tableau size k.
tableau_sizes = []
running_times = []

# dataset = copy_dataset[0:100000]

for i in range(50, 401, 50):
    
    # Time one complete run (pattern generation, costs and CWSC) with k = i.
    t1 = timeit.default_timer()
    a, b = generate_MFD_tableaux(
        MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'],
        MFD_target = 'DEP_DELAY', 
        k = i, coverage = 0.6, confidence = 0.9,
        fail_tableaux = False)
    t2 = timeit.default_timer()

    tableau_sizes.append(i)
    running_times.append(t2-t1)

# Rebind `dataset` to a fresh shallow copy of copy_dataset; this only matters
# when the slicing line above was enabled.
dataset = copy_dataset[:]

In [43]:
import plotly
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objects as go

fig = go.Figure()

# Pair every measured running time with the tableau size it belongs to.
X = [tableau_sizes[i] for i in range(len(running_times))]
Y = [running_times[i] for i in range(len(running_times))]

fig.add_trace(go.Scatter(x=X, y=Y,
                    mode='lines+markers'))

fig.update_layout(
    xaxis = dict(
        # tickmode = 'array',
        # tickvals = dataset_sizes,
        # dtick = 50
    ), 
    yaxis = dict(
        # tickmode = 'linear',
        # tick0 = 0,
        dtick = 20
    )
)

# The bold tag is now closed properly (the titles previously ended in "<b>").
fig.update_yaxes(title="<b>Running time (Sec)</b>")
fig.update_xaxes(title='<b>Input tableau size (k)</b>')
fig.update_layout(yaxis_tickformat = '1')
fig.update_layout(xaxis_tickformat = '1')
fig.for_each_xaxis(lambda axis: axis.title.update(font=dict(size=22)))
fig.for_each_yaxis(lambda axis: axis.title.update(font=dict(size=22)))

fig.update_layout(
    font=dict(
        # family="Courier New, monospace",
        size=22,
        # color="RebeccaPurple"
    )
)

fig.show()

In [None]:
# Sanity check: both lists should have the same length after the benchmark.
print(len(dataset))
print(len(copy_dataset))

701352
701352


# 3.5 Baseline 

In [20]:
def set_cover(coverage):
    """Baseline greedy partial set cover that ignores pattern costs.

    Reads the globals ``dataset``, ``patterns``, ``pattern_members``,
    ``entry_patterns``, ``pattern_costs`` and ``banned_patterns``. The
    non-banned pattern covering the most uncovered tuples is selected
    repeatedly until the required number of tuples is covered.

    Args:
        coverage: fraction of the dataset that must be covered.

    Returns:
        ``(solution_patterns, total_cost)``: the chosen pattern indices in
        selection order and the sum of their costs. When the coverage cannot
        be reached, 'Impossible!' is printed and ``([], inf)`` is returned,
        so callers that unpack two values keep working.
    """
    global banned_patterns
    total_cost = 0

    # Uncovered members of every pattern (marginal support = set size).
    members = [set(pattern_members[i]) for i in range(len(patterns))]

    selected = [0 for _ in range(len(patterns))]
    solution_patterns = []
    rem = int(len(dataset) * coverage)

    while True:

        max_support = 0
        max_pattern_index = -1

        for index in range(len(patterns)):
            marginal_support = len(members[index])

            if banned_patterns[index] == 0 and selected[index] == 0 and marginal_support > max_support:
                max_support = marginal_support
                max_pattern_index = index

        # No allowed pattern covers any further tuple.
        if max_pattern_index == -1:
            print('Impossible!')
            return [], float('inf')

        # selecting the next pattern
        selected[max_pattern_index] = 1
        total_cost += pattern_costs[max_pattern_index]
        solution_patterns.append(max_pattern_index)
        rem -= len(members[max_pattern_index])

        if rem <= 0:
            print('\nTotal cost is ', total_cost, '\n')
            return solution_patterns, total_cost

        # updating the marginal supports: newly covered tuples no longer
        # count for any other pattern
        for entry_index in members[max_pattern_index]:
            for pattern_index in entry_patterns[entry_index]:
                if pattern_index != max_pattern_index:
                    members[pattern_index].remove(entry_index)

In [21]:
import heapq

def optimized_set_cover(coverage):
    """Heap-based variant of the greedy baseline set cover.

    Reads the globals ``dataset``, ``patterns``, ``pattern_members``,
    ``entry_patterns``, ``pattern_costs`` and ``banned_patterns``. Candidates
    live in a max-heap keyed by marginal support (ties go to the lower
    pattern index). Entries are invalidated lazily: when a pattern's support
    shrinks a fresh entry is pushed, and outdated entries are discarded when
    popped. The previous ``list.pop(list.index(...))`` update broke the heap
    invariant and scanned the whole list for every change.

    Args:
        coverage: fraction of the dataset that must be covered.

    Returns:
        ``(solution_patterns, total_cost)``: the chosen pattern indices in
        selection order and the sum of their costs. When the coverage cannot
        be reached, 'Impossible!' is printed and ``([], inf)`` is returned.
    """
    global banned_patterns
    total_cost = 0

    # Uncovered members of every pattern (marginal support = set size).
    members = [set(pattern_members[i]) for i in range(len(patterns))]

    solution_patterns = []
    rem = int(len(dataset) * coverage)

    remaining_patterns = [
        (-len(members[index]), index)
        for index in range(len(patterns))
        if banned_patterns[index] == 0
    ]
    heapq.heapify(remaining_patterns)
    selected = set()

    while remaining_patterns:

        negative_support, max_pattern_index = heapq.heappop(remaining_patterns)

        # Skip stale entries: already selected, or support changed since push.
        if max_pattern_index in selected:
            continue
        if -negative_support != len(members[max_pattern_index]):
            continue
        # The best remaining pattern covers nothing new: coverage unreachable.
        if negative_support == 0:
            break

        # selecting the next pattern
        selected.add(max_pattern_index)
        total_cost += pattern_costs[max_pattern_index]
        solution_patterns.append(max_pattern_index)
        rem -= len(members[max_pattern_index])

        if rem <= 0:
            print('\nTotal cost is ', total_cost, '\n')
            return solution_patterns, total_cost

        # updating the marginal supports, remembering which patterns shrank
        shrunk = set()
        for entry_index in members[max_pattern_index]:
            for pattern_index in entry_patterns[entry_index]:
                if pattern_index != max_pattern_index and banned_patterns[pattern_index] == 0:
                    members[pattern_index].remove(entry_index)
                    shrunk.add(pattern_index)

        # One fresh heap entry per shrunk pattern; older ones are now stale.
        for pattern_index in shrunk:
            heapq.heappush(remaining_patterns, (-len(members[pattern_index]), pattern_index))

    print('Impossible!')
    return [], float('inf')


In [22]:
def generate_baseline_MFD_tableaux(MFD_LHS_attrs, MFD_target, max_delta, coverage = 0.6):
    """Build a baseline tableau: plain greedy set cover over low-delta patterns.

    Resets the globals ``patterns``, ``pattern_members``, ``entry_patterns``
    and ``banned_patterns``, registers the patterns of every subset of
    ``MFD_LHS_attrs``, computes their costs with default settings, bans every
    pattern whose delta exceeds ``max_delta`` and runs ``set_cover``.

    Args:
        MFD_LHS_attrs: attribute names forming the left-hand side.
        MFD_target: name of the numeric right-hand-side attribute.
        max_delta: largest pattern delta that is still allowed.
        coverage: required coverage passed to ``set_cover``.

    Returns:
        ``(solution_patterns, total_cost)`` as returned by ``set_cover``.
    """
    global patterns, pattern_members, entry_patterns
    global banned_patterns

    # Stage 1: adding patterns for every subset of the LHS attributes.
    print('\nAdding all the patterns... \n')

    patterns = []
    pattern_members = []
    entry_patterns = [[] for _ in dataset]

    for subset_size in range(len(MFD_LHS_attrs) + 1):
        for subset in combinations(MFD_LHS_attrs, subset_size):
            general_pattern = list(subset)
            print(general_pattern)
            add_patterns(general_pattern)

    print('\nAdding patterns finished!\n')

    print('Number of patterns: ', len(patterns))

    # Stage 2: calculating pattern costs.
    print('\nCalculating pattern costs...\n')
    calculate_costs(MFD_LHS_attrs, MFD_target)
    print('\nCalculating pattern costs finished!\n')

    # Stage 3: 1 marks a pattern whose delta is above the threshold.
    banned_patterns = [
        1 if pattern_deltas[position] > max_delta else 0
        for position in range(len(patterns))
    ]

    # Stage 4: running the set-cover algorithm.
    print('\nRunning set-cover algorithm...\n')
    solution_patterns, total_cost = set_cover(coverage)
    print('\nTableaux:', '(size = ' + str(len(solution_patterns)) + ')\n')

    for chosen in solution_patterns:
        # Share of the dataset matched by the pattern, in percent; only the
        # disabled report line below uses it.
        support = round(len(pattern_members[chosen])/len(dataset)*100, 2)
        # print(patterns[chosen], '\tdelta:', pattern_deltas[chosen], ' support(%):', support)

    return solution_patterns, total_cost

In [23]:
def generate_optimized_baseline_MFD_tableaux(MFD_LHS_attrs, MFD_target, max_delta, coverage = 0.6):
    """Build a baseline tableau using the heap-based set cover.

    Resets the globals ``patterns``, ``pattern_members``, ``entry_patterns``
    and ``banned_patterns``, registers the patterns of every subset of
    ``MFD_LHS_attrs``, computes their costs with default settings, bans every
    pattern whose delta exceeds ``max_delta`` and runs
    ``optimized_set_cover``.

    Args:
        MFD_LHS_attrs: attribute names forming the left-hand side.
        MFD_target: name of the numeric right-hand-side attribute.
        max_delta: largest pattern delta that is still allowed.
        coverage: required coverage passed to ``optimized_set_cover``.

    Returns:
        ``(solution_patterns, total_cost)`` as returned by
        ``optimized_set_cover``.
    """
    global patterns, pattern_members, entry_patterns
    global banned_patterns

    # Stage 1: adding patterns for every subset of the LHS attributes.
    print('\nAdding all the patterns... \n')

    patterns = []
    pattern_members = []
    entry_patterns = [[] for _ in dataset]

    for subset_size in range(len(MFD_LHS_attrs) + 1):
        for subset in combinations(MFD_LHS_attrs, subset_size):
            general_pattern = list(subset)
            print(general_pattern)
            add_patterns(general_pattern)

    print('\nAdding patterns finished!\n')

    print('Number of patterns: ', len(patterns))

    # Stage 2: calculating pattern costs.
    print('\nCalculating pattern costs...\n')
    calculate_costs(MFD_LHS_attrs, MFD_target)
    print('\nCalculating pattern costs finished!\n')

    # Stage 3: 1 marks a pattern whose delta is above the threshold.
    banned_patterns = [
        1 if pattern_deltas[position] > max_delta else 0
        for position in range(len(patterns))
    ]

    # Stage 4: running the set-cover algorithm.
    print('\nRunning set-cover algorithm...\n')
    solution_patterns, total_cost = optimized_set_cover(coverage)
    print('\nTableaux:', '(size = ' + str(len(solution_patterns)) + ')\n')

    for chosen in solution_patterns:
        # Share of the dataset matched by the pattern, in percent; only the
        # disabled report line below uses it.
        support = round(len(pattern_members[chosen])/len(dataset)*100, 2)
        # print(patterns[chosen], '\tdelta:', pattern_deltas[chosen], ' support(%):', support)

    return solution_patterns, total_cost

In [None]:
# From here on the global `dataset` is the airport-filtered subset.
# NOTE(review): every later cell reads this global, so results depend on
# whether this cell was executed before them.
dataset = filtered_dataset

In [None]:
# Baseline MFD tableaux: 
baseline_patterns, baseline_cost = generate_baseline_MFD_tableaux(
    MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'],
    MFD_target = 'DEP_DELAY', 
    max_delta = 500,
    coverage = 0.6)

# baseline_patterns2, baseline_cost2 = generate_optimized_baseline_MFD_tableaux(
#     MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'],
#     MFD_target = 'DEP_DELAY', 
#     max_delta = 100,
#     coverage = 0.6)


Adding all the patterns... 

[]
['OP_CARRIER_AIRLINE_ID']
['OP_CARRIER_FL_NUM']
['ORIGIN']
['DEST']
['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM']
['OP_CARRIER_AIRLINE_ID', 'ORIGIN']
['OP_CARRIER_AIRLINE_ID', 'DEST']
['OP_CARRIER_FL_NUM', 'ORIGIN']
['OP_CARRIER_FL_NUM', 'DEST']
['ORIGIN', 'DEST']
['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN']
['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'DEST']
['OP_CARRIER_AIRLINE_ID', 'ORIGIN', 'DEST']
['OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST']
['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST']

Adding patterns finished!

Number of patterns:  370605

Calculating pattern costs...


Calculating pattern costs finished!


Running set-cover algorithm...


Total cost is  19620595718.05318 


Tableaux: (size = 25)



In [24]:
%%capture

# Cost comparison: for each delta threshold, build the baseline tableau and
# then a CWSC tableau limited to the same number of patterns.
baseline_costs = []
baseline_tableau_lens = []
actual_costs = []
actual_tableau_lens = []
deltas = []

for i in tqdm(range(60, 210, 20)):
    # i is the largest pattern delta the baseline may use.
    baseline_patterns, baseline_cost = generate_baseline_MFD_tableaux(
        MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'],
        MFD_target = 'DEP_DELAY', 
        max_delta = i,
        coverage = 0.6)
    
    # CWSC gets the baseline's tableau size as its budget k.
    actual_patterns, actual_cost = generate_MFD_tableaux(
        MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'],
        MFD_target = 'DEP_DELAY',  
        k = len(baseline_patterns), coverage = 0.6,
        fail_tableaux = False)
    
    baseline_costs.append(baseline_cost)
    baseline_tableau_lens.append(len(baseline_patterns))
    actual_costs.append(actual_cost)
    actual_tableau_lens.append(len(actual_patterns))
    deltas.append(i)
    

In [26]:
import plotly
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Text summary of the comparison, one line per delta threshold.
for i in range(len(deltas)):
    print(deltas[i], 'k:', baseline_tableau_lens[i], ' costs: ', baseline_costs[i], actual_costs[i])


sns.reset_defaults()
sns.set(
    rc={'figure.figsize':(7,5)}, 
    style="white" # nicer layout
)

# Point labels: tableau size for the CWSC trace, threshold for the baseline.
tick_texts = []
ks = ['k=' + str(baseline_tableau_lens[i]) for i in range(len(deltas))]
dels = ['d=' + str(deltas[i]) for i in range(len(deltas))]

fig = go.Figure()

fig.add_trace(go.Scatter(x=deltas, y=baseline_costs,
                    textposition="top center",
                    text=dels,
                    mode='lines+markers+text',
                    name='Baseline'))

fig.add_trace(go.Scatter(x=deltas, y=actual_costs, 
                    text=ks,
                    textposition="bottom center",
                    mode='lines+markers+text',
                    name='Our solution'))


fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = deltas,
    ), 
    yaxis = dict(
        # tickmode = 'linear',
        # tick0 = 0,
        # dtick = 1000
    )
)

# The bold tag is now closed properly (the titles previously ended in "<b>").
fig.update_yaxes(title="<b>Tableau Cost</b>")
fig.update_xaxes(title='<b>d in the baseline approach</b>')
fig.update_layout(yaxis_tickformat = '1')
fig.for_each_xaxis(lambda axis: axis.title.update(font=dict(size=20)))
fig.for_each_yaxis(lambda axis: axis.title.update(font=dict(size=20)))

fig.update_layout(
    font=dict(
        # family="Courier New, monospace",
        size=18,
        # color="RebeccaPurple"
    )
)

fig.show()

60 k: 828  costs:  4519.365147604727 3646.3128604939416
80 k: 581  costs:  6596.076105000855 3604.1692807354307
100 k: 312  costs:  9383.853052319297 4716.673125319532
120 k: 136  costs:  10859.296970345356 7527.200544351819
140 k: 55  costs:  14345.04273257141 10400.744387213703
160 k: 35  costs:  22397.49066162297 15616.388520744866
180 k: 28  costs:  34798.30371496649 28932.405863968805
200 k: 17  costs:  50180.114469183165 40929.28129096141


In [44]:
%%capture

import timeit

# Running time of CWSC and of the baseline as a function of the dataset size.
dataset_sizes = []
running_times_CWSC = []
running_times_SC = []

# Keep the full list so the global `dataset` can be restored afterwards.
copy_dataset = dataset[:]

try:
    for i in range(100, 701, 100):
        # Both generators read the global `dataset`: use the first i*1000 tuples.
        dataset = copy_dataset[0:i*1000]

        t1 = timeit.default_timer()
        a, b = generate_MFD_tableaux(
            MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'],
            MFD_target = 'DEP_DELAY', 
            k = 100, coverage = 0.6, confidence = 0.9,
            fail_tableaux = False)
        t2 = timeit.default_timer()

        t3 = timeit.default_timer()
        a, b = generate_baseline_MFD_tableaux(
            MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'],
            MFD_target = 'DEP_DELAY', 
            max_delta = 130,
            coverage = 0.6)
        t4 = timeit.default_timer()

        dataset_sizes.append(i)
        running_times_CWSC.append(t2-t1)
        running_times_SC.append(t4-t3)
finally:
    # Restore the full dataset even if a run raised, so that later cells
    # never work on a truncated slice by accident.
    dataset = copy_dataset[:]

print(len(dataset))

In [51]:
import plotly
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objects as go

fig = go.Figure()

# Running time of the CWSC pipeline per dataset size (in thousands of tuples).
fig.add_trace(go.Scatter(x=dataset_sizes, y=running_times_CWSC,
                    mode='lines+markers', name='CWSC'))

# fig.add_trace(go.Scatter(x=dataset_sizes, y=running_times_SC,
#                     mode='lines+markers', name='SC'))

fig.update_layout(
    xaxis = dict(
        # tickmode = 'array',
        # tickvals = dataset_sizes,
        tick0 = 100,
        dtick = 200
    ), 
    yaxis = dict(
        # tickmode = 'linear',
        # tick0 = 0,
        dtick = 20
    )
)

# The bold tag is now closed properly (the titles previously ended in "<b>").
fig.update_yaxes(title="<b>Running time (Sec)</b>")
fig.update_xaxes(title='<b>Database size (thousands)</b>')
fig.update_layout(yaxis_tickformat = '1')
fig.update_layout(xaxis_tickformat = '1')
fig.for_each_xaxis(lambda axis: axis.title.update(font=dict(size=22)))
fig.for_each_yaxis(lambda axis: axis.title.update(font=dict(size=22)))

fig.update_layout(
    font=dict(
        # family="Courier New, monospace",
        size=22,
        # color="RebeccaPurple"
    )
)

fig.show()

In [None]:
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) CPU @ 2.30GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2299.998
cache size	: 46080 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs
bogomips	: 4599.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 bits virtual
power management:

processor	:

# Fail Tableaux!?

### Creating fail_dataset

In [None]:
# Build the input of the fail tableaux: the value combinations of the full
# LHS whose delta lies above the 95% position of all such deltas.
# `global` at module level has no effect; the names below are globals anyway.
global patterns, pattern_members, entry_patterns
patterns = []
pattern_members = []
entry_patterns = [[] for i in range(len(dataset))]

# Only the full four-attribute patterns are registered here, one per distinct
# value combination.
add_patterns(['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'])
calculate_costs(['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST'], 'DEP_DELAY')

# NOTE(review): this rebinds the global `group_deltas` (a dict filled by
# calculate_costs) to a list of (delta, pattern index) pairs.
group_deltas = []
for i in range(len(patterns)):
    group_deltas.append((pattern_deltas[i], i))

# Ascending by delta (ties broken by pattern index).
group_deltas.sort()

# Delta found at the 95% position of the sorted list.
delta_threshhold, _ = group_deltas[int(0.95*len(group_deltas))]

fail_dataset = []
fail_delta = []
non_fail_dataset = []
                                    
for i in range(len(group_deltas)):
    delta, index = group_deltas[i]

    # Strict comparison: patterns whose delta equals the threshold are non-fail.
    if delta > delta_threshhold:
        fail_dataset.append(patterns[index])
        fail_delta.append(delta)
        # print(patterns[index], delta)
        # for member_index in pattern_members[index]:
        #     print(dataset[member_index])
    else:
        non_fail_dataset.append(patterns[index])

print('Fail dataset is generated!')
print('Fail-dataset size: ', len(fail_dataset))
print('Non-fail dataset size: ', len(non_fail_dataset))

Fail dataset is generated!
Fail-dataset size:  99
Non-fail dataset size:  1956


### Running CWSC algorithm for generating fail tableaux

In [None]:
def add_fail_patterns(general_patterns):
    """Register the patterns obtained by projecting fail_dataset on some attributes.

    Every entry of the module-level ``fail_dataset`` is projected onto the
    attributes listed in ``general_patterns``. Each distinct projection seen
    during this call is appended to ``fail_patterns`` as a new pattern (an
    ``{attribute: value}`` dict) together with an empty member list in
    ``fail_pattern_members``. The entry's index is then recorded under that
    pattern, and the pattern's index is recorded under the entry in
    ``fail_entry_patterns``.

    Args:
        general_patterns: attribute names to project on; an empty list
            produces the single empty pattern ``{}`` shared by all entries.
    """
    global fail_patterns, fail_pattern_members, fail_entry_patterns

    # Maps a projected value tuple to its pattern index; local to this call,
    # so each call creates its own patterns.
    index_of_key = {}

    for row_number, row in enumerate(fail_dataset):
        projected = [(attr, row[attr]) for attr in general_patterns]
        key = tuple(value for _, value in projected)

        slot = index_of_key.get(key)
        if slot is None:
            # First time this combination of values shows up: new pattern.
            slot = len(fail_patterns)
            index_of_key[key] = slot
            fail_patterns.append(dict(projected))
            fail_pattern_members.append([])

        fail_pattern_members[slot].append(row_number)
        fail_entry_patterns[row_number].append(slot)


In [None]:
def include(patternA, patternB):
    """Tell whether ``patternA`` contains every attribute/value pair of ``patternB``.

    Args:
        patternA: mapping that is checked for containing the other one.
        patternB: mapping whose attributes must all be present in
            ``patternA`` with the same values.

    Returns:
        True if no attribute of ``patternB`` is missing from ``patternA`` or
        bound to a different value there (so an empty ``patternB`` is always
        included); False otherwise.
    """
    has_conflict = any(
        attr not in patternA or patternB[attr] != patternA[attr]
        for attr in patternB
    )
    return not has_conflict

In [None]:
def calculate_fail_costs():
    """Compute the cost of every fail pattern and store it in ``fail_pattern_costs``.

    The cost of a pattern is the number of entries of ``non_fail_dataset``
    that include it (as decided by ``include``). A pattern matched by no
    non-fail entry gets cost 1 instead of 0, so every cost is at least 1.
    The result replaces the module-level ``fail_pattern_costs`` list, which
    is aligned index-by-index with ``fail_patterns``.
    """
    global fail_pattern_costs

    costs = []
    for pattern in fail_patterns:
        matches = sum(1 for tupple in non_fail_dataset if include(tupple, pattern))
        # Never let a cost be zero: unmatched patterns are charged 1.
        costs.append(matches if matches != 0 else 1)

    fail_pattern_costs = costs
        

In [None]:
def fail_CWSC(k, coverage):
    """Greedily pick at most ``k`` fail patterns covering part of ``fail_dataset``.

    Reads the module-level ``fail_patterns``, ``fail_pattern_members``,
    ``fail_entry_patterns``, ``fail_pattern_costs`` and ``fail_dataset``.
    In each round the not-yet-selected pattern with the highest ratio of
    still-uncovered members to cost is chosen, among the patterns whose
    uncovered members number at least ``remaining / rounds_left``.

    Args:
        k: maximum number of patterns to select.
        coverage: fraction of ``fail_dataset`` that must be covered; the
            target is ``int(len(fail_dataset) * coverage)`` entries.

    Returns:
        The list of selected pattern indices once the target is reached
        (the total cost is printed first), or ``['Failed!']`` after printing
        ``'Impossible!'`` when no pattern qualifies in some round. If ``k``
        is less than 1 the loop never runs and ``None`` is returned.
    """
    pattern_count = len(fail_patterns)

    # Working copy of each pattern's members as a set of uncovered entries;
    # the module-level membership lists are not modified.
    uncovered = [set(fail_pattern_members[p]) for p in range(pattern_count)]

    taken = [False] * pattern_count
    solution_patterns = []
    total_cost = 0
    remaining = int(len(fail_dataset) * coverage)

    for rounds_left in range(k, 0, -1):
        # Minimum number of new entries a pattern must bring in this round.
        quota = remaining / rounds_left
        best_gain = 0.0
        best = -1

        for candidate in range(pattern_count):
            benefit = len(uncovered[candidate])
            gain = benefit / fail_pattern_costs[candidate]

            if taken[candidate] or benefit < quota:
                continue
            # Strict comparison: on equal gain the earliest pattern wins.
            if gain > best_gain:
                best_gain = gain
                best = candidate

        if best == -1:
            print('Impossible!')
            return ['Failed!']

        # Commit to the best candidate of this round.
        taken[best] = True
        total_cost += fail_pattern_costs[best]
        solution_patterns.append(best)
        remaining -= len(uncovered[best])

        if remaining <= 0:
            print('\nTotal cost is ', total_cost, '\n')
            return solution_patterns

        # Entries just covered no longer count as a benefit for other patterns.
        for entry in uncovered[best]:
            for other in fail_entry_patterns[entry]:
                if other != best:
                    uncovered[other].remove(entry)


In [None]:
# Reset the fail-pattern bookkeeping that add_fail_patterns appends to.
fail_patterns = []
fail_pattern_members = []
# One slot per fail_dataset entry: add_fail_patterns indexes this list by the
# entry's position in fail_dataset, not by a row number of the full dataset.
fail_entry_patterns = [[] for _ in fail_dataset]
MFD_LHS_attrs = ['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST']

# Register patterns for every subset of the LHS attributes, from the empty
# subset up to the full attribute list (smaller subsets first).
for subset_size in range(len(MFD_LHS_attrs) + 1):
    for attr_subset in combinations(MFD_LHS_attrs, subset_size):
        add_fail_patterns(list(attr_subset))

calculate_fail_costs()
solution_patterns = fail_CWSC(k = 10, coverage = 0.5)

print('The worst %5 patterns: \n')
for failed_pattern, delta in zip(fail_dataset, fail_delta):
    print(failed_pattern, delta)

print('\nRunning CWSC for generating fail tableaux: \n')
# fail_CWSC signals failure with the sentinel ['Failed!'] (or None when no
# round was run); indexing fail_patterns with that would raise a TypeError.
if solution_patterns is None or solution_patterns == ['Failed!']:
    print('No fail tableau satisfies the requested size and coverage.')
else:
    for pattern_index in solution_patterns:
        print(fail_patterns[pattern_index])


Total cost is  365 

The worst %5 patterns: 

{'OP_CARRIER_AIRLINE_ID': 'JetBlue Airways: B6', 'OP_CARRIER_FL_NUM': '797', 'ORIGIN': 'BOS', 'DEST': 'ATL'} 171.0
{'OP_CARRIER_AIRLINE_ID': 'Southwest Airlines Co.: WN', 'OP_CARRIER_FL_NUM': '2345', 'ORIGIN': 'LAX', 'DEST': 'PHX'} 171.0
{'OP_CARRIER_AIRLINE_ID': 'Southwest Airlines Co.: WN', 'OP_CARRIER_FL_NUM': '2084', 'ORIGIN': 'DEN', 'DEST': 'SEA'} 171.0
{'OP_CARRIER_AIRLINE_ID': 'JetBlue Airways: B6', 'OP_CARRIER_FL_NUM': '324', 'ORIGIN': 'LAX', 'DEST': 'JFK'} 172.0
{'OP_CARRIER_AIRLINE_ID': 'United Air Lines Inc.: UA', 'OP_CARRIER_FL_NUM': '578', 'ORIGIN': 'ORD', 'DEST': 'BOS'} 172.0
{'OP_CARRIER_AIRLINE_ID': 'Southwest Airlines Co.: WN', 'OP_CARRIER_FL_NUM': '1107', 'ORIGIN': 'DEN', 'DEST': 'LAS'} 173.0
{'OP_CARRIER_AIRLINE_ID': 'American Airlines Inc.: AA', 'OP_CARRIER_FL_NUM': '274', 'ORIGIN': 'LAX', 'DEST': 'JFK'} 174.0
{'OP_CARRIER_AIRLINE_ID': 'American Airlines Inc.: AA', 'OP_CARRIER_FL_NUM': '776', 'ORIGIN': 'CLT', 'DEST': 'L

# Baselines

In [None]:
# Scratch cell: small experiments checking plain-Python behaviour.

# 1099511627776 (= 2**40) divided by 1024 three times.
print(1099511627776 / 1024 / 1024 / 1024)

# Keys view of a small dictionary.
x = dict(a='b', c='d')
print(x.keys())


def a():
    """Rebind the module-level ``x`` to ``[1, 2]``."""
    global x
    x = [1, 2]


def b():
    """Rebind ``x`` to ``[3, 4]``, call ``a()``, then print the resulting ``x``."""
    global x
    x = [3, 4]
    a()
    print(x)


# The rebinding done inside a() is what b() ends up printing.
b()

# An empty tuple can be used as a dictionary key.
x = ()
dic = {x: 0}
print(dic)
print(dic[x], x in dic)

# Equality test of a False flag against True.
ss = False
print(ss == True)

1024.0
dict_keys(['a', 'c'])
[1, 2]
{(): 0}
0 True
False


In [None]:

{'source': 'www.uswx.com', 'day': 'Saturday', 'hour': 12, 'location': 'New York', 'temperature': '15'}
{'source': 'www.climaton.com', 'day': 'Saturday', 'hour': 12, 'location': 'New York', 'temperature': '16'}
{'source': 'www.accuweather.com', 'day': 'Saturday', 'hour': 12, 'location': 'New York', 'temperature': '15'}

{'source': 'weather.cnn.com', 'day': 'Friday', 'hour': 18, 'location': 'Los Angeles', 'temperature': '54'}
{'source': 'www.accuweather.com', 'day': 'Friday', 'hour': 18, 'location': 'Los Angeles', 'temperature': '57'}
{'source': 'weather.herald.com', 'day': 'Friday', 'hour': 12, 'location': 'Los Angeles', 'temperature': '52'}
{'source': 'weather.herald.com', 'day': 'Friday', 'hour': 18, 'location': 'Los Angeles', 'temperature': '57'}
{'source': 'weather.aol.com', 'day': 'Friday', 'hour': 18, 'location': 'Los Angeles', 'temperature': '58'}
{'source': 'www.climaton.com', 'day': 'Monday', 'hour': 18, 'location': 'Los Angeles', 'temperature': '52'}
{'source': 'www.climaton.com', 'day': 'Saturday', 'hour': 12, 'location': 'Los Angeles', 'temperature': '54'}
{'source': 'www.nytimes.com', 'day': 'Friday', 'hour': 18, 'location': 'Los Angeles', 'temperature': '47'}
{'source': 'weather.msn.com', 'day': 'Saturday', 'hour': 18, 'location': 'Los Angeles', 'temperature': '55'}
{'source': 'www.nytimes.com', 'day': 'Friday', 'hour': 12, 'location': 'Los Angeles', 'temperature': '50'}

{'source': 'weather.msn.com', 'day': 'Friday', 'hour': 12, 'location': 'Seattle', 'temperature': '45'}
{'source': 'weather.cnn.com', 'day': 'Friday', 'hour': 12, 'location': 'Seattle', 'temperature': '45'}
{'source': 'www.accuweather.com', 'day': 'Friday', 'hour': 12, 'location': 'Seattle', 'temperature': '49'}
{'source': 'weather.cnn.com', 'day': 'Saturday', 'hour': 12, 'location': 'Seattle', 'temperature': '46'}
{'source': 'www.climaton.com', 'day': 'Friday', 'hour': 18, 'location': 'Seattle', 'temperature': '46'}
{'source': 'www.nytimes.com', 'day': 'Monday', 'hour': 18, 'location': 'Seattle', 'temperature': '41'}
{'source': 'www.nytimes.com', 'day': 'Saturday', 'hour': 12, 'location': 'Seattle', 'temperature': '45'}
{'source': 'search.yahoo.com', 'day': 'Friday', 'hour': 18, 'location': 'Seattle', 'temperature': '46'}

