# Collette Report

In [None]:
import numpy as np
import pandas as pd
import pandas_profiling
from datetime import datetime
from matplotlib import pyplot as plt

## Set Variables

In [None]:
# Input and output locations for this run.
current_raw_file = 'Reports/collette_raw_data_09-18-20.csv'
previous_report_file = 'Reports/collette_data_2020_09-11-20.csv'
new_report_file = 'Reports/collette_data_2020_09-18-20.csv'

# Identifier written into the ReportID column of the new report.
report_ID = 2020090

# Cut-off date used to split previous departures into past and future.
today = datetime(2020, 9, 18)

## Build Current Data

In [None]:
# Load the raw file (one row per DepartureID/field pair) and pivot it so each
# departure becomes a single row with one column per field. Duplicates are
# dropped first because pivot needs unique (index, column) pairs.
curr_df = (
    pd.read_csv(current_raw_file)
    .drop_duplicates(subset=['DepartureID', 'field'])
    .pivot(index='DepartureID', columns='field', values='value')
    .reset_index()
)
curr_df = curr_df[['DepartureID', 'DepartureDate', 'ActualPriceUSD', 'OriginalPriceUSD', 'ActualPriceAUD', 'OriginalPriceAUD', 'Available', 'Status', 'Notes']]

# Add the report-level columns at the positions the report layout expects.
curr_df.insert(0, 'ReportID', report_ID)
curr_df.insert(8, 'Type', '')
curr_df.insert(10, 'Pax', np.nan)  # np.NaN was removed in NumPy 2.0

curr_df['DepartureDate'] = pd.to_datetime(curr_df['DepartureDate'], format='%d-%b-%Y')
curr_df.sort_values(by=['DepartureDate'], ascending=True, inplace=True)

# Shorten the departure ID: keep the part before the dash, then the first
# three and last three characters of the part after it.
departure_id_parts = curr_df['DepartureID'].str.split(pat='-', expand=True)
old_departure_code = departure_id_parts[1]
fixed_departure_id = departure_id_parts[0] + '-' + old_departure_code.str[0:3] + old_departure_code.str[-3:]
curr_df['DepartureID'] = fixed_departure_id

# Convert the price strings to floats. 'Call For Air' means no price and
# becomes NaN. regex=False makes '$' a literal character instead of a regex
# anchor, and thousands separators are removed so values such as '$1,299'
# convert as well.
price_columns = ['ActualPriceUSD', 'OriginalPriceUSD', 'ActualPriceAUD', 'OriginalPriceAUD']
for price_column in price_columns:
    prices = curr_df[price_column]
    prices = prices.mask(prices == 'Call For Air')
    prices = prices.str.replace('$', '', regex=False).str.replace(',', '', regex=False)
    curr_df[price_column] = prices.astype(float)

# Departures without an availability flag are discarded; the remaining
# 'True'/'False' strings become real booleans.
curr_df.dropna(subset=['Available'], inplace=True)
booleanDictionary = {'True': True, 'False': False}
curr_df['Available'] = curr_df['Available'].replace(booleanDictionary)

# Keep only departures before 2021-01-01. The copy avoids writing into a
# slice of the unfiltered frame in the assignment below.
curr_df = curr_df.loc[curr_df['DepartureDate'] < datetime(2021, 1, 1)].copy()
curr_df.loc[curr_df['Status'] == 'Expires 04/30/2020', 'Status'] = 'Available'
curr_df

## Build Previous Data

In [None]:
prev_df = pd.read_csv(previous_report_file)
prev_df['ReportID'] = prev_df['ReportID'].astype(int)


def get_char(x):
    """Return the month number, as a string, encoded by the third character of a departure code (A -> 1, B -> 2, ...)."""
    return str(ord(x[2]) - 64)


# Rebuild the departure date from the code after the dash in DepartureID:
# two day digits, one month letter, two year digits.
departure_code = prev_df['DepartureID'].str.split(pat='-', expand=True)[1]
day_numbers = departure_code.str[0:2]
month_numbers = departure_code.apply(get_char)
year_numbers = departure_code.str[3:5]
departure_date = pd.to_datetime(day_numbers + '-' + month_numbers + '-' + year_numbers, format='%d-%m-%y')
prev_df.insert(2, 'DepartureDate', departure_date)
prev_df.sort_values(by=['DepartureDate'], ascending=True, inplace=True)

# NOTE(review): price columns are used as read from the CSV; presumably they
# are already numeric in the previous report file - confirm.
prev_df['Available'] = prev_df['Available'].astype(bool)

# Blank out missing notes only. Replacing the text 'nan' after a string cast
# would also delete that substring from inside real notes.
prev_df['Notes'] = prev_df['Notes'].fillna('').astype(str)

prev_df.loc[prev_df['Status'] == 'Expires 04/15/2020', 'Status'] = 'Available'

# Rows without a USD price are dropped from the previous data.
prev_df.dropna(subset=['ActualPriceUSD'], inplace=True)
prev_df

## Check Departure/Status Count

In [None]:
# Count departures per (departure month, status) for both reports and put the
# two counts side by side, one row per month and one column per status.
curr_plot_df = curr_df['DepartureID'].groupby([curr_df['DepartureDate'].dt.month, curr_df['Status']]).count()
prev_plot_df = prev_df['DepartureID'].groupby([prev_df['DepartureDate'].dt.month, prev_df['Status']]).count()
plot_df = pd.concat([prev_plot_df, curr_plot_df], axis=1)
plot_df.columns = ['Previous', 'Current']
plot_df = plot_df.unstack(level=-1, fill_value=0)

months = plot_df.index
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']


def _status_counts(counts_df, period, status):
    """Return the per-month counts of `status` in `period`, with 0 where there is no data.

    A status that occurs in neither report has no column at all; it yields a
    series of zeros instead of raising KeyError.
    """
    period_counts = counts_df[period]
    if status not in period_counts.columns:
        return pd.Series(0, index=counts_df.index)
    return period_counts[status].fillna(0)


previous_available_counts = _status_counts(plot_df, 'Previous', 'Available')
previous_limited_counts = _status_counts(plot_df, 'Previous', 'Limited')
previous_soldout_counts = _status_counts(plot_df, 'Previous', 'Sold Out')
previous_cancelled_counts = _status_counts(plot_df, 'Previous', 'Cancelled')
previous_removed_counts = _status_counts(plot_df, 'Previous', 'Cancelled / Sold Out')

current_available_counts = _status_counts(plot_df, 'Current', 'Available')
current_limited_counts = _status_counts(plot_df, 'Current', 'Limited')
current_soldout_counts = _status_counts(plot_df, 'Current', 'Sold Out')
current_cancelled_counts = _status_counts(plot_df, 'Current', 'Cancelled')
current_removed_counts = _status_counts(plot_df, 'Current', 'Cancelled / Sold Out')

plt.style.use('fivethirtyeight')
width = 0.35
fig, ax = plt.subplots()

# Left bar of each month: previous report, stacked by status.
ax.bar(months - width/2, previous_available_counts, width=width, color='#1666BE', label='Available')
ax.bar(months - width/2, previous_limited_counts, width=width, color='#418C5C', bottom=previous_available_counts, label='Limited')
ax.bar(months - width/2, previous_soldout_counts, width=width, color='#CE8D48', bottom=previous_available_counts + previous_limited_counts, label='Sold Out')
ax.bar(months - width/2, previous_cancelled_counts, width=width, color='#A4A49C', bottom=previous_available_counts + previous_limited_counts + previous_soldout_counts, label='Cancelled')
ax.bar(months - width/2, previous_removed_counts, width=width, color='#AD1E45', bottom=previous_available_counts + previous_limited_counts + previous_soldout_counts + previous_cancelled_counts, label='Removed from Website')

# Right bar of each month: current report. No labels here so the legend
# lists each status only once.
ax.bar(months + width/2, current_available_counts, width=width, color='#1666BE')
ax.bar(months + width/2, current_limited_counts, width=width, color='#418C5C', bottom=current_available_counts)
ax.bar(months + width/2, current_soldout_counts, width=width, color='#CE8D48', bottom=current_available_counts + current_limited_counts)
ax.bar(months + width/2, current_cancelled_counts, width=width, color='#A4A49C', bottom=current_available_counts + current_limited_counts + current_soldout_counts)

# Label only the months that are present so ticks and labels always have the
# same length.
plt.xticks(ticks=months, labels=[month_labels[int(month) - 1] for month in months])
plt.legend()
plt.show()

## Check Departure/Status Changes

In [None]:
# Line up the status of every departure in the previous and current data,
# matched on DepartureID and DepartureDate.
status_keys = ['DepartureID', 'DepartureDate']
prev_status_df = prev_df[status_keys + ['Status']].set_index(status_keys)
curr_status_df = curr_df[status_keys + ['Status']].set_index(status_keys)

status_df = pd.concat([prev_status_df, curr_status_df], axis=1)
# Move DepartureDate back to a regular column; DepartureID stays as the index.
status_df = status_df.reset_index('DepartureDate')
status_df = status_df.sort_values(by='DepartureDate')
status_df.columns = ['DepartureDate', 'Previous', 'Current']

### New Departures

In [None]:
# Departures with no status in the previous data.
missing_before = status_df['Previous'].isna()
new_departures = status_df[missing_before]
new_departures

### Removed Departures

In [None]:
# Departures after `today` that have no status in the current data.
missing_now = status_df['Current'].isna()
departs_after_today = status_df['DepartureDate'] > today
removed_departures = status_df[missing_now & departs_after_today]
removed_departures

### Status Changes

In [None]:
# A status change needs a status on both sides. A missing previous status is
# NaN, and NaN != anything is True, so without the `has_previous` check every
# departure that is new in the current data would be listed here as well.
has_previous = status_df['Previous'].notna()
has_current = status_df['Current'].notna()
status_differs = status_df['Previous'] != status_df['Current']
changed_departures = status_df.loc[has_previous & has_current & status_differs]
changed_departures

## Filter Previous-Past Departures

### Departure date <= Today

In [None]:
# Previous departures dated on or before `today`, oldest first.
already_departed = prev_df['DepartureDate'] <= today
prev_past_df = prev_df.loc[already_departed].sort_values(by='DepartureDate')
prev_past_df

## Filter Previous-Future Departures

### Departure date > Today
#### If previously 'Available' --> Status = 'Cancelled / Sold Out'

In [None]:
# Previous departures dated after `today`, oldest first.
still_upcoming = prev_df['DepartureDate'] > today
prev_future_df = prev_df[still_upcoming].sort_values(by='DepartureDate')

# Rows that were available in the previous data are rewritten as removed.
filt = prev_future_df['Available'].eq(True)
removed_values = [False, 'Cancelled / Sold Out', 'Removed from website']
prev_future_df.loc[filt, ['Available', 'Status', 'Notes']] = removed_values
prev_future_df

## Combine Current and Previous-Past Data

In [None]:
# Stack the current departures on top of the already-departed previous ones
# and order the result by departure date.
frames_to_stack = [curr_df, prev_past_df]
new_df = pd.concat(frames_to_stack)
new_df = new_df.sort_values(by='DepartureDate')
new_df

## Combine Current and Previous-Future Data

In [None]:
# Append the previous-future departures to the combined data.
new_df = pd.concat([new_df, prev_future_df])
# For each DepartureID keep the row that sorts last by ReportID, so a row from
# a higher-numbered report replaces one from a lower-numbered report.
new_df = new_df.sort_values(by=['ReportID', 'DepartureID'])
new_df = new_df.drop_duplicates(subset='DepartureID', keep='last')
# Return to chronological order.
new_df = new_df.sort_values(by='DepartureDate')
new_df

## Check Mixed Data

In [None]:
# prof = pandas_profiling.ProfileReport(new_df)
# prof.to_file(output_file='gate1_report.html')

## Export CSV

In [None]:
# DepartureDate is not written to the report file.
new_df = new_df.drop(columns='DepartureDate')
# Every exported row gets this run's report ID.
new_df['ReportID'] = report_ID
# verify_integrity makes set_index raise if a (ReportID, DepartureID) pair
# occurs more than once.
new_df = new_df.set_index(['ReportID', 'DepartureID'], verify_integrity=True)
new_df.to_csv(new_report_file)