# AT3 - Airfare Prediction: Data preparation from the zipped raw data - itineraries_csv.zip

## This is the first notebook to be executed to load and prepare the dataset to be used for modelling. 
1. Call the custom function to unzip all the files in the raw data folder and write them as CSV files into the /data/interim folder
    - Load one of the extracted CSV files as a sample and call the custom function for data exploration
2. Extract only the required columns from each CSV file and write them as Parquet files into the /data/interim/extractedFeatures1 folder
    - Load one of the extracted Parquet files as a sample and call the custom function for data exploration
3. Basic exploration on the time and total fare on a sample data

# 1. Loading the dataset

## 1.1. Launch commands to automatically reload modules

In [1]:
%load_ext autoreload
%autoreload 2

## 1.2. Import the packages and custom functions

In [2]:
import pandas as pd
import numpy as np
import sys
import os
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# Get the current working directory (the notebook is expected to be launched
# from a folder that is a sibling of `src`, e.g. `notebooks/`)
current_dir = os.getcwd()

# Add the src directory to sys.path so the project-local `models` package can
# be imported. Guard against duplicates: without the check, every re-run of
# this cell would append the same entry to sys.path again.
src_dir = os.path.abspath(os.path.join(current_dir, '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import the custom functions
from models.data_exploration import display_data_info
from models.data_extraction import extract_nested_zips

## 1.3. Load the dataset

In [3]:
# Paths are relative to the notebook's working directory.
main_zip_file_path = '../data/raw/itineraries_csv.zip'  # Path to the main (outer) zip archive
output_dir = '../data/interim'        # Directory the extracted CSV files are written to

# Call the project-local helper to unpack the raw archive into `output_dir`.
# NOTE(review): judging by its name the helper also unpacks zip files nested
# inside the main archive - confirm against models/data_extraction.py.
extract_nested_zips(main_zip_file_path, output_dir)



## 1.4 Load the sample dataset into pandas dataframe and explore

In [4]:
# Load one of the extracted CSV files as a sample for exploration.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns.
df_sample = pd.read_csv('../data/interim/ATL_itineraries_aa.csv',low_memory=False)

In [5]:
# Print an overview of the sample via the project-local helper (the recorded
# output below starts with the top 5 rows of the frame).
display_data_info(df_sample)

Top 5 rows of data:
                              legId  searchDate  flightDate startingAirport  \
0  9ca0e81111c683bec1012473feefd28f  2022-04-16  2022-04-17             ATL   
1  98685953630e772a098941b71906592b  2022-04-16  2022-04-17             ATL   
2  98d90cbc32bfbb05c2fc32897c7c1087  2022-04-16  2022-04-17             ATL   
3  969a269d38eae583f455486fa90877b4  2022-04-16  2022-04-17             ATL   
4  980370cf27c89b40d2833a1d5afc9751  2022-04-16  2022-04-17             ATL   

  destinationAirport travelDuration  isBasicEconomy  isRefundable  isNonStop  \
0                BOS        PT2H29M           False         False       True   
1                BOS        PT2H30M           False         False       True   
2                BOS        PT2H30M           False         False       True   
3                BOS        PT2H32M           False         False       True   
4                BOS        PT2H34M           False         False       True   

   totalFare  ...  segme

# 2. Data Extraction and preparation (including feature engineering during extraction)

## 2.1 Extract the required columns and write into parquet file in the interim data folder

In [6]:
# Import the custom function
from models.data_extraction import extract_columns_from_csv

In [None]:
# Source folder holding the unzipped CSV files and the target folder for the
# reduced per-file outputs.
folder_path = '../data/interim'  
output_folder = '../data/interim/extractedFeatures1'  

# Raw columns handed to the extraction helper.
# NOTE(review): later cells read `departure_hour` and `departure_minute`, which
# are not listed here - presumably the helper derives them from
# `segmentsDepartureTimeRaw` during extraction; confirm in
# models/data_extraction.py.
columns_to_extract = [
    'startingAirport',
    'destinationAirport',
    'totalFare',
    'segmentsDepartureTimeRaw',
    'segmentsCabinCode'
]

# Per the recorded output, each CSV is written to `output_folder` as
# 'extracted_<name>.parquet' and the ORIGINAL CSV IS DELETED afterwards.
# Consequence: after this cell has run, the earlier cell that reads
# '../data/interim/ATL_itineraries_aa.csv' only works again once the zip
# archive has been re-extracted.
extract_columns_from_csv(folder_path, output_folder, columns_to_extract)

Extracted columns from 'ATL_itineraries_aa.csv' and saved to '../data/interim/extractedFeatures1\extracted_ATL_itineraries_aa.parquet'.
Deleted original file '../data/interim\ATL_itineraries_aa.csv'.
Extracted columns from 'ATL_itineraries_ab.csv' and saved to '../data/interim/extractedFeatures1\extracted_ATL_itineraries_ab.parquet'.
Deleted original file '../data/interim\ATL_itineraries_ab.csv'.
Extracted columns from 'ATL_itineraries_ac.csv' and saved to '../data/interim/extractedFeatures1\extracted_ATL_itineraries_ac.parquet'.
Deleted original file '../data/interim\ATL_itineraries_ac.csv'.
Extracted columns from 'ATL_itineraries_ad.csv' and saved to '../data/interim/extractedFeatures1\extracted_ATL_itineraries_ad.parquet'.
Deleted original file '../data/interim\ATL_itineraries_ad.csv'.
Extracted columns from 'ATL_itineraries_ae.csv' and saved to '../data/interim/extractedFeatures1\extracted_ATL_itineraries_ae.parquet'.
Deleted original file '../data/interim\ATL_itineraries_ae.csv'.


# 3. Exploring the dataset

## 3.1 Load the sample dataset into pandas dataframe and explore

In [None]:
# Load one of the extracted Parquet files as a sample for exploration.
# This rebinds `df_sample`, replacing the raw CSV sample loaded earlier.
df_sample = pd.read_parquet('../data/interim/extractedFeatures1/extracted_ATL_itineraries_aa.parquet')

# Print an overview of the reduced sample via the project-local helper.
display_data_info(df_sample)

## 3.2 Exploring the time and fare features

In [None]:
# Combined, zero-padded HH:MM label kept on the sample for display purposes.
# The hour is padded as well so that labels such as '09:05' and '10:05'
# compare in chronological order.
df_sample['departure_time'] = (
    df_sample['departure_hour'].astype(str).str.zfill(2)
    + ':'
    + df_sample['departure_minute'].astype(str).str.zfill(2)
)

# Plot against a NUMERIC time of day (hours since midnight) rather than the
# string label: a string x-axis is treated as categorical, which orders the
# points by first appearance instead of chronologically and draws one tick
# label per distinct departure time.
# NOTE(review): assumes departure_hour / departure_minute hold numeric values
# (hour 0-23, minute 0-59); values that cannot be parsed become NaN and are
# simply not drawn.
scatter_df = df_sample.assign(
    departure_hour_of_day=(
        pd.to_numeric(df_sample['departure_hour'], errors='coerce')
        + pd.to_numeric(df_sample['departure_minute'], errors='coerce') / 60
    )
)

# Set up the matplotlib figure (explicit fig/ax instead of pyplot global state)
fig, ax = plt.subplots(figsize=(12, 6))

# Create a scatter plot to visualize totalFare vs departure time
sns.scatterplot(
    data=scatter_df,
    x='departure_hour_of_day',
    y='totalFare',
    hue='departure_hour',
    palette='viridis',
    s=100,
    ax=ax,
)

# One tick every two hours, rendered as HH:00 and rotated for readability
hour_ticks = list(range(0, 25, 2))
ax.set_xticks(hour_ticks)
ax.set_xticklabels([f'{hour:02d}:00' for hour in hour_ticks], rotation=45)

# Add labels and title
ax.set_title('Total Fare vs Departure Time')
ax.set_xlabel('Departure Time (HH:MM)')
ax.set_ylabel('Total Fare')
ax.legend(title='Departure Hour')
ax.grid(True)
fig.tight_layout()

# Show the plot
plt.show()

In [None]:
# Create dropdowns for dynamic filtering.
# Options are passed as sorted, NaN-free plain lists so the menus are easy to
# scan and never offer a missing value as a selectable airport.
# `description_width='initial'` lets the label take its natural width, so the
# long descriptions are not truncated.
starting_airport_dropdown = widgets.Dropdown(
    options=sorted(df_sample['startingAirport'].dropna().unique()),
    description='Starting Airport:',
    style={'description_width': 'initial'},
)

destination_airport_dropdown = widgets.Dropdown(
    options=sorted(df_sample['destinationAirport'].dropna().unique()),
    description='Destination Airport:',
    style={'description_width': 'initial'},
)

# Function to update plots based on selected values
def update_plots(selected_starting_airport, selected_destination_airport):
    """Plot fare against departure minute for one route of ``df_sample``.

    Reads the notebook-level ``df_sample`` frame, keeps the rows whose
    ``startingAirport`` / ``destinationAirport`` match the selection and draws:

    1. a violin plot of ``totalFare`` per ``departure_minute``;
    2. a grid of box plots of ``totalFare`` per ``departure_minute``, one
       panel per ``departure_hour``.

    Parameters
    ----------
    selected_starting_airport
        Value compared against the ``startingAirport`` column.
    selected_destination_airport
        Value compared against the ``destinationAirport`` column.

    Returns
    -------
    None
        The figures are shown as a side effect. If no row matches the route,
        a message is printed and nothing is drawn.
    """
    # Filter the DataFrame
    route_mask = (
        (df_sample['startingAirport'] == selected_starting_airport)
        & (df_sample['destinationAirport'] == selected_destination_airport)
    )
    filtered_df = df_sample[route_mask]

    # Check if filtered DataFrame is empty
    if filtered_df.empty:
        print("No data available for the selected route.")
        return

    # 1. Violin Plot (drawn on an explicit Axes instead of pyplot global state)
    # NOTE(review): recent seaborn releases warn when `palette` is given
    # without `hue`; left unchanged here to keep the colours as they are.
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.violinplot(data=filtered_df, x='departure_minute', y='totalFare', inner='quartile', palette='viridis', ax=ax)
    ax.set_title(f'Total Fare Distribution by Departure Minute (From {selected_starting_airport} to {selected_destination_airport})')
    ax.set_xlabel('Departure Minute')
    ax.set_ylabel('Total Fare')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    # 2. Facet Grid
    # A fixed `order` is passed so every panel uses the same category
    # positions when a categorical plot is mapped onto the grid.
    minute_order = sorted(filtered_df['departure_minute'].unique())
    g = sns.FacetGrid(filtered_df, col='departure_hour', col_wrap=3, height=4, sharey=False)
    g.map(sns.boxplot, 'departure_minute', 'totalFare', order=minute_order)
    g.set_titles(col_template='Hour: {col_name}')
    g.set_axis_labels('Departure Minute', 'Total Fare')
    # Work on the grid's own figure: `.figure` replaces the deprecated `.fig`
    # alias, and adjusting it directly does not rely on it being the current
    # pyplot figure. The top margin leaves room for the super-title.
    g.figure.subplots_adjust(top=0.9)
    g.figure.suptitle(f'Total Fare by Departure Minute for Each Hour (From {selected_starting_airport} to {selected_destination_airport})')
    plt.show()

# Link dropdowns to update function: `interactive` calls `update_plots` with
# the current dropdown values whenever a selection changes. Being the last
# expression of the cell, the resulting widget is displayed as the cell output.
widgets.interactive(update_plots, 
                    selected_starting_airport=starting_airport_dropdown, 
                    selected_destination_airport=destination_airport_dropdown)