In [1]:
import json
import os
import re

import pandas as pd

In [2]:
from amadeus import Client, ResponseError

# Credentials are read from the environment so that secrets are never stored in
# (or committed with) the notebook. Export AMADEUS_CLIENT_ID and
# AMADEUS_CLIENT_SECRET before starting the kernel; a missing variable fails
# fast with a KeyError naming it. Any key pair that was previously hardcoded
# here should be treated as leaked and rotated.
amadeus = Client(
    client_id=os.environ['AMADEUS_CLIENT_ID'],
    client_secret=os.environ['AMADEUS_CLIENT_SECRET'],
)

In [78]:
# Query one-way flight offers AKL -> NRT for a single adult.
try:
    response = amadeus.shopping.flight_offers_search.get(
        originLocationCode='AKL',
        destinationLocationCode='NRT',
        departureDate='2024-04-12',
        adults=1)
except ResponseError as error:
    # Show the API error, then re-raise: without a fresh `response` the
    # following cells would either fail with a NameError or silently reuse a
    # stale result left over from an earlier run of this cell.
    print(error)
    raise

In [79]:
# ISO 8601 duration restricted to the day/hour/minute/second designators.
# Year, month and week designators are deliberately not accepted because they
# do not correspond to a fixed number of minutes.
_ISO_DURATION_PATTERN = re.compile(
    r'P(?:(?P<days>\d+)D)?'
    r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+(?:\.\d+)?)S)?)?'
)


def parse_duration(duration_str):
    """
    Parses a duration string in ISO 8601 format and returns the total duration in minutes.

    Supported designators are days, hours, minutes and seconds, e.g. 'PT2H10M',
    'PT45M', 'P1DT2H30M' or 'PT2H30M15S'.

    Parameters
    ----------
    duration_str : str
        The ISO 8601 duration to parse.

    Returns
    -------
    int
        Whole minutes in the duration. A seconds component is accepted but
        does not contribute to the result (the value is truncated to minutes).

    Raises
    ------
    ValueError
        If `duration_str` is not a string in the supported format.
    """
    match = _ISO_DURATION_PATTERN.fullmatch(duration_str) if isinstance(duration_str, str) else None
    if match is None:
        raise ValueError(f"Unsupported ISO 8601 duration: {duration_str!r}")

    # Absent components are reported as None by the regex; treat them as zero.
    days = int(match.group('days') or 0)
    hours = int(match.group('hours') or 0)
    minutes = int(match.group('minutes') or 0)
    return days * 24 * 60 + hours * 60 + minutes

def time_of_day(hour):
    """
    Map an hour of the day to a named part of the day.

    Bands are half-open: [0, 6) 'Early Morning', [6, 12) 'Morning',
    [12, 17) 'Afternoon', [17, 21) 'Evening'. Any value outside those bands
    is labelled 'Night'.
    """
    day_parts = (
        (0, 6, 'Early Morning'),
        (6, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for band_start, band_end, label in day_parts:
        if band_start <= hour < band_end:
            return label
    return 'Night'
    
def categorize_flight_haul(duration):
    """
    Label a flight by its duration in minutes.

    Up to and including 180 minutes (3 hours) is 'Short-haul', above that and
    up to and including 360 minutes (6 hours) is 'Medium-haul', and anything
    else is 'Long-haul'.
    """
    if duration <= 180:
        return 'Short-haul'
    return 'Medium-haul' if duration <= 360 else 'Long-haul'
    
def is_overnight_flight(row, min_duration_minutes=60):
    """
    Determine if a flight is an overnight flight.
    A flight is considered overnight if it arrives on a later calendar day than
    it departs and lasts longer than `min_duration_minutes`.

    The calendar dates are compared directly rather than inferred from the
    hour-of-day values. The hour-based inference misclassified itineraries
    whose arrival hour equals the departure hour on the following day (e.g.
    08:40 -> next day 08:10) and itineraries that land at an earlier local hour
    on the same date.

    Parameters
    ----------
    row : mapping
        Must provide 'DepartureDateTime' and 'ArrivalDateTime' (objects with a
        `.date()` method, such as `datetime` or `pandas.Timestamp`) and
        'TotalDuration' in minutes. Timestamps are compared exactly as stored;
        no time-zone conversion is applied.
    min_duration_minutes : int, optional
        Flights of this duration or shorter are never flagged, so a short hop
        that merely straddles midnight is not counted. Defaults to 60.

    Returns
    -------
    bool
        True if the flight is overnight, False otherwise.
    """
    departure_date = row['DepartureDateTime'].date()
    arrival_date = row['ArrivalDateTime'].date()
    arrives_on_later_date = arrival_date > departure_date
    return bool(arrives_on_later_date and row['TotalDuration'] > min_duration_minutes)

def _index_fares_by_segment(offer):
    """Group an offer's fare details by the segment id they refer to.

    Returns a dict mapping each segment id to the list of matching
    fare-detail dicts, ordered by traveler pricing and then by their position
    within that traveler's fare details.
    """
    fares_by_segment = {}
    for traveler_pricing in offer["travelerPricings"]:
        for fare_detail in traveler_pricing["fareDetailsBySegment"]:
            fares_by_segment.setdefault(fare_detail["segmentId"], []).append(fare_detail)
    return fares_by_segment


def _categorize_fares(totals):
    """Bucket total prices into 'Low' / 'Medium' / 'High' terciles.

    When the tercile edges are not unique (for example a single offer, or a
    large share of identical prices) `pd.qcut` cannot build three bins; in
    that case every row gets a missing category instead of raising.
    """
    labels = ['Low', 'Medium', 'High']
    try:
        return pd.qcut(totals, q=3, labels=labels)
    except ValueError:
        missing = pd.Categorical([None] * len(totals), categories=labels, ordered=True)
        return pd.Series(missing, index=totals.index)


def parse_all_data(flight_data):
    """Flatten flight offers into four related DataFrames.

    Every itinerary of every offer receives a sequential ``ItineraryID`` and
    every segment a sequential ``FlightID`` (both starting at 1); these ids
    link the returned tables together.

    Parameters
    ----------
    flight_data : list of dict
        Flight offers. Each offer must provide "itineraries" (each with
        "duration" and "segments"), "travelerPricings" (each with
        "fareDetailsBySegment") and "price" (with "currency", "total", "base").

    Returns
    -------
    tuple of pandas.DataFrame
        ``(segments_data, prices_data, flights_data, fares_data)``:

        * segments_data - one row per segment.
        * prices_data - one row per itinerary with the offer price, a tercile
          ``FareCategory`` and ``TotalZScore``.
        * flights_data - one row per itinerary with derived departure,
          haul-length, non-stop and overnight flags.
        * fares_data - one row per matching fare detail per segment, plus
          ``FlightsPerItinerary`` (number of distinct flights in the itinerary).

    Raises
    ------
    ValueError
        If `flight_data` is empty, since no table can be derived from it.
    """
    if not flight_data:
        raise ValueError("flight_data is empty: there are no flight offers to parse")

    # Plain record lists are collected first and converted to DataFrames once.
    segment_records = []
    price_records = []
    flight_records = []
    fare_records = []

    itinerary_id_counter = 1
    flight_id_counter = 1

    for offer in flight_data:
        # Index the fare details once per offer instead of re-scanning every
        # traveler pricing for each individual segment.
        fares_by_segment = _index_fares_by_segment(offer)

        for itinerary in offer["itineraries"]:
            segments = itinerary["segments"]
            itinerary_duration = parse_duration(itinerary["duration"])

            for segment in segments:
                aircraft_code = segment["aircraft"]["code"] if "aircraft" in segment else "Unknown"
                segment_records.append({
                    "ItineraryID": itinerary_id_counter,
                    "FlightID": flight_id_counter,
                    "Duration": parse_duration(segment["duration"]),
                    "Departure": segment["departure"]["iataCode"],
                    "Arrival": segment["arrival"]["iataCode"],
                    "CarrierCode": segment["carrierCode"],
                    "DepartureDateTime": segment["departure"]["at"],
                    "ArrivalDateTime": segment["arrival"]["at"],
                    "NumberOfStops": segment["numberOfStops"],
                    "AircraftCode": aircraft_code
                })

                # Fare details are optional per segment: no match adds no rows.
                for fare_detail in fares_by_segment.get(segment["id"], []):
                    fare_records.append({
                        "ItineraryID": itinerary_id_counter,
                        "FlightID": flight_id_counter,
                        "Cabin": fare_detail["cabin"],
                        "Class": fare_detail["class"],
                        "FareBasis": fare_detail["fareBasis"],
                        "IncludedCheckedBagsQuantity": fare_detail.get("includedCheckedBags", {}).get("quantity", 0)
                    })

                flight_id_counter += 1

            # The itinerary as a whole: first departure to last arrival.
            flight_records.append({
                "ItineraryID": itinerary_id_counter,
                "Departure": segments[0]["departure"]["iataCode"],
                "Arrival": segments[-1]["arrival"]["iataCode"],
                "DepartureDateTime": segments[0]["departure"]["at"],
                "ArrivalDateTime": segments[-1]["arrival"]["at"],
                "TotalDuration": itinerary_duration,
                "NumberOfStops": len(segments) - 1,
            })

            # The price belongs to the offer and is repeated for each of its itineraries.
            price_records.append({
                "ItineraryID": itinerary_id_counter,
                "Currency": offer["price"]["currency"],
                "Total": offer["price"]["total"],
                "Base": offer["price"]["base"],
            })

            itinerary_id_counter += 1

    segments_data = pd.DataFrame(segment_records)
    segments_data['DepartureDateTime'] = pd.to_datetime(segments_data['DepartureDateTime'])
    segments_data['ArrivalDateTime'] = pd.to_datetime(segments_data['ArrivalDateTime'])

    flights_data = pd.DataFrame(flight_records)
    flights_data['DepartureDateTime'] = pd.to_datetime(flights_data['DepartureDateTime'])
    flights_data['ArrivalDateTime'] = pd.to_datetime(flights_data['ArrivalDateTime'])
    flights_data['Departure_TimeOfDay'] = flights_data['DepartureDateTime'].dt.hour.apply(time_of_day)
    # dayofweek: Monday is 0, so 5 and 6 are Saturday and Sunday.
    flights_data['IsWeekend_Departure'] = flights_data['DepartureDateTime'].dt.dayofweek >= 5
    flights_data['FlightHaul'] = flights_data['TotalDuration'].apply(categorize_flight_haul)
    flights_data['IsNonStop'] = flights_data['NumberOfStops'] == 0
    flights_data['IsOverNightFlight'] = flights_data.apply(is_overnight_flight, axis=1)

    prices_data = pd.DataFrame(price_records)
    # Prices are converted to float before any numeric work is done on them.
    prices_data['Total'] = prices_data['Total'].astype(float)
    prices_data['Base'] = prices_data['Base'].astype(float)
    prices_data['FareCategory'] = _categorize_fares(prices_data['Total'])
    prices_data['TotalZScore'] = (prices_data['Total'] - prices_data['Total'].mean()) / prices_data['Total'].std()

    fares_data = pd.DataFrame(fare_records)
    # Count distinct flights rather than rows: with several traveler pricings
    # each segment contributes one fare row per traveler, which would inflate
    # a plain row count.
    fares_data['FlightsPerItinerary'] = fares_data.groupby('ItineraryID')['FlightID'].transform('nunique')

    return segments_data, prices_data, flights_data, fares_data

# Parse the offers returned by the flight search into four linked DataFrames
# (segments, prices, itinerary-level flights, and per-segment fares).
segments_data, prices_data, flights_data, fares_data = parse_all_data(response.data)

In [82]:
# Preview up to five fare rows. The fixed seed keeps the preview identical
# across re-runs, and the size cap avoids an error on frames with < 5 rows.
fares_data.sample(n=min(5, len(fares_data)), random_state=0)

Unnamed: 0,ItineraryID,FlightID,Cabin,Class,FareBasis,IncludedCheckedBagsQuantity,FlightsPerItinerary
18,10,19,ECONOMY,T,TLONZ,2,2
29,16,30,PREMIUM_ECONOMY,R,RR99NNEO,2,2
7,4,8,ECONOMY,S,SSJN,0,2
24,13,25,ECONOMY,C,CLOW2,0,2
13,7,14,ECONOMY,S,SSJN,0,2


In [83]:
# Preview up to five price rows. The fixed seed keeps the preview identical
# across re-runs, and the size cap avoids an error on frames with < 5 rows.
prices_data.sample(n=min(5, len(prices_data)), random_state=0)

Unnamed: 0,ItineraryID,Currency,Total,Base,FareCategory,TotalZScore
13,14,EUR,1153.7,979.0,Medium,-0.383691
18,19,EUR,1182.47,979.0,Medium,-0.363506
27,28,EUR,3254.52,3117.0,High,1.090267
20,21,EUR,1822.11,1719.0,Medium,0.085273
6,7,EUR,619.66,500.0,Low,-0.75838


In [84]:
# Preview up to five segment rows. The fixed seed keeps the preview identical
# across re-runs, and the size cap avoids an error on frames with < 5 rows.
segments_data.sample(n=min(5, len(segments_data)), random_state=0)

Unnamed: 0,ItineraryID,FlightID,Duration,Departure,Arrival,CarrierCode,DepartureDateTime,ArrivalDateTime,NumberOfStops,AircraftCode
12,7,13,250,AKL,MEL,QF,2024-04-12 20:00:00,2024-04-12 22:10:00,0,73H
41,22,42,665,AKL,NRT,NH,2024-04-12 08:45:00,2024-04-12 16:50:00,0,789
59,31,60,540,HNL,NRT,HA,2024-04-12 12:00:00,2024-04-13 16:00:00,0,332
33,18,34,685,AKL,HKG,CX,2024-04-12 08:40:00,2024-04-12 16:05:00,0,789
50,27,51,750,AKL,PVG,MU,2024-04-12 21:00:00,2024-04-13 05:30:00,0,332


In [85]:
# Preview up to five itinerary rows. The fixed seed keeps the preview identical
# across re-runs, and the size cap avoids an error on frames with < 5 rows.
flights_data.sample(n=min(5, len(flights_data)), random_state=0)

Unnamed: 0,ItineraryID,Departure,Arrival,DepartureDateTime,ArrivalDateTime,TotalDuration,NumberOfStops,Departure_TimeOfDay,IsWeekend_Departure,FlightHaul,IsNonStop,IsOverNightFlight
19,20,AKL,NRT,2024-04-12 22:55:00,2024-04-13 14:25:00,1110,1,Night,False,Long-haul,False,True
23,24,AKL,NRT,2024-04-12 00:25:00,2024-04-12 18:05:00,1240,1,Early Morning,False,Long-haul,False,False
10,11,AKL,NRT,2024-04-12 00:15:00,2024-04-12 18:05:00,1250,1,Early Morning,False,Long-haul,False,False
15,16,AKL,NRT,2024-04-12 08:40:00,2024-04-13 14:10:00,1950,1,Morning,False,Long-haul,False,True
14,15,AKL,NRT,2024-04-12 08:40:00,2024-04-13 13:10:00,1890,1,Morning,False,Long-haul,False,True


In [None]:
# from datetime import datetime

# # Define the datetime format
# fmt = '%Y-%m-%dT%H:%M:%S'

# # Parse the datetime strings
# start = datetime.strptime('2024-04-05T23:30:00', fmt)
# end = datetime.strptime('2024-04-06T00:20:00', fmt)

# # Calculate the duration in minutes
# duration = (end - start).total_seconds() / 60
# duration