# Dataset Generation

In [None]:
%pip install pandas ipywidgets 

In [None]:
%pip install faker

In [18]:
import random
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import os
from faker import Faker
from collections import Counter


# --- Output location ---
# The generated CSV goes to ~/Desktop/A; the folder is created if it is missing.
desktop_path = os.path.expanduser("~/Desktop/A")
os.makedirs(desktop_path, exist_ok=True)
file_path = os.path.join(desktop_path, "luggage_data_sle.csv")

# Output widget used as the print/display target of the button handler.
output = widgets.Output()

# Faker instance that supplies random delivery addresses.
fake = Faker()

# --- Lookup tables ---
# Upper bound on the number of bags per flight, keyed by passenger capacity.
total_bags_per_flight = {128: 300, 250: 650, 300: 780, 350: 870}

# Phone country prefix, keyed by the three-digit airline code.
airline_country_codes = {
    "176": "+971",  # UAE
    "001": "+1",    # USA
    "125": "+44",   # UK
    "157": "+974",  # Qatar
    "131": "+81",   # Japan
}


def get_cluster_size_distribution(luggage_dist, small_rng, large_rng, equal_rng):
    """
    Builds the probability table used to pick cluster sizes for a luggage distribution type.

    Args:
        luggage_dist (str): Distribution type ('small', 'large', or 'equal').
        small_rng (tuple): Inclusive (min, max) cluster sizes for the 'small' distribution.
        large_rng (tuple): Inclusive (min, max) cluster sizes for the 'large' distribution.
        equal_rng (tuple): Inclusive (min, max) cluster sizes for the 'equal' distribution.

    Returns:
        dict: Cluster size -> probability.
              - 'small' / 'large': every size in the matching range gets the same probability.
              - 'equal': sizes found in the small or large range share a total weight of 0.95;
                         sizes found only in the equal range ("rare" sizes) share the remaining 0.05.
                         When either of those two groups is empty, the equal range is uniform instead.
              - An empty range, or any other distribution name, yields an empty dict.
    """
    def _span(bounds):
        # Inclusive integer range described by a (min, max) pair.
        return range(bounds[0], bounds[1] + 1)

    def _uniform(sizes):
        # Same probability for every size; empty input gives an empty table.
        sizes = list(sizes)
        if not sizes:
            return {}
        share = 1.0 / len(sizes)
        return {size: share for size in sizes}

    if luggage_dist == "small":
        return _uniform(_span(small_rng))
    if luggage_dist == "large":
        return _uniform(_span(large_rng))
    if luggage_dist != "equal":
        return {}

    common_sizes = set(_span(small_rng)) | set(_span(large_rng))
    equal_sizes = set(_span(equal_rng))
    rare_sizes = equal_sizes - common_sizes

    if not (common_sizes and rare_sizes):
        return _uniform(sorted(equal_sizes))

    # Common sizes are inserted first, then rare ones, each group in ascending order.
    table = {size: 0.95 / len(common_sizes) for size in sorted(common_sizes)}
    table.update({size: 0.05 / len(rare_sizes) for size in sorted(rare_sizes)})
    return table

def generate_dataset(passenger_capacity, luggage_dist, slhs_percent, small_rng, large_rng, equal_rng,
                     percent_0_bag, percent_1_bag, percent_2_bag):
    """
    Generates a luggage dataset for three flights (one per airline, first three airlines only).

    Every passenger gets a phone number and 0, 1 or 2 bags. Bags are then grouped into
    clusters (reservation groups); both bags of a 2-bag passenger always land in the same
    cluster. The number of bags per flight never exceeds total_bags_per_flight[passenger_capacity].

    Args:
        passenger_capacity (int): Number of passengers per flight; must be a key of
            total_bags_per_flight (128, 250, 300 or 350).
        luggage_dist (str): Luggage distribution type ('small', 'large', 'equal').
        slhs_percent (float): Probability in [0, 1] that a cluster is handled by SLHS
            (it is compared directly against random.random()).
        small_rng (tuple): Inclusive (min, max) cluster sizes for the 'small' distribution.
        large_rng (tuple): Inclusive (min, max) cluster sizes for the 'large' distribution.
        equal_rng (tuple): Inclusive (min, max) cluster sizes for the 'equal' distribution.
        percent_0_bag (float): Percentage (0-100) of passengers with 0 bags.
        percent_1_bag (float): Percentage (0-100) of passengers with 1 bag.
        percent_2_bag (float): Accepted for interface compatibility but not read: every
            passenger that is not a 0-bag or 1-bag passenger is treated as a 2-bag passenger.

    Returns:
        pandas.DataFrame: One row per bag, plus one row (with empty bag fields) per 0-bag passenger.

    Raises:
        KeyError: If passenger_capacity is not in total_bags_per_flight or luggage_dist is unknown.
    """
    data = []
    bag_counter = 1
    reservation_counter = 1

    # Airline setup: only the first three airlines get a flight.
    airlines = ["176", "001", "125", "157", "131"]
    iata_codes = {"176": "EK", "001": "AA", "125": "BA", "157": "QR", "131": "JL"}
    flight_airlines = airlines[:3]
    flight_numbers = [f"{iata_codes[airline]}{random.randint(1000,9999)}" for airline in flight_airlines]

    # --- Values that are identical for every flight ---
    max_bags_for_flight = total_bags_per_flight[passenger_capacity]

    # Passenger split. 2-bag passengers take whatever is left, so the three groups
    # always add up to passenger_capacity regardless of rounding.
    num_0 = int(passenger_capacity * (percent_0_bag / 100.0))
    num_1 = int(passenger_capacity * (percent_1_bag / 100.0))
    num_2 = passenger_capacity - num_0 - num_1

    if luggage_dist == "equal":
        allowed_min, allowed_max = equal_rng[0], equal_rng[1]
    else:
        range_map = {"small": small_rng, "large": large_rng}
        allowed_min, allowed_max = range_map[luggage_dist]

    cluster_probabilities = get_cluster_size_distribution(luggage_dist, small_rng, large_rng, equal_rng)

    def make_phone(country_code):
        """Return a random 10-digit phone number with the given country prefix."""
        phone_number = f"+{country_code[1:]}{random.randint(1000000000, 9999999999):010d}"
        # Wrapped as ="..." -- presumably so spreadsheet tools keep the number as text.
        return f'="{phone_number}"'

    # Pair each flight with its airline directly instead of searching by index.
    for airline_code, flight_number in zip(flight_airlines, flight_numbers):
        country_code = airline_country_codes[airline_code]
        total_bags = 0

        passengers_0_bag = [make_phone(country_code) for _ in range(num_0)]

        # 1-bag passengers are admitted first, then 2-bag passengers, each only
        # while the flight's bag limit is not exceeded.
        passengers_1_bag = []
        for _ in range(num_1):
            if total_bags + 1 > max_bags_for_flight:
                break
            passengers_1_bag.append(make_phone(country_code))
            total_bags += 1

        passengers_2_bag = []
        for _ in range(num_2):
            if total_bags + 2 > max_bags_for_flight:
                break
            passengers_2_bag.append(make_phone(country_code))
            total_bags += 2

        # Combine all passengers as (phone, bag count) and shuffle
        passengers_with_bags = []
        passengers_with_bags.extend((phone, 0) for phone in passengers_0_bag)
        passengers_with_bags.extend((phone, 1) for phone in passengers_1_bag)
        passengers_with_bags.extend((phone, 2) for phone in passengers_2_bag)
        random.shuffle(passengers_with_bags)

        # Emit rows for 0-bag passengers right away; queue everyone else's bags
        # (one list of bag records per passenger) for cluster assignment.
        passenger_bag_queue = []
        for phone, num_bags in passengers_with_bags:
            if num_bags == 0:
                data.append({
                    "Flight_Number": flight_number,
                    "Bag_ID": None,
                    "Phone_Number": phone,
                    "Reservation_ID": None,
                    "Number_of_Bags": 0,
                    "Handled_by_SLHS": None,
                    "Claim_Option": None,
                    "Delivery_Address": None,
                    "Bag_Assignment_Status": "No Luggage"
                })
                continue

            bag_group = []
            for _ in range(num_bags):
                bag_id = f"{airline_code}-{flight_number}-{bag_counter:06d}"
                bag_counter += 1
                bag_group.append({
                    "Flight_Number": flight_number,
                    "Bag_ID": bag_id,
                    "Phone_Number": phone,
                    "Number_of_Bags": num_bags,
                    "Handled_by_SLHS": None,
                    "Claim_Option": None,
                    "Delivery_Address": None,
                })
            passenger_bag_queue.append(bag_group)

        # --- Cluster (Reservation Group) Assignment Logic ---
        two_bag_groups = [group for group in passenger_bag_queue if len(group) == 2]
        one_bag_groups = [group for group in passenger_bag_queue if len(group) == 1]
        clusters = []

        while two_bag_groups or one_bag_groups:
            rem = 2 * len(two_bag_groups) + len(one_bag_groups)
            if rem < allowed_min:
                # Too few bags left for a regular cluster: put all of them in one final cluster.
                clusters.append(two_bag_groups + one_bag_groups)
                two_bag_groups = []
                one_bag_groups = []
                break

            # A size is a candidate when it can be built from whole passengers
            # (size // 2 two-bag passengers, plus one one-bag passenger if the size is odd)
            # and the bags left afterwards are either none or enough for another cluster.
            candidates = []
            for group_size in cluster_probabilities:
                # Sizes below 1 would create empty clusters and never make progress.
                if group_size < 1 or group_size > rem:
                    continue
                needs_single = group_size % 2 == 1
                needed_twos = group_size // 2
                if len(two_bag_groups) < needed_twos:
                    continue
                if needs_single and not one_bag_groups:
                    continue
                leftover = rem - group_size
                if leftover == 0 or leftover >= allowed_min:
                    candidates.append((group_size, needed_twos, needs_single))

            if candidates:
                weights = [cluster_probabilities[size] for size, _, _ in candidates]
                _, needed_twos, needs_single = random.choices(candidates, weights=weights, k=1)[0]

                cluster = [two_bag_groups.pop(0) for _ in range(needed_twos)]
                if needs_single:
                    cluster.append(one_bag_groups.pop(0))
                clusters.append(cluster)
            else:
                # Fallback: greedily fill one cluster up to allowed_max, two-bag passengers first.
                cluster = []
                cluster_bag_count = 0
                while (two_bag_groups or one_bag_groups) and cluster_bag_count < allowed_max:
                    if two_bag_groups and cluster_bag_count + 2 <= allowed_max:
                        cluster.append(two_bag_groups.pop(0))
                        cluster_bag_count += 2
                    elif one_bag_groups and cluster_bag_count + 1 <= allowed_max:
                        cluster.append(one_bag_groups.pop(0))
                        cluster_bag_count += 1
                    else:
                        break
                if not cluster:
                    # Nothing fits under allowed_max (e.g. allowed_max == 1 with only two-bag
                    # passengers left). Place one passenger anyway so the loop always
                    # terminates; a two-bag passenger's bags must stay together.
                    cluster.append(two_bag_groups.pop(0) if two_bag_groups else one_bag_groups.pop(0))
                clusters.append(cluster)

        # Finalize cluster data: Reservation IDs, SLHS handling, Claim Options, Delivery Addresses
        for cluster in clusters:
            cluster_bags = [bag for group in cluster for bag in group]
            total_bags_in_cluster = len(cluster_bags)
            handled = random.random() < slhs_percent
            claim = "SLHS Electronic Gates" if handled else random.choice(["Delivery", "Pickup Outside the Terminal"])
            address = fake.address() if claim == "Delivery" else None
            # The last two digits of the reservation ID encode the cluster's bag count.
            res_id = f"RES-{reservation_counter:05d}{total_bags_in_cluster:02d}"
            reservation_counter += 1
            for bag in cluster_bags:
                bag.update({
                    "Reservation_ID": res_id,
                    "Handled_by_SLHS": handled,
                    "Claim_Option": claim,
                    "Delivery_Address": address,
                    "Bag_Assignment_Status": "Assigned"
                })
                data.append(bag)

    df = pd.DataFrame(data)
    df = df.drop(columns=['Bag_Assignment_Status'], errors='ignore') # Remove temporary column
    return df


# ---  Interactive Widgets ---
def _cluster_range_slider(label, initial):
    """Build an integer range slider spanning 1-20 with continuous updates disabled."""
    return widgets.IntRangeSlider(
        value=initial,
        min=1,
        max=20,
        description=label,
        continuous_update=False
    )


def _bag_percent_slider(label, initial):
    """Build a 0-100 float slider with a step of 1."""
    return widgets.FloatSlider(
        value=initial, min=0.0, max=100.0, step=1.0, description=label
    )


# Distribution type and the cluster-size range attached to each type.
luggage_distribution = widgets.Dropdown(
    options=["small", "large", "equal"],
    value="equal",
    description="Distribution:"
)
small_range = _cluster_range_slider("Small Range:", [2, 3])
large_range = _cluster_range_slider("Large Range:", [4, 5])
equal_range = _cluster_range_slider("Equal Range:", [2, 7])

# Flight size and SLHS share.
capacity_dropdown = widgets.Dropdown(
    options=[128, 250, 300, 350],
    value=128,
    description="Passenger Capacity:"
)
slhs_percent = widgets.Dropdown(
    options=[0.25, 0.5, 0.75],
    value=0.5,
    description="SLHS %:"
)

# Share of passengers travelling with 0, 1 and 2 bags.
percent_0_bag_slider = _bag_percent_slider("% 0 Bag:", 10.0)
percent_1_bag_slider = _bag_percent_slider("% 1 Bag:", 20.0)
percent_2_bag_slider = _bag_percent_slider("% 2 Bag:", 70.0)

generate_btn = widgets.Button(description="Generate Dataset")


def on_generate(b):
    """
    Click handler for the 'Generate Dataset' button.

    Rejects bag percentages that add up to more than 100, otherwise builds the
    dataset from the current widget values, writes it to file_path as CSV, prints
    the per-flight cluster size distribution and passenger/bag totals, and
    finally displays the DataFrame. All output goes to the `output` widget.

    Args:
        b: The clicked button (unused).
    """
    with output:
        clear_output(wait=True)

        pct_0 = percent_0_bag_slider.value
        pct_1 = percent_1_bag_slider.value
        pct_2 = percent_2_bag_slider.value
        if pct_0 + pct_1 + pct_2 > 100:
            print("Error: Total percentage of bags cannot exceed 100% (sum of all bags percentage). Please adjust the percentages.")
            return  # Invalid percentages: nothing is generated

        print("🔄 Generating dataset...")
        df = generate_dataset(
            capacity_dropdown.value,
            luggage_distribution.value,
            slhs_percent.value,
            small_range.value,
            large_range.value,
            equal_range.value,
            pct_0,
            pct_1,
            pct_2
        )
        df.to_csv(file_path, index=False)
        print(f"✅ Saved to: {file_path}")

        # --- Cluster distribution, one section per flight ---
        print("\nCluster Distribution per Flight:")
        for flight in df['Flight_Number'].unique():
            on_this_flight = df['Flight_Number'] == flight
            res_ids = df.loc[on_this_flight, 'Reservation_ID'].dropna().unique()
            # The last two characters of a reservation ID hold the cluster's bag count.
            size_tally = Counter(int(res_id[-2:]) for res_id in res_ids)

            print(f"Flight {flight}:")
            total_clusters = sum(size_tally.values())
            for size, count in sorted(size_tally.items()):
                percentage = (count / total_clusters) * 100 if total_clusters > 0 else 0.0
                print(f"  Cluster size {size}: {count} \t| Percentage: {percentage:.1f}%")
            print("-" * 30)

        # --- Passenger counts by number of bags (distinct phone numbers) ---
        bags_per_row = df['Number_of_Bags']
        people_0, people_1, people_2 = (
            df.loc[bags_per_row == n_bags, 'Phone_Number'].nunique() for n_bags in (0, 1, 2)
        )
        print(f"\nNumber of passengers with 0 bags: {people_0}")
        print(f"Number of passengers with 1 bag:  {people_1}")
        print(f"Number of passengers with 2 bags: {people_2}")

        bag_rows = df['Bag_ID'].notna().sum()
        print(f"\nTotal bags generated: {bag_rows} (Total Bag_ID generated)")

        display(df)


# Run on_generate whenever the button is clicked.
generate_btn.on_click(on_generate)

# --- Display Widgets  ---
# Input controls in display order; the button and the output area come last.
_input_controls = (
    luggage_distribution,
    small_range,
    large_range,
    equal_range,
    capacity_dropdown,
    slhs_percent,
    percent_0_bag_slider,
    percent_1_bag_slider,
    percent_2_bag_slider,
)
display(*_input_controls, generate_btn, output)

Dropdown(description='Distribution:', index=2, options=('small', 'large', 'equal'), value='equal')

IntRangeSlider(value=(2, 3), continuous_update=False, description='Small Range:', max=20, min=1)

IntRangeSlider(value=(4, 5), continuous_update=False, description='Large Range:', max=20, min=1)

IntRangeSlider(value=(2, 7), continuous_update=False, description='Equal Range:', max=20, min=1)

Dropdown(description='Passenger Capacity:', options=(128, 250, 300, 350), value=128)

Dropdown(description='SLHS %:', index=1, options=(0.25, 0.5, 0.75), value=0.5)

FloatSlider(value=10.0, description='% 0 Bag:', step=1.0)

FloatSlider(value=20.0, description='% 1 Bag:', step=1.0)

FloatSlider(value=70.0, description='% 2 Bag:', step=1.0)

Button(description='Generate Dataset', style=ButtonStyle())

Output()