In [12]:
import random
import string
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import os
from faker import Faker
from collections import Counter
from datetime import datetime, timedelta

In [14]:
# Shared Faker instance; generate_dataset uses it for last names and
# delivery addresses.
fake = Faker()

# Configuration Dictionaries
# Passenger capacity of a flight -> number of bags generated for that flight.
# generate_dataset looks the capacity up here and multiplies by its three flights.
total_bags_per_flight = {
    128: 300,
    250: 650,
    300: 780,
    350: 870
}

# Three-digit airline code (the same codes generate_dataset uses as Bag_ID
# prefixes) -> international dialling prefix handed to generate_phone_number.
airline_country_codes = {
    "176": "+971",  # UAE
    "001": "+1",    # USA
    "125": "+44",   # UK
    "131": "+81",   # Japan
    "065": "+966"   # Saudi Arabia
}

# Output widget that captures printed/displayed results inside the notebook.
output = widgets.Output()
# File path for saving output: a CSV on the current user's Desktop.
# NOTE: the Desktop directory is created as a side effect of running this cell.
desktop_path = os.path.expanduser("~/Desktop")
os.makedirs(desktop_path, exist_ok=True)
file_path = os.path.join(desktop_path, "luggage_dataset3.csv")


def generate_probabilities(luggage_dist, small_rng, large_rng, equal_rng):
    """Build the cluster-size probability table for a luggage distribution.

    Modes:
        "small": 75% of the weight is spread evenly over ``small_rng`` and
            25% evenly over ``large_rng``.
        "large": the mirror image (75% on ``large_rng``, 25% on ``small_rng``).
        "equal": sizes in ``equal_rng`` share a weight of 0.99 evenly and the
            rare sizes 8, 9 and 10 get a weight of 0.01 each.

    The raw weights are normalised before being returned, so the relative
    odds above are preserved while the values form a true probability
    distribution (the raw "equal" weights add up to 1.02).

    Args:
        luggage_dist: One of "small", "large" or "equal".
        small_rng: Inclusive ``(low, high)`` bounds of the small cluster sizes.
        large_rng: Inclusive ``(low, high)`` bounds of the large cluster sizes.
        equal_rng: Inclusive ``(low, high)`` bounds used by the "equal" mode.

    Returns:
        dict mapping cluster size (int) to probability (float); values sum to 1.

    Raises:
        ValueError: If ``luggage_dist`` is not a supported mode or the
            selected ranges contain no sizes at all.
    """
    def _sizes(bounds):
        # Bounds are inclusive on both ends.
        return list(range(bounds[0], bounds[1] + 1))

    # Only the ranges relevant to the chosen mode are evaluated.
    if luggage_dist == "small":
        weighted_groups = [(_sizes(small_rng), 0.75), (_sizes(large_rng), 0.25)]
    elif luggage_dist == "large":
        weighted_groups = [(_sizes(large_rng), 0.75), (_sizes(small_rng), 0.25)]
    elif luggage_dist == "equal":
        weighted_groups = [(_sizes(equal_rng), 0.99)]
    else:
        raise ValueError(
            f"Unknown luggage_dist {luggage_dist!r}; expected 'small', 'large' or 'equal'"
        )

    weights = {}
    for sizes, share in weighted_groups:
        # An empty range simply contributes nothing (and avoids dividing by zero).
        for size in sizes:
            weights[size] = share / len(sizes)

    if luggage_dist == "equal":
        # Rare, very large clusters; these override equal_rng if it overlaps 8-10.
        for rare_size in (8, 9, 10):
            weights[rare_size] = 0.01

    total = sum(weights.values())
    if total <= 0:
        raise ValueError("The supplied ranges do not contain any cluster sizes")

    return {size: weight / total for size, weight in weights.items()}

def generate_phone_number(country_code):
    """Generate a random phone number string for a supported country code.

    Args:
        country_code: International dialling prefix including the leading
            "+". Supported values: "+1", "+971", "+44", "+81", "+966".

    Returns:
        str: ``country_code`` immediately followed by the national digits
        (no separators). "+1" numbers are built from a 3-digit area code,
        a 3-digit exchange code and a 4-digit subscriber number; the other
        countries draw one integer from a fixed-width range.

    Raises:
        ValueError: If ``country_code`` is not supported (instead of
            silently returning ``None``).
    """
    if country_code == "+1":
        area_code = random.randint(200, 999)
        exchange_code = random.randint(200, 999)
        subscriber_number = random.randint(1000, 9999)
        return f"{country_code}{area_code}{exchange_code}{subscriber_number}"

    # Inclusive bounds of the national number; both ends of every range have
    # the same number of digits so generated numbers have a constant length.
    national_number_ranges = {
        "+971": (500000000, 599999999),
        "+44": (7000000000, 7999999999),
        "+81": (8000000000, 9999999999),  # 10 digits (lower bound was one digit short)
        "+966": (500000000, 599999999),
    }

    if country_code not in national_number_ranges:
        raise ValueError(f"Unsupported country code: {country_code!r}")

    low, high = national_number_ranges[country_code]
    return f"{country_code}{random.randint(low, high)}"



def _generate_password():
    """Return an 8-character password containing at least one digit and one symbol."""
    alphabet = string.ascii_letters + string.digits + string.punctuation
    chars = [random.choice(string.digits), random.choice(string.punctuation)]
    chars.extend(random.choices(alphabet, k=6))
    # Shuffle so the guaranteed digit/symbol are not always the first two characters.
    random.shuffle(chars)
    return "".join(chars)


def generate_dataset(passenger_capacity, luggage_dist, slhs_percent, small_rng, large_rng, equal_rng):
    """Generate a synthetic luggage dataset for three arriving flights.

    Three airlines are picked at random, each operating one flight; arrivals
    are spaced 13 minutes apart starting from a random time of day. A pool of
    bag slots (20% larger than the target) is spread over the flights, and
    consecutive slots are grouped into reservations whose sizes are drawn from
    ``generate_probabilities``. Reservations are kept whole: generation stops
    at the first reservation that would push the row count past the target of
    ``3 * total_bags_per_flight[passenger_capacity]`` bags.

    Every bag of a reservation is ticketed on the reservation's flight, so
    ``Bag_ID`` (``<airline code>-<flight number>-<sequence>``) always agrees
    with ``Flight_Number``, and phone numbers use the dialling prefix mapped
    to that flight's airline in ``airline_country_codes``.

    Args:
        passenger_capacity: Key of ``total_bags_per_flight``; also the number
            of bag slots initially reserved per flight.
        luggage_dist: Distribution mode forwarded to ``generate_probabilities``.
        slhs_percent: Probability (0-1) that a reservation is handled by SLHS.
        small_rng: Inclusive bounds of the small cluster sizes.
        large_rng: Inclusive bounds of the large cluster sizes.
        equal_rng: Inclusive bounds used by the "equal" distribution.

    Returns:
        pandas.DataFrame with one row per bag and the columns Bag_ID,
        Claim_Option, Handled_by_SLHS, Reservation_ID, Phone_Number,
        Last_Name, Password, Flight_Number, Flight_Arrival_Time,
        Cluster_Size, Delivery_Address, Estimated_Arrival_Time,
        Actual_Arrival_Time, Pickup_Gate, Luggage_Status and Pickup_Time.

    Raises:
        KeyError: If ``passenger_capacity`` is not in ``total_bags_per_flight``.
        ValueError: If the probability table is empty or contains a cluster
            size smaller than 1 (which would otherwise never terminate).
    """
    airlines = ["176", "001", "125", "131", "065"]
    iata_codes = {"176": "EK", "001": "AA", "125": "BA", "131": "JL", "065": "SV"}
    num_flights = 3

    random.shuffle(airlines)
    # Flight number -> numeric airline code. Each airline has a distinct IATA
    # prefix, so the generated flight numbers cannot collide.
    flight_airlines = {
        f"{iata_codes[airline]}{random.randint(1000, 9999)}": airline
        for airline in airlines[:num_flights]
    }
    flight_numbers = list(flight_airlines)

    # Set first flight's arrival randomly, then space others by 13 minutes
    base_time = datetime.strptime(
        f"{random.randint(0, 23):02d}:{random.randint(0, 59):02d}", "%H:%M"
    )
    flight_arrivals = {
        flight: base_time + timedelta(minutes=13 * idx)
        for idx, flight in enumerate(flight_numbers)
    }

    target_bag_count = num_flights * total_bags_per_flight[passenger_capacity]
    buffer_factor = 1.2  # 20% overgeneration so the last kept reservation is never clipped
    bag_limit = int(target_bag_count * buffer_factor)

    # One slot per bag: first `passenger_capacity` slots for each flight in
    # turn, then random flights until the buffered limit is reached.
    slot_flights = [
        flight for flight in flight_numbers for _ in range(passenger_capacity)
    ]
    while len(slot_flights) < bag_limit:
        slot_flights.append(random.choice(flight_numbers))

    # Get probabilities for luggage distribution
    probabilities = generate_probabilities(luggage_dist, small_rng, large_rng, equal_rng)
    if not probabilities:
        raise ValueError(f"No cluster sizes available for luggage_dist={luggage_dist!r}")
    cluster_sizes = list(probabilities.keys())
    cluster_weights = list(probabilities.values())
    if min(cluster_sizes) < 1:
        raise ValueError("Cluster sizes must be at least 1")

    rows = []
    reservation_counter = 1
    i = 0

    while i < len(slot_flights):
        cluster_size = random.choices(cluster_sizes, cluster_weights)[0]
        # The final cluster can only use the slots that are left.
        cluster_size = min(cluster_size, len(slot_flights) - i)

        # Keep reservations whole: stop before the one that would exceed the target.
        if len(rows) + cluster_size > target_bag_count:
            break

        # The reservation travels on the flight of its first slot.
        flight_number = slot_flights[i]
        airline_code = flight_airlines[flight_number]
        arrival = flight_arrivals[flight_number]

        # Generate password (at least 8 characters, with 1 number & 1 symbol)
        password = _generate_password()

        res_id = f"RES-{reservation_counter:05d}"

        # Phone numbers use the dialling prefix of the airline's country
        # (default to USA if the airline has no mapping).
        country_code = airline_country_codes.get(airline_code, "+1")
        num_phones = random.choices([1, 2], weights=[0.7, 0.3])[0]
        phones = [generate_phone_number(country_code) for _ in range(num_phones)]

        # Pickup happens 0-25 minutes after the flight's arrival.
        pickup_time = (arrival + timedelta(minutes=random.randint(0, 25))).strftime("%H:%M")
        arrival_label = arrival.strftime("%H:%M")

        handled = random.random() < slhs_percent
        claim = "SLHS Electronic Gates" if handled else random.choice(["Delivery", "Pickup Outside the Terminal"])
        address = fake.address() if claim == "Delivery" else None

        last_name = fake.last_name()

        for offset in range(cluster_size):
            # Sequence number is the 1-based slot position, unique across the dataset.
            bag_id = f"{airline_code}-{flight_number}-{i + offset + 1:06d}"
            rows.append({
                "Bag_ID": bag_id,
                "Claim_Option": claim,
                "Handled_by_SLHS": handled,
                "Reservation_ID": res_id,
                "Phone_Number": random.choice(phones),
                "Last_Name": last_name,
                "Password": password,
                "Flight_Number": flight_number,
                "Flight_Arrival_Time": arrival_label,
                "Cluster_Size": cluster_size,
                "Delivery_Address": address,
                "Estimated_Arrival_Time": None,
                "Actual_Arrival_Time": None,
                "Pickup_Gate": None,
                "Luggage_Status": "In-transit",
                "Pickup_Time": pickup_time
            })

        i += cluster_size
        reservation_counter += 1

    return pd.DataFrame(rows)

In [16]:
# --- Widgets ---
# Passenger capacity per flight; the options are the keys of
# total_bags_per_flight, so every choice has a configured bag count.
capacity_dropdown = widgets.Dropdown(
    options=[128, 250, 300, 350],
    value=128,
    description="Passenger Capacity:"
)

# Cluster-size distribution mode; the options are the modes handled by
# generate_probabilities.
luggage_dist_dropdown = widgets.Dropdown(
    options=["small", "large", "equal"],
    value="small",
    description="Luggage Distribution:"
)

# Passed to generate_dataset as slhs_percent: the probability (as a fraction,
# not a percentage) that a reservation is handled by SLHS.
slhs_percent_dropdown = widgets.Dropdown(
    options=[0.25, 0.5, 0.75],
    value=0.5,
    description="SLHS %:"
)

# Button that triggers dataset generation, and the output area that the
# click handler prints into.
generate_btn = widgets.Button(description="Generate Dataset")
output = widgets.Output()

In [18]:
def on_generate(b):
    """Click handler: generate the dataset, save it as CSV and summarise it.

    Reads the current values of the three dropdowns, builds the dataset with
    ``generate_dataset``, writes it to the module-level ``file_path``
    configured at the top of the notebook, then prints the cluster-size
    distribution of each flight and displays the DataFrame. Everything is
    rendered inside the ``output`` widget, which is cleared on each click.

    Args:
        b: The button instance supplied by ipywidgets (unused).
    """
    with output:
        clear_output(wait=True)

        print("Generating dataset...")
        df = generate_dataset(
            capacity_dropdown.value,
            luggage_dist_dropdown.value,
            slhs_percent_dropdown.value,
            small_rng=(1, 4),
            large_rng=(5, 7),
            equal_rng=(1, 7)
        )
        # Save to the configured output path instead of shadowing it with a
        # second, hard-coded location; the message below shows where it went.
        df.to_csv(file_path, index=False)
        print(f"Saved to: {file_path}")

        # --- Display Cluster Distribution ---
        print("\nCluster Distribution per Flight:")
        for flight in df['Flight_Number'].unique():
            flight_df = df[df['Flight_Number'] == flight]
            # Bags per reservation -> how many reservations have that size.
            cluster_counts = flight_df.groupby("Reservation_ID").size().value_counts().sort_index()

            print(f"Flight {flight}:")
            total_clusters = cluster_counts.sum()
            for size, count in cluster_counts.items():
                percentage = (count / total_clusters) * 100 if total_clusters > 0 else 0.0
                print(f"Cluster size {size}: {count} ({percentage:.1f}%)")
            print("-" * 30)

        print(f"\nTotal bags generated: {df.shape[0]} (Total Bag_ID generated)")
        display(df)

generate_btn.on_click(on_generate)

# --- Display Widgets  ---
display(
    capacity_dropdown, luggage_dist_dropdown, slhs_percent_dropdown, generate_btn, output
)

Dropdown(description='Passenger Capacity:', options=(128, 250, 300, 350), value=128)

Dropdown(description='Luggage Distribution:', options=('small', 'large', 'equal'), value='small')

Dropdown(description='SLHS %:', index=1, options=(0.25, 0.5, 0.75), value=0.5)

Button(description='Generate Dataset', style=ButtonStyle())

Output()