In [1]:
!pip install faker



In [2]:
import os
import json
import random
import datetime
import numpy as np
from collections import defaultdict
from pathlib import Path

# Constants
N = 100  # Number of JSON files
M_MIN = 50  # Minimum number of records per file
M_MAX = 100  # Maximum number of records per file
K_MIN = 100  # Minimum number of cities
K_MAX = 200  # Maximum number of cities
NULL_PROBABILITY = 0.005  # Probability of NULL in a flight record property
INITIAL_PASSENGERS = 10000  # Initial number of passengers per city to prevent negative values

# Generate random cities
cities = [f"City_{i}" for i in range(random.randint(K_MIN, K_MAX))]

# Create output directory
output_dir = "/tmp/flights"
os.makedirs(output_dir, exist_ok=True)

# Flight data generation
def generate_random_flight():
    """Generate a random flight record with some fields potentially set to None."""
    def maybe_null(value):
        return value if random.random() > NULL_PROBABILITY else None

    date = datetime.datetime.now().date().isoformat()
    origin_city = random.choice(cities)
    destination_city = random.choice([city for city in cities if city != origin_city])
    flight_duration_secs = maybe_null(random.randint(1800, 7200))  # 30 min to 2 hours
    passengers_on_board = maybe_null(random.randint(10, 300))

    return {
        "date": maybe_null(date),
        "origin_city": maybe_null(origin_city),
        "destination_city": maybe_null(destination_city),
        "flight_duration_secs": flight_duration_secs,
        "# of passengers on board": passengers_on_board
    }

def generate_json_files():
    """Generate JSON files with random flight records."""
    for i in range(N):
        num_records = random.randint(M_MIN, M_MAX)
        flights = [generate_random_flight() for _ in range(num_records)]
        file_name = f"flights_data_{i}.json"
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, "w") as file:
            json.dump(flights, file, indent=4)

generate_json_files()

# Flight data analysis
def analyze_files(directory):
    start_time = datetime.datetime.now()
    
    total_records = 0
    dirty_records = 0
    destination_city_stats = defaultdict(list)
    passenger_stats = defaultdict(lambda: INITIAL_PASSENGERS)
    
    for file_path in Path(directory).rglob('*.json'):
        with open(file_path, 'r') as file:
            flights = json.load(file)
            for flight in flights:
                if None in flight.values():
                    dirty_records += 1
                    continue
                total_records += 1

                destination_city = flight['destination_city']
                flight_duration = flight['flight_duration_secs']
                passengers = flight['# of passengers on board']

                if flight_duration is not None:
                    destination_city_stats[destination_city].append(flight_duration)

                if passengers is not None:
                    origin_city = flight['origin_city']
                    # Ensure passenger changes don't result in negative values
                    if passenger_stats[origin_city] - passengers >= 0:
                        passenger_stats[origin_city] -= passengers
                    passenger_stats[destination_city] += passengers

    top_25_destinations = sorted(destination_city_stats, key=lambda x: len(destination_city_stats[x]), reverse=True)[:25]
    avg_flight_duration = {city: np.mean(destination_city_stats[city]) for city in top_25_destinations}
    p95_flight_duration = {city: np.percentile(destination_city_stats[city], 95) for city in top_25_destinations}

    max_passengers_arrived = max(passenger_stats.items(), key=lambda x: x[1])
    max_passengers_left = min(passenger_stats.items(), key=lambda x: x[1])

    duration = datetime.datetime.now() - start_time

    result = {
        "total_records": total_records,
        "dirty_records": dirty_records,
        "run_duration": duration.total_seconds(),
        "avg_flight_duration": avg_flight_duration,
        "p95_flight_duration": p95_flight_duration,
        "max_passengers_arrived": max_passengers_arrived,
        "max_passengers_left": max_passengers_left
    }
    
    return result

analysis_result = analyze_files(output_dir)
print(json.dumps(analysis_result, indent=4))


{
    "total_records": 53375,
    "dirty_records": 420,
    "run_duration": 0.1578,
    "avg_flight_duration": {
        "South John": 4614.80981595092,
        "New Anthonyberg": 4618.308641975309,
        "Wilsonborough": 4671.267080745341,
        "East Sarah": 4317.691823899371,
        "South Sarah": 4392.789473684211,
        "South Julie": 4555.4697986577185,
        "Davidside": 4548.872483221477,
        "Port Joseph": 4351.884353741497,
        "Jamesmouth": 4669.0625,
        "North Joseph": 4540.909090909091,
        "Port Phillip": 4210.7464788732395,
        "West Michael": 4635.642857142857,
        "Heathershire": 4809.86231884058,
        "Johnport": 4688.4525547445255,
        "Emilyland": 4365.488888888889,
        "East Christopher": 4409.104477611941,
        "North Jeffrey": 4510.723880597015,
        "Jonathanmouth": 4574.6171875,
        "Westview": 4243.569105691057,
        "Lewisfort": 4611.367346938776,
        "Jenniferchester": 4457.118279569892,
        "