In [0]:
# %run ../00_config

In [0]:
# Progress banner: announces that the flight-booking-history CSV file is about to be created.
print("フライト予約履歴CSVファイルを作成します...")

In [0]:
import os
import random, datetime, math

from pyspark.sql import Row
from pyspark.sql.functions import col

# ============================================================
# Table: flight_booking (flight booking history) -- sample data
# ============================================================
# Sample-data generator for flight_booking (simple version)
#   * bookings per flight x departure date : 30-40
#   * flights taken per member             : 1-5
#   * total number of records              : N_ROWS

# ---------- Parameters ----------
# Number of members to prepare up front (grows automatically if not enough).
# NOTE(review): the active pool-building code in this cell does not appear to
# read N_USERS; it is kept for compatibility -- confirm before removing.
N_USERS = 400
# Number of booking records to generate.
N_ROWS = 10_000
# Upper / lower bound on how many bookings a single member may have.
MAX_PER_USER = 5
MIN_PER_USER = 1

# Airline code prefixes used when composing flight ids.
airlines = ["JL", "NH", "GK", "MM", "BC"]

# (origin, destination) airport-code pairs a booking can be assigned to.
routes = [
    ("HND", "ITM"),
    ("HND", "CTS"),
    ("HND", "SFO"),
    ("LAX", "HND"),
    ("NRT", "SIN"),
    ("KIX", "NRT"),
    ("SYD", "HND"),
    ("CDG", "NRT"),
    ("HND", "OKA"),
    ("FUK", "HND"),
]

# Departure dates are drawn from [start_date, end_date].
start_date = datetime.date(2024, 1, 1)
end_date = datetime.date(2025, 12, 31)
# Number of days between the two bounds (730 for the dates above).
date_span = (end_date - start_date).days

# ---------- Random helpers ----------
# Booking ids are "B" followed by 7 digits, i.e. numbers 0 .. 9,999,999.
_BOOKING_ID_SPACE = 10_000_000
# Numeric parts already handed out by rand_booking_id(); reset whenever this
# block is re-executed.
_issued_booking_ids = set()


def rand_date():
    """Return a random date between start_date and end_date (inclusive) as an ISO string."""
    offset_days = random.randint(0, date_span)
    return (start_date + datetime.timedelta(days=offset_days)).isoformat()


def rand_booking_id():
    """Return a booking id of the form "B" + 7 zero-padded digits, never repeating one.

    Drawing ids independently would almost certainly produce duplicates at the
    row counts used here (birthday effect over 10,000,000 possible values), so
    every issued number is remembered and collisions are redrawn.

    Raises:
        RuntimeError: if every id in the 7-digit space has already been issued.
    """
    if len(_issued_booking_ids) >= _BOOKING_ID_SPACE:
        raise RuntimeError("booking id space exhausted: all 7-digit ids have been issued")
    while True:
        number = random.randint(0, _BOOKING_ID_SPACE - 1)
        if number not in _issued_booking_ids:
            _issued_booking_ids.add(number)
            return f"B{number:07d}"


def rand_flight_id():
    """Return a random flight id: an airline code from `airlines` + a 3-digit number (001-999)."""
    return f"{random.choice(airlines)}{random.randint(1, 999):03d}"


def rand_route():
    """Return a random (origin, destination) tuple from `routes`."""
    return random.choice(routes)
def rand_fare(o, d):
    """Return a random fare for the route o -> d, rounded to a multiple of 100 (yen).

    Args:
        o: origin airport code.
        d: destination airport code.

    Returns:
        int: a fare drawn from 8,000-35,000 when both airports are in the
        domestic set below, otherwise from 30,000-180,000.
    """
    domestic_airports = {"HND", "NRT", "ITM", "CTS", "KIX", "OKA", "FUK"}
    if o in domestic_airports and d in domestic_airports:
        fare_range = (8_000, 35_000)
    else:
        fare_range = (30_000, 180_000)
    raw_fare = random.randint(*fare_range)
    # Snap to 100-yen units.
    return round(raw_fare / 100) * 100

# ---------- Build the member pool: list each member id MIN_PER_USER..MAX_PER_USER times, then shuffle ----------
user_pool = []
uid = 1
while len(user_pool) < N_ROWS:
    # Keep adding new members until the pool can cover every booking row.
    # The pool may end up slightly longer than N_ROWS (the last member's
    # repeats can overshoot).
    trips = random.randint(MIN_PER_USER, MAX_PER_USER)
    user_pool += [uid] * trips
    uid += 1

# Randomise which member ends up on which booking.
random.shuffle(user_pool)

# uid is now one past the last member id that was issued, hence uid-1.
print(f"★会員総数: {uid-1:,} 名  (プール長 = {len(user_pool):,})")

# ---------- Generate booking rows, one (flight, departure date) pair at a time ----------
rows, used_pairs = [], set()
# flight_id -> (origin, dest). A route belongs to the flight, not to the
# individual booking: drawing a route per booking would put passengers of the
# same flight on the same day onto unrelated routes.
flight_routes = {}

while len(rows) < N_ROWS:
    flight_id, flight_date = rand_flight_id(), rand_date()
    pair = (flight_id, flight_date)
    if pair in used_pairs:                      # each flight x date is generated only once
        continue
    used_pairs.add(pair)

    # Reuse the route already assigned to this flight number, or assign one now.
    if flight_id not in flight_routes:
        flight_routes[flight_id] = rand_route()
    origin, dest = flight_routes[flight_id]
    route_id = f"{origin}-{dest}"

    # 30-40 bookings per flight x date, capped so the total never exceeds N_ROWS
    # (the final flight may therefore receive fewer than 30).
    bookings_here = min(random.randint(30, 40), N_ROWS - len(rows))
    for _ in range(bookings_here):
        uid = user_pool.pop()                   # take one member out of the shuffled pool
        rows.append(Row(
            booking_id  = rand_booking_id(),
            user_id     = uid,
            flight_id   = flight_id,
            route_id    = route_id,
            flight_date = flight_date,
            fare_amount = rand_fare(origin, dest)   # fares still vary per booking
        ))

# ---------- Convert to a Spark DataFrame ----------
# Build the DataFrame from the Row list first, then cast fare_amount
# (generated as Python ints) to decimal(10,2).
df = spark.createDataFrame(rows)
df = df.withColumn("fare_amount", col("fare_amount").cast("decimal(10,2)"))

# ---------- Sanity checks ----------
# Total number of rows.
print("総行数:", df.count())

# Ten members with the most bookings (each should be <= 5).
print("会員ごとの搭乗回数上位10件（最大 5 回）")
bookings_per_user = df.groupBy("user_id").count()
bookings_per_user.orderBy(col("count").desc()).show(10)

# Peek at the data.
df.show(10, truncate=False)

# Print the schema.
df.printSchema()

# Bookings per flight and departure date: earliest date first, fullest flight first within a date.
print("各便の日付ごとの予約人数（フライトIDごと）")
bookings_per_flight_day = df.groupBy("flight_id", "flight_date").count()
bookings_per_flight_day \
  .orderBy(col("flight_date").asc(), col("count").desc()) \
  .show(truncate=False)

# Distribution of bookings per member (expected values: 1-5).
# The first aggregation's "count" column is renamed before grouping on it:
# grouping by a column named "count" and calling .count() again produces two
# columns that are both called "count", and a later orderBy("count") cannot be
# resolved unambiguously. Output columns:
#   bookings_per_user : how many bookings a member has
#   n_users           : how many members have that many bookings
print("各ユーザーの購入回数分布（1〜5 回）")
df.groupBy("user_id").count() \
  .withColumnRenamed("count", "bookings_per_user") \
  .groupBy("bookings_per_user").count() \
  .withColumnRenamed("count", "n_users") \
  .orderBy("bookings_per_user", ascending=False) \
  .show()

# # ---------- Delta テーブル保存 ----------
# df.write.format("delta")\
#   .option("comment", "フライト予約履歴")\
#   .mode("overwrite")\
#   .saveAsTable(f"{MY_CATALOG}.{MY_SCHEMA}.bz_flight_booking_2")

# ---------- CSV output ----------
# NOTE(review): MY_CATALOG / MY_SCHEMA / MY_VOLUME are not defined in this
# notebook; presumably they come from ../00_config (whose %run line at the top
# is commented out) or from a calling notebook -- confirm they are in scope.
out_path = f"/Volumes/{MY_CATALOG}/{MY_SCHEMA}/{MY_VOLUME}/flight_booking/flight_booking.csv"

# pandas does not create missing parent directories, so make sure the
# flight_booking/ folder exists before writing.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# toPandas() already collects every row to the driver, so no coalesce(1) is
# needed to end up with a single CSV file.
df.toPandas().to_csv(out_path, index=False)
print(out_path)

In [0]:
# from pyspark.sql import Row
# from pyspark.sql.functions import col
# import random, datetime, math

# # ============================================================
# # テーブル: flight_booking（フライト予約履歴）サンプル
# # ============================================================

# # ---------- 設定 ----------
# N_USERS  = 400        # 初期ユニーク会員数
# N_ROWS   = 10000      # 生成するレコード数（航空券予約数）
# MAX_PER_USER = 5      # 各会員の最大搭乗回数
# MIN_PER_USER = 1      # 各会員の最小搭乗回数

# airlines = ["JL", "NH", "GK", "MM", "BC"]  # 航空会社コード
# routes = [
#     # 片方が羽田(HND)・成田(NRT)になる区間をいくつか用意
#     ("HND", "ITM"), ("HND", "CTS"), ("HND", "SFO"),
#     ("LAX", "HND"), ("NRT", "SIN"), ("KIX", "NRT"),
#     ("SYD", "HND"), ("CDG", "NRT"), ("HND", "OKA"),
#     ("FUK", "HND")
# ]

# # ---------- 期間 ----------
# start_date = datetime.date(2024, 1, 1)
# end_date   = datetime.date(2025, 12, 31)
# date_span  = (end_date - start_date).days    # 約 730 日

# # ---------- ヘルパー関数 ----------
# def rand_date()      -> str: return (start_date + datetime.timedelta(days=random.randint(0, date_span))).isoformat()
# def rand_booking_id()-> str: return f"B{random.randint(0, 9_999_999):07d}"
# def rand_flight_id() -> str: return f"{random.choice(airlines)}{random.randint(1, 999):03d}"
# def rand_route()     -> tuple[str, str]: return random.choice(routes)
# def rand_fare(o, d)  -> int:
#     domestic = {"HND","NRT","ITM","CTS","KIX","OKA","FUK"}
#     low, high = ((8_000, 35_000) if (o in domestic and d in domestic) else (30_000, 180_000))
#     return round(random.randint(low, high) / 100) * 100   # 100 円単位

# # ---------- 会員ごとの搭乗枠を決定（1〜5 回）/ 行数が不足したら「新しい会員」を追加して枠を足す ----------
# user_quota = {uid: random.randint(MIN_PER_USER, MAX_PER_USER)
#               for uid in range(1, N_USERS + 1)}

# def total_remaining() -> int:           # まだ発券していない搭乗枠の総数
#     return sum(user_quota.values())

# next_uid = N_USERS + 1
# while total_remaining() < N_ROWS:
#     # 不足分を埋めるために新規会員を作成（1〜5 回）
#     user_quota[next_uid] = random.randint(MIN_PER_USER, MAX_PER_USER)
#     next_uid += 1

# print(f"★会員総数 = {len(user_quota):,}, 残枠 = {total_remaining():,} 行（目標 {N_ROWS:,} 行）")

# # ---------- 便×出発日ごとの予約を作成（30〜40 名/便日） ----------
# rows             = []
# used_pairs       = set()                     # 生成済み (flight_id, flight_date)
# users_with_slots = [uid for uid, cnt in user_quota.items() if cnt > 0]

# while len(rows) < N_ROWS:
#     flight_id   = rand_flight_id()
#     flight_date = rand_date()
#     pair        = (flight_id, flight_date)
#     if pair in used_pairs:
#         continue
#     used_pairs.add(pair)

#     # この便×日付に割り当てる予約数（30〜40 名）
#     to_fill = random.randint(30, 40)
#     to_fill = min(to_fill, N_ROWS - len(rows))   # 行数超過を防止

#     filled = 0
#     while filled < to_fill and users_with_slots:
#         uid = random.choice(users_with_slots)

#         # 行追加
#         origin, dest = rand_route()
#         rows.append(Row(
#             booking_id  = rand_booking_id(),
#             user_id     = uid,
#             flight_id   = flight_id,
#             route_id    = f"{origin}-{dest}",
#             flight_date = flight_date,
#             fare_amount = rand_fare(origin, dest)
#         ))

#         # 残枠を減算
#         user_quota[uid] -= 1
#         filled += 1
#         if user_quota[uid] == 0:
#             users_with_slots.remove(uid)   # もう搭乗できない

# # ---------- DataFrame 化 ----------
# df = spark.createDataFrame(rows)
# df = df.withColumn("fare_amount", col("fare_amount").cast("decimal(10,2)"))

# # ---------- 動作確認 ----------
# # 総行数を出力
# print("総行数:", df.count())

# # 各ユーザーの最大搭乗回数 (should be ≤5)
# print("会員ごとの搭乗回数上位10件（最大 5 回）")
# df.groupBy("user_id").count().orderBy("count", ascending=False).show(10)

# # データを確認
# df.show(10, truncate=False)

# # スキーマを表示
# df.printSchema()

# # 各便の日付ごとの予約人数（フライトIDごとに）
# print("各便の日付ごとの予約人数（フライトIDごと）")
# df.groupBy("flight_id", "flight_date").count() \
#   .orderBy("flight_date", "count", ascending=[True, False]) \
#   .show(truncate=False)

# # 各ユーザーの購入回数分布（1〜5 回）
# print("各ユーザーの購入回数分布（1〜5 回）")
# df.groupBy("user_id").count() \
#   .groupBy("count").count() \
#   .orderBy("count", ascending=False) \
#   .show()

# # ---------- Delta テーブル保存 ----------
# df.write.format("delta")\
#   .option("comment", "フライト予約履歴")\
#   .mode("overwrite")\
#   .saveAsTable(f"{MY_CATALOG}.{MY_SCHEMA}.bz_flight_booking_2")

# # ---------- CSV出力 ----------
# out_path = f"/Volumes/{MY_CATALOG}/{MY_SCHEMA}/{MY_VOLUME}/flight_booking/flight_booking.csv"
# df.coalesce(1).toPandas().to_csv(out_path, index=False)
# print(out_path)

In [0]:
# Completion banner: reports that the flight-booking-history CSV file has been created.
print("フライト予約履歴CSVファイルを作成しました！")