## Как запустить:

```bash
docker run -p 8888:8888 dimajix/jupyter-spark
```

Затем открыть Jupyter со Spark по адресу http://localhost:8888

Загрузить в корневую папку Jupyter этот ноутбук и файл Hotel.csv из репозитория.

In [67]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
import os

# Obtain a Spark session for the application named "Test".
spark = SparkSession.builder.appName("Test").getOrCreate()

# Smoke test: build a ten-row DataFrame (ids 0..9) and print it to confirm
# that the session is usable.
test_df = spark.range(0, 10)
test_df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



## Сделаем все необходимые таблицы:

In [37]:
# logs_hotel: load Hotel.csv from the current working directory. The first
# row is used as the header and column types are inferred from the data.
logs_hotel = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(f"file://{os.getcwd()}/Hotel.csv")
)

# Save the data as table "logs_hotel" (overwrite mode), then preview the
# first five rows and the inferred schema.
logs_hotel.write.saveAsTable("logs_hotel", mode="overwrite")
logs_hotel.show(5)
logs_hotel.printSchema()





+--------+--------+----------+--------------+-----------+------------+-----------------+-----------+---------+----+-----+----+--------------+--------------+----------------------+------------------------------+--------------+----------------+------------+
|      ID|n_adults|n_children|weekend_nights|week_nights|   meal_plan|car_parking_space|  room_type|lead_time|year|month|date|market_segment|repeated_guest|previous_cancellations|previous_bookings_not_canceled|avg_room_price|special_requests|      status|
+--------+--------+----------+--------------+-----------+------------+-----------------+-----------+---------+----+-----+----+--------------+--------------+----------------------+------------------------------+--------------+----------------+------------+
|INN00001|       2|         0|             1|          2| Meal Plan 1|                0|Room_Type 1|      224|2017|   10|   2|       Offline|             0|                     0|                             0|          65.0|       

In [38]:
# calendar: one row per day for 731 consecutive days starting at 2017-01-01.
# NOTE: calendar_dt is stored as a *string* formatted "yyyy-dd-MM" (day before
# month), so any consumer has to parse it back with that same pattern.
calendar = (
    spark.range(0, 731)
    .withColumn("calendar_dt", F.expr("date_add(to_date('2017-01-01'), id)"))
    .withColumn("calendar_dt", F.date_format(F.col("calendar_dt"), "yyyy-dd-MM"))
    .drop("id")
)
calendar.createOrReplaceTempView("calendar")

calendar.show(32)


+-----------+
|calendar_dt|
+-----------+
| 2017-01-01|
| 2017-02-01|
| 2017-03-01|
| 2017-04-01|
| 2017-05-01|
| 2017-06-01|
| 2017-07-01|
| 2017-08-01|
| 2017-09-01|
| 2017-10-01|
| 2017-11-01|
| 2017-12-01|
| 2017-13-01|
| 2017-14-01|
| 2017-15-01|
| 2017-16-01|
| 2017-17-01|
| 2017-18-01|
| 2017-19-01|
| 2017-20-01|
| 2017-21-01|
| 2017-22-01|
| 2017-23-01|
| 2017-24-01|
| 2017-25-01|
| 2017-26-01|
| 2017-27-01|
| 2017-28-01|
| 2017-29-01|
| 2017-30-01|
| 2017-31-01|
| 2017-01-02|
+-----------+
only showing top 32 rows



## ЗАДАНИЕ 1 

In [None]:
# Task 1: average stay length (in nights) and number of bookings per
# year/month, restricted to bookings with status "Not_Canceled".

# Stay length = weekend nights + week nights.
logs_hotel_with_nights = logs_hotel.withColumn(
    "total_nights", F.col("weekend_nights") + F.col("week_nights")
)

confirmed_bookings = logs_hotel_with_nights.where(
    F.col("status") == "Not_Canceled"
)

avg_nights_by_month = (
    confirmed_bookings
    .groupBy("year", "month")
    .agg(
        F.round(F.avg("total_nights"), 2).alias("avg_nights"),
        F.count("*").alias("booking_count"),
    )
    .sort("year", "month")
)

avg_nights_by_month.show()

+----+-----+----------+-------------+
|year|month|avg_nights|booking_count|
+----+-----+----------+-------------+
|2017|    7|      3.02|          120|
|2017|    8|      2.72|          829|
|2017|    9|      2.66|         1467|
|2017|   10|       2.7|         1611|
|2017|   11|      2.72|          620|
|2017|   12|      3.04|          906|
|2018|    1|      2.74|          990|
|2018|    2|      2.69|         1274|
|2018|    3|      3.04|         1658|
|2018|    4|      2.92|         1741|
|2018|    5|      2.81|         1650|
|2018|    6|       2.6|         1912|
|2018|    7|      3.19|         1486|
|2018|    8|      3.15|         1496|
|2018|    9|      2.79|         1606|
|2018|   10|      2.89|         1826|
|2018|   11|      2.98|         1485|
|2018|   12|      3.25|         1713|
+----+-----+----------+-------------+



## ЗАДАНИЕ 2

In [45]:
# Task 2: the three months of 2018 with the highest cancellation rate.
bookings_2018 = logs_hotel.where(F.col("year") == 2018)

# Per month: the number of all bookings, the number of bookings whose status
# is anything other than "Not_Canceled", and their ratio as a percentage
# rounded to two decimals.
monthly_stats = (
    bookings_2018
    .groupBy("month")
    .agg(
        F.count("*").alias("total_bookings"),
        F.sum(
            F.when(F.col("status") != "Not_Canceled", 1).otherwise(0)
        ).alias("cancelled_bookings"),
    )
    .withColumn(
        "cancellation_rate_percent",
        F.round(F.col("cancelled_bookings") / F.col("total_bookings") * 100, 2),
    )
)

top_3_months = monthly_stats.sort(
    F.col("cancellation_rate_percent").desc()
).limit(3)

# Print the result.
top_3_months.show()

+-----+--------------+------------------+-------------------------+
|month|total_bookings|cancelled_bookings|cancellation_rate_percent|
+-----+--------------+------------------+-------------------------+
|    8|          2799|              1303|                    46.55|
|   10|          3404|              1578|                    46.36|
|    9|          2962|              1356|                    45.78|
+-----+--------------+------------------+-------------------------+



## ЗАДАНИЕ 3

In [52]:
# Task 3: average lead_time per month for bookings with status
# "Not_Canceled". Grouping is by month only, so the same month of different
# years is pooled together.
confirmed_bookings = (
    logs_hotel
    .where(F.col("status") == "Not_Canceled")
    .withColumn("lead_time", F.col("lead_time").cast("integer"))
)

monthly_lead_time = (
    confirmed_bookings
    .groupBy("month")
    .agg(F.round(F.avg("lead_time"), 2).alias("avg_lead_time_days"))
    .sort("month")
)

monthly_lead_time.show()


+-----+------------------+
|month|avg_lead_time_days|
+-----+------------------+
|    1|             34.87|
|    2|             30.53|
|    3|             43.19|
|    4|             62.49|
|    5|             60.99|
|    6|             70.64|
|    7|             90.16|
|    8|             65.97|
|    9|             57.78|
|   10|             65.11|
|   11|             41.02|
|   12|             61.79|
+-----+------------------+



## ЗАДАНИЕ 4

In [53]:
# Task 4: average room price per year/month, pivoted so that every market
# segment becomes its own column. Only "Not_Canceled" bookings are used.
confirmed_bookings = (
    logs_hotel
    .where(F.col("status") == "Not_Canceled")
    .withColumn("avg_room_price", F.col("avg_room_price").cast("double"))
)

# Long form: one row per (year, month, market_segment).
monthly_revenue = (
    confirmed_bookings
    .groupBy("year", "month", "market_segment")
    .agg(F.round(F.avg("avg_room_price"), 2).alias("avg_revenue"))
)

# Wide form: monthly_revenue holds a single value per (year, month, segment),
# so first() simply carries that value into the pivoted cell.
pivot_table = (
    monthly_revenue
    .groupBy("year", "month")
    .pivot("market_segment")
    .agg(F.first("avg_revenue"))
    .sort("year", "month")
)

pivot_table.show()


+----+-----+--------+-------------+---------+-------+------+
|year|month|Aviation|Complementary|Corporate|Offline|Online|
+----+-----+--------+-------------+---------+-------+------+
|2017|    7|    null|         7.47|     65.0|  88.19| 65.55|
|2017|    8|    null|         0.32|    66.92|  97.16| 97.86|
|2017|    9|    null|         7.37|    86.03|   96.8|113.33|
|2017|   10|    null|         0.55|    84.48|  89.08|101.97|
|2017|   11|    null|         4.94|    68.08|  66.18| 84.75|
|2017|   12|    null|         0.05|    69.03|  69.95| 87.31|
|2018|    1|    null|         1.73|    68.88|  73.97| 80.19|
|2018|    2|    88.0|         0.75|    74.98|  75.07|  87.5|
|2018|    3|    89.0|         16.2|    74.05|  72.64| 96.81|
|2018|    4|   97.57|          0.0|     80.0|  86.83|105.49|
|2018|    5|   101.0|          0.0|    100.9|  94.97|123.91|
|2018|    6|    95.0|          0.0|    84.83| 103.01| 119.3|
|2018|    7|    79.0|         2.69|    84.86|  87.78|123.71|
|2018|    8|    null|   

## ЗАДАНИЕ 5

In [54]:
# Task 5: top five IDs among repeated guests by revenue, together with each
# one's share of the total revenue from repeated guests.
# NOTE: no status filter is applied in this cell, so bookings of every status
# contribute to the revenue figures.

# Revenue of one booking = average room price * total nights stayed.
regular_guests = (
    logs_hotel
    .where(F.col("repeated_guest") == 1)
    .withColumn("total_nights", F.col("weekend_nights") + F.col("week_nights"))
    .withColumn("booking_revenue", F.col("avg_room_price") * F.col("total_nights"))
)

guest_revenue = regular_guests.groupBy("ID").agg(
    F.round(F.sum("booking_revenue"), 2).alias("total_revenue")
)

top_5_guests = guest_revenue.sort(F.col("total_revenue").desc()).limit(5)

# Pull the grand total to the driver as a plain Python number so it can be
# used as a constant divisor below.
total_regular_revenue = regular_guests.agg(
    F.round(F.sum("booking_revenue"), 2).alias("total_regular_revenue")
).first()["total_regular_revenue"]

top_5_with_share = top_5_guests.withColumn(
    "revenue_share",
    F.round(F.col("total_revenue") / total_regular_revenue * 100, 2),
)

top_5_with_share.show()


+--------+-------------+-------------+
|      ID|total_revenue|revenue_share|
+--------+-------------+-------------+
|INN19235|       1754.4|         1.51|
|INN05222|        690.0|         0.59|
|INN14189|        665.0|         0.57|
|INN09923|        660.0|         0.57|
|INN25479|        650.0|         0.56|
+--------+-------------+-------------+



## ЗАДАНИЕ 6

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

# Task 6: hotel load per calendar day. For every day in the calendar, sum the
# guests of all "Not_Canceled" bookings staying on that day and express the
# sum as a percentage of 400.

# calendar_dt is a string in "yyyy-dd-MM" order; parse it back into a date
# with that same pattern.
calendar_clean = calendar.withColumn(
    "calendar_dt", F.to_date("calendar_dt", "yyyy-dd-MM")
)

confirmed_bookings = (
    logs_hotel
    .filter(F.col("status") == "Not_Canceled")
    .withColumn("weekend_nights", F.col("weekend_nights").cast("integer"))
    .withColumn("week_nights", F.col("week_nights").cast("integer"))
    # A missing night count is treated as zero nights.
    .withColumn(
        "total_nights",
        F.coalesce(F.col("weekend_nights"), F.lit(0))
        + F.coalesce(F.col("week_nights"), F.lit(0)),
    )
    .withColumn("total_guests", F.col("n_adults") + F.col("n_children"))
    # Check-in date. year/month/date are not zero-padded ("2017-7-2"), so a
    # strict "yyyy-MM-dd" pattern is unsuitable: the Spark 3 datetime parser
    # rejects single-digit months/days for it. to_date() without a pattern
    # follows the cast-to-date rules, which accept yyyy-[m]m-[d]d; values that
    # are not valid calendar dates become NULL and never match the join below.
    .withColumn(
        "date_parsed",
        F.to_date(
            F.concat(
                F.col("year").cast("string"),
                F.lit("-"),
                F.col("month").cast("string"),
                F.lit("-"),
                F.col("date").cast("string"),
            )
        ),
    )
    # Check-out date = check-in date + number of nights.
    .withColumn("check_out_date", F.expr("date_add(date_parsed, total_nights)"))
)

# A booking occupies the half-open interval [date_parsed, check_out_date).
# The left join keeps calendar days on which nobody is staying.
daily_occupancy = calendar_clean.join(
    confirmed_bookings,
    (calendar_clean.calendar_dt >= F.col("date_parsed"))
    & (calendar_clean.calendar_dt < F.col("check_out_date")),
    "left",
)

# Days without bookings yield one row with NULL total_guests; coalesce turns
# it into 0, so the sum is never NULL and no extra fillna is required.
occupancy_by_day = daily_occupancy.groupBy("calendar_dt").agg(
    F.sum(F.coalesce(F.col("total_guests"), F.lit(0))).alias("guest_count")
)

# NOTE(review): guest_count counts people (adults + children) while 400 is
# used as the capacity; the result can exceed 100%. Confirm whether 400 is a
# guest capacity or a room count (in which case bookings, not guests, should
# be counted).
occupancy_with_pct = occupancy_by_day.withColumn(
    "occupancy_pct",
    F.round((F.col("guest_count") / 400) * 100, 2),
)

# Most recent calendar day first.
result = occupancy_with_pct.orderBy(F.desc("calendar_dt"))

result.show()


+-----------+-----------+-------------+
|calendar_dt|guest_count|occupancy_pct|
+-----------+-----------+-------------+
| 2019-01-01|        403|       100.75|
| 2018-12-31|        562|        140.5|
| 2018-12-30|        572|        143.0|
| 2018-12-29|        542|        135.5|
| 2018-12-28|        507|       126.75|
| 2018-12-27|        552|        138.0|
| 2018-12-26|        422|        105.5|
| 2018-12-25|        397|        99.25|
| 2018-12-24|        373|        93.25|
| 2018-12-23|        341|        85.25|
| 2018-12-22|        282|         70.5|
| 2018-12-21|        247|        61.75|
| 2018-12-20|        240|         60.0|
| 2018-12-19|        228|         57.0|
| 2018-12-18|        258|         64.5|
| 2018-12-17|        274|         68.5|
| 2018-12-16|        254|         63.5|
| 2018-12-15|        170|         42.5|
| 2018-12-14|        155|        38.75|
| 2018-12-13|        153|        38.25|
+-----------+-----------+-------------+
only showing top 20 rows

