In [15]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [16]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("weather")
    .getOrCreate()
)

In [52]:
# Load the weather observations: the first CSV row is the header and the
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./weather_data.csv")
)

In [None]:
# Обработка данных

In [55]:
# Parse the observation date. Spark datetime patterns are case-sensitive:
# lowercase "yyyy" is the calendar year and "dd" the day of month, whereas
# uppercase "Y" / "D" stand for week-based year / day of year and, depending
# on the Spark version, are either rejected or parsed into wrong dates.
df = df.withColumn("date", F.to_date(F.col("date"), "yyyy-MM-dd"))

# Measurement columns whose missing values are filled in below.
columns = ["temperature", "precipitation", "wind_speed"]

# Turn empty strings into NULL *before* averaging, so that blanks never
# reach the mean computation and are treated exactly like missing values.
for column in columns:
    df = df.withColumn(
        column,
        F.when(F.col(column).cast("string") == "", None).otherwise(F.col(column)),
    )

# Per-station mean of every measurement column (NULLs are skipped by mean),
# attached to each row as a temporary "<column>_mean" column.
agg_exprs = [F.mean(column).alias(f"{column}_mean") for column in columns]
station_means = df.groupBy("station_id").agg(*agg_exprs)
df = df.join(station_means, on="station_id", how="left")

# Replace missing measurements with the mean of the same station, then drop
# the temporary helper columns again.
for column in columns:
    df = df.withColumn(
        column, F.coalesce(F.col(column), F.col(f"{column}_mean"))
    ).drop(f"{column}_mean")

In [None]:
# Топ 5 самых жарких дней

In [62]:
# Five warmest readings: sort by temperature, highest first, and keep 5 rows.
top_hottest_days_df = (
    df.select("date", "temperature")
    .orderBy(F.desc("temperature"))
    .limit(5)
)
top_hottest_days_df.show()

+----------+------------------+
|      date|       temperature|
+----------+------------------+
|2021-08-20|39.982828249354846|
|2023-12-02| 39.96797489293784|
|2022-03-28|  39.8246894248997|
|2019-02-11| 39.76737697836647|
|2020-06-10| 39.69147838355929|
+----------+------------------+



In [None]:
# Станция с наибольшим количеством осадков за последний год

In [80]:
# Latest calendar year that occurs in the "date" column.
last_year_df = df.agg(F.max(F.year("date")).alias("max_year"))
last_year = last_year_df.first()["max_year"]

# Sum precipitation per station within that year and keep the single
# station with the largest total.
station_with_most_precipitation = (
    df.where(F.year("date") == last_year)
    .groupBy("station_id")
    .agg(F.sum("precipitation").alias("total_precipitation"))
    .sort(F.desc("total_precipitation"))
    .limit(1)
)
station_with_most_precipitation.show()

+----------+-------------------+
|station_id|total_precipitation|
+----------+-------------------+
| station_5|  642.9302626767898|
+----------+-------------------+



In [81]:
# Средняя температура по месяцам, за все время наблюдений

In [91]:
# Mean temperature per calendar month (1-12), pooled over all years and
# stations, listed in month order.
avg_temperatures_by_month = (
    df.groupBy(F.month("date").alias("month"))
    .agg(F.mean("temperature").alias("avg_temperature"))
    .orderBy("month")
)
avg_temperatures_by_month.show()

+-----+------------------+
|month|   avg_temperature|
+-----+------------------+
|    1|11.356518462550754|
|    2| 9.067229891101926|
|    3| 7.244080205633994|
|    4|12.024529009744693|
|    5| 9.902883346912718|
|    6|13.421092297254138|
|    7|6.1857183016954576|
|    8|  10.9678002814186|
|    9| 9.596744236573942|
|   10|  9.09884344821895|
|   11| 7.265889994697494|
|   12|11.218592100674337|
+-----+------------------+



In [None]:
# Stop the Spark session at the end of the notebook.
spark.stop()