In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create (or reuse) the SparkSession. The spark-avro package is requested
# through `spark.jars.packages` when the session is built.
spark = SparkSession.builder.appName("Read").config(
    "spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.5"
).getOrCreate()

# Load the CSV file: the first line is treated as the header and column
# types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv("web_server_logs.csv")

# Print the inferred schema of the DataFrame.
df.printSchema()

# Preview the first 5 rows.
df.show(n=5)

In [None]:
print("=== Top 10 active IP addresses: ===")
# Count rows per value of the `ip` column, rank the groups by that count
# (largest first) and keep the ten largest.
top_10_ip = (
    df.groupBy("ip")
    .agg(F.count("*").alias("request_count"))
    .sort(F.col("request_count").desc())
    .limit(10)
)
top_10_ip.show(truncate=False)

In [None]:
print("=== Request count by HTTP method: ===")
# Count rows per value of the `method` column and list the groups from the
# most to the least frequent.
method_counts = (
    df.groupBy("method")
    .agg(F.count("*").alias("method_count"))
    .sort(F.col("method_count").desc())
)
method_counts.show(truncate=False)

In [None]:
print("=== Number of 404 response codes: ===")
# Keep only the rows whose `response_code` equals 404, then count them.
error_404_count = df.where(df["response_code"] == 404).count()
print(f"Number of 404 response codes: {error_404_count}")

In [None]:
print("=== Total response size by day: ===")
# Bucket rows by calendar day rather than by the raw timestamp: grouping on
# the full timestamp yields one row per distinct instant, not one per day.
# `F.to_date` drops the time-of-day part, and the grouping key is exposed
# under the name `date`, so the result columns are `date` and
# `total_response_size`.
# NOTE(review): assumes `timestamp` was inferred as a timestamp (or is a
# string in a format `to_date` can parse by default); otherwise `date` will be
# null and an explicit format argument is needed — confirm against the CSV.
date_size_sum = (
    df.groupBy(F.to_date(F.col("timestamp")).alias("date"))
    .agg(F.sum("response_size").alias("total_response_size"))
    # Order by the column that is actually present in the result.
    .orderBy("date")
)
date_size_sum.show(truncate=False)