In [1]:
from pyspark.sql import SparkSession, functions as F

In [2]:
spark = SparkSession.builder.appName("analyze_logs").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/03 12:29:56 WARN Utils: Your hostname, Evgeniys-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.50.253 instead (on interface en0)
25/12/03 12:29:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/ekrasnikov/spark-4.0.1/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/ekrasnikov/.ivy2.5.2/cache
The jars for the packages stored in: /Users/ekrasnikov/.ivy2.5.2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-78a2b2e9-e7c1-4271-98f8-2c56b434d378;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.367 in 

In [3]:
logs_df = spark.read.csv("./web_server_logs.csv", header=True, inferSchema=True)

In [18]:
# 10 самых активных IP адресов(по количеству запросов)
top_most_active_ip_df = logs_df\
    .groupBy(F.col("ip"))\
    .agg(F.count("url").alias("request_count"))\
    .orderBy(F.col("request_count").desc())\
    .limit(10)

# Количество запросов по каждому методу
requests_by_http_method_df = logs_df\
    .groupBy(F.col("method"))\
    .agg(F.count("ip").alias("method_count"))\
    .orderBy(F.col("method_count"))

# Количество запросов с ответом 404
not_found_response_count = logs_df.filter(logs_df["response_code"] == 404).count()

# Сумма размера ответов по дате
total_response_size_df = logs_df\
    .select(F.to_date(F.col("timestamp"), "YYYY-MM-DD").alias("date"), F.col("response_size"))\
    .groupBy(F.col("date"))\
    .agg(F.sum(F.col("response_size")).alias("total_response_size"))\
    .orderBy(F.col("date").asc())

In [19]:
print("Top 10 active IP addresses:")
top_most_active_ip_df.show()

print("Request count by HTTP method:")
requests_by_http_method_df.show()

print(f"Number of 404 response codes: {not_found_response_count}")

print("Total response size by day:")
total_response_size_df.show()

Top 10 active IP addresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
| 41.216.176.155|            1|
| 104.202.235.12|            1|
| 171.213.134.54|            1|
|  149.60.230.19|            1|
|  167.42.235.59|            1|
| 170.115.82.194|            1|
| 197.136.208.66|            1|
| 71.233.209.211|            1|
|101.196.249.202|            1|
|175.167.219.201|            1|
+---------------+-------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|DELETE|       24866|
|   PUT|       24912|
|   GET|       25050|
|  POST|       25172|
+------+------------+

Number of 404 response codes: 24809
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|            1504797|
|2025-01-02|            1566745|
|2025-01-03|            1276193|
|2025-01-04|            1523680|
|2025-01-05|         

In [20]:
spark.stop()