In [2]:
!pip install faker
import csv
from faker import Faker
import random

fake = Faker()

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])

    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)

        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Collecting faker
  Downloading Faker-30.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.1.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-30.1.0
Сгенерировано 100000 записей и сохранено в web_server_logs.csv


In [3]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum

spark = SparkSession.builder.appName("WebServerLogAnalysis").getOrCreate()

df = spark.read.csv("web_server_logs.csv", header=True, inferSchema=True)

# 1. Топ 10 самых активных IP
top_ips = df.groupBy("ip").count().orderBy(col("count").desc()).limit(10)
print("10 самых активных IP:")
top_ips.show()

# 2. Количество запросов по HTTP-методам
method_counts = df.groupBy("method").count()
print("\nКоличество запросов по методам:")
method_counts.show()

# 3. Количество запросов с кодом ответа 404
error_404_count = df.filter(col("response_code") == 404).count()
print("\nКоличество запросов с кодом ответа 404:")
print(error_404_count)

# 4. Суммарный размер ответов по датам
df = df.withColumn("date", col("timestamp").substr(1, 10))
date_response_size = df.groupBy("date").agg(sum("response_size").alias("total_response_size")).orderBy("date")
print("\nСуммарный размер ответов по датам:")
date_response_size.show();

spark.stop()


Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=4ec0cbf8f416c8367326dcc32f0d0a01e90100fd5032cdd43fd7e4965ce2876a
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3
10 самых активных IP:
+--------------+-----+
|            ip|count|
+--------------+-----+
| 191.88.238.17|    2|
| 213.210.92.52|    1|
|   75.253.52.7|    1|
| 60.242.212.54|    1|
|175.213.69.158|    1|
| 125.163.45.50|    1|
|145.23.202.127|    1|
|    5.51.203.7|    1|
