In [1]:
import csv

# Sample dataset: one header row followed by one tuple per book.
columns = ("title", "author", "genre", "sales", "year")
books = [
    ("1984", "George Orwell", "Science Fiction", 5000, 1949),
    ("The Lord of the Rings", "J.R.R. Tolkien", "Fantasy", 3000, 1954),
    ("To Kill a Mockingbird", "Harper Lee", "Southern Gothic", 4000, 1960),
    ("The Catcher in the Rye", "J.D. Salinger", "Novel", 2000, 1951),
    ("The Great Gatsby", "F. Scott Fitzgerald", "Novel", 4500, 1925),
]
data = [columns, *books]

# Write the rows to data.csv; newline="" leaves line-ending handling to the
# csv module so no extra blank lines appear on any platform.
with open("data.csv", mode="w", newline="", encoding="UTF-8") as csv_file:
    csv.writer(csv_file).writerows(data)

In [2]:
from pyspark.sql.functions import col, sum
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally on all available cores.
session_builder = SparkSession.builder.master("local[*]").appName("Seminar 5")
spark = session_builder.getOrCreate()

# Task: read the data from the CSV file using Spark.
# header=True tells the reader to take the column names from the first line.
df = spark.read.csv("data.csv", header=True)
df.show()

+--------------------+-------------------+---------------+-----+----+
|               title|             author|          genre|sales|year|
+--------------------+-------------------+---------------+-----+----+
|                1984|      George Orwell|Science Fiction| 5000|1949|
|The Lord of the R...|     J.R.R. Tolkien|        Fantasy| 3000|1954|
|To Kill a Mocking...|         Harper Lee|Southern Gothic| 4000|1960|
|The Catcher in th...|      J.D. Salinger|          Novel| 2000|1951|
|    The Great Gatsby|F. Scott Fitzgerald|          Novel| 4500|1925|
+--------------------+-------------------+---------------+-----+----+



In [3]:
# Task: keep only the books whose sales exceed 3000 copies.
# The CSV was read without a schema, so the numeric columns are cast to int
# explicitly before they are compared or aggregated.
df = (
    df
    .withColumn("sales", col("sales").cast("int"))
    .withColumn("year", col("year").cast("int"))
)
filtered_sales = df.filter(df["sales"] > 3000)
filtered_sales.show()

+--------------------+-------------------+---------------+-----+----+
|               title|             author|          genre|sales|year|
+--------------------+-------------------+---------------+-----+----+
|                1984|      George Orwell|Science Fiction| 5000|1949|
|To Kill a Mocking...|         Harper Lee|Southern Gothic| 4000|1960|
|    The Great Gatsby|F. Scott Fitzgerald|          Novel| 4500|1925|
+--------------------+-------------------+---------------+-----+----+



In [4]:
# Task: group the data by genre and compute the total sales for each genre.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported earlier in the
# notebook), not the Python builtin.
sales_total = sum("sales").alias("count_sales")
grouped_genre = df.groupBy("genre").agg(sales_total)
grouped_genre.show()

+---------------+-----------+
|          genre|count_sales|
+---------------+-----------+
|Southern Gothic|       4000|
|          Novel|       6500|
|        Fantasy|       3000|
|Science Fiction|       5000|
+---------------+-----------+



In [5]:
# Task: sort the data by total sales in descending order.
grouped_genre.orderBy("count_sales", ascending=False).show()

+---------------+-----------+
|          genre|count_sales|
+---------------+-----------+
|          Novel|       6500|
|Science Fiction|       5000|
|Southern Gothic|       4000|
|        Fantasy|       3000|
+---------------+-----------+



In [6]:
# Stop the Spark session now that every task in the notebook has been run.
spark.stop()