In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Obtain the Spark session for this notebook: getOrCreate() returns an
# already-active session if there is one, otherwise it builds a new one
# registered under the application name below.
spark = (
    SparkSession.builder
    .appName("books_and_authors")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/03 11:32:06 WARN Utils: Your hostname, Evgeniys-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.50.253 instead (on interface en0)
25/12/03 11:32:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/ekrasnikov/spark-4.0.1/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/ekrasnikov/.ivy2.5.2/cache
The jars for the packages stored in: /Users/ekrasnikov/.ivy2.5.2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-938455fe-4683-4f00-af95-946b90058106;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.367 in 

In [3]:
# Load the two source CSV files. The first row of each file is used as the
# header, and column types are inferred from the data rather than declared.
books_df = spark.read.options(header=True, inferSchema=True).csv("./books.csv")
authors_df = spark.read.options(header=True, inferSchema=True).csv("./authors.csv")

In [5]:
# Data processing

+---------+---------+----------+---------+
|author_id|     name|birth_date|  country|
+---------+---------+----------+---------+
|        1| Author_1|1960-12-31|    India|
|        2| Author_2|1965-12-31|   Canada|
|        3| Author_3|1970-12-31|      USA|
|        4| Author_4|1975-12-31|       UK|
|        5| Author_5|1980-12-31|      USA|
|        6| Author_6|1985-12-31|      USA|
|        7| Author_7|1990-12-31|      USA|
|        8| Author_8|1995-12-31|Australia|
|        9| Author_9|2000-12-31|Australia|
|       10|Author_10|2005-12-31|    India|
+---------+---------+----------+---------+



In [13]:
# Normalise the date columns to DateType.
# Spark datetime patterns are case-sensitive: "yyyy-MM-dd" means calendar
# year / month / day-of-month. The upper-case spelling "YYYY-MM-DD" is NOT
# equivalent: "DD" is day-of-year and "YYYY" is a week-based-year pattern that
# Spark 3+ does not accept for parsing, so it must not be used for ISO dates.
books_df = books_df.withColumn("publish_date", F.to_date(F.col("publish_date"), "yyyy-MM-dd"))
authors_df = authors_df.withColumn("birth_date", F.to_date(F.col("birth_date"), "yyyy-MM-dd"))

# Attach the author's attributes to every book. The inner join keeps only
# books whose author_id is present in authors_df, and only authors that have
# at least one book.
books_with_authors_df = books_df.join(authors_df, on="author_id", how="inner")

In [14]:
# Preview the joined frame (first 20 rows, long cell values truncated).
books_with_authors_df.show(n=20, truncate=True)

+---------+-------+-------+-----------+-----+------------+---------+----------+---------+
|author_id|book_id|  title|      genre|price|publish_date|     name|birth_date|  country|
+---------+-------+-------+-----------+-----+------------+---------+----------+---------+
|        2|      1| Book_1|    Mystery|73.57|  1980-12-31| Author_2|1965-12-31|   Canada|
|        1|      2| Book_2|Non-Fiction| 41.1|  1982-12-31| Author_1|1960-12-31|    India|
|       10|      3| Book_3|    Fiction|10.63|  1984-12-31|Author_10|2005-12-31|    India|
|        9|      4| Book_4|Non-Fiction|46.31|  1986-12-31| Author_9|2000-12-31|Australia|
|        7|      5| Book_5|    Science|31.13|  1988-12-31| Author_7|1990-12-31|      USA|
|        4|      6| Book_6|Non-Fiction| 83.7|  1990-12-31| Author_4|1975-12-31|       UK|
|        6|      7| Book_7|Non-Fiction|40.36|  1992-12-31| Author_6|1985-12-31|      USA|
|        2|      8| Book_8|Non-Fiction|84.48|  1994-12-31| Author_2|1965-12-31|   Canada|
|        7

In [11]:
# Top 5 authors by total revenue (sum of their books' prices)

In [30]:
# Top five authors by total revenue, where revenue is the sum of `price`
# over the author's rows in the joined books/authors frame.
#
# Grouping by both author_id and name carries the name through the
# aggregation, so no self-join back onto the joined frame is needed to
# recover it. (Assumes each author_id maps to exactly one name -- TODO confirm
# against authors.csv.)
#
# author_id is used as a secondary sort key so that authors tied on
# total_revenue are ordered -- and therefore cut off by limit(5) -- the same
# way on every run.
top_most_profit_authors_df = (
    books_with_authors_df
    .groupBy("author_id", "name")
    .agg(F.sum("price").alias("total_revenue"))
    .orderBy(F.col("total_revenue").desc(), F.col("author_id").asc())
    .limit(5)
)
top_most_profit_authors_df.show()

+---------+--------+-------------+
|author_id|    name|total_revenue|
+---------+--------+-------------+
|        2|Author_2|       231.97|
|        7|Author_7|       132.66|
|        1|Author_1|       111.86|
|        8|Author_8|       107.16|
|        5|Author_5|        88.83|
+---------+--------+-------------+



In [28]:
# Number of books in each genre

In [32]:
# Count the books in every genre (non-null book_id values per genre) and list
# the genres from the most to the least populated.
books_by_genre_df = (
    books_with_authors_df
    .groupBy("genre")
    .agg(F.count(F.col("book_id")).alias("books_count"))
    .sort(F.desc("books_count"))
)
books_by_genre_df.show()

+-----------+-----------+
|      genre|books_count|
+-----------+-----------+
|Non-Fiction|          9|
|    Science|          3|
|    Fiction|          3|
|    Fantasy|          3|
|    Mystery|          2|
+-----------+-----------+



In [33]:
# Average book price per author

In [36]:
# Average book price per author, most expensive authors first.
#
# Grouping by both author_id and name keeps the name available after the
# aggregation, which removes the need to self-join the joined frame just to
# look the name up again. (Assumes each author_id maps to exactly one name --
# TODO confirm against authors.csv.)
avg_books_price_by_author_df = (
    books_with_authors_df
    .groupBy("author_id", "name")
    .agg(F.mean("price").alias("avg_price"))
    .orderBy(F.col("avg_price").desc())
)
avg_books_price_by_author_df.show()

+---------+---------+-----------------+
|author_id|     name|        avg_price|
+---------+---------+-----------------+
|        5| Author_5|            88.83|
|        4| Author_4|             83.7|
|        2| Author_2|          57.9925|
|        9| Author_9|            46.31|
|        7| Author_7|            44.22|
|        6| Author_6|           43.965|
|        1| Author_1|37.28666666666667|
|        8| Author_8|            35.72|
|       10|Author_10|           21.165|
+---------+---------+-----------------+



In [37]:
# Books published after the year 2000

In [38]:
# Keep only the books whose publication year is strictly greater than 2000
# (books published in 2000 itself are excluded) and list them from the most
# to the least expensive.
books_published_after_year_df = (
    books_with_authors_df
    .where(F.year("publish_date") > 2000)
    .sort(F.desc("price"))
)
books_published_after_year_df.show()

+---------+-------+-------+-----------+-----+------------+--------+----------+---------+
|author_id|book_id|  title|      genre|price|publish_date|    name|birth_date|  country|
+---------+-------+-------+-----------+-----+------------+--------+----------+---------+
|        7|     20|Book_20|    Mystery|91.48|  2018-12-31|Author_7|1990-12-31|      USA|
|        5|     19|Book_19|    Science|88.83|  2016-12-31|Author_5|1980-12-31|      USA|
|        8|     15|Book_15|    Fantasy| 60.0|  2008-12-31|Author_8|1995-12-31|Australia|
|        6|     17|Book_17|    Fantasy|47.57|  2012-12-31|Author_6|1985-12-31|      USA|
|        1|     18|Book_18|Non-Fiction|43.92|  2014-12-31|Author_1|1960-12-31|    India|
|        2|     16|Book_16|    Fiction|36.22|  2010-12-31|Author_2|1965-12-31|   Canada|
|        8|     12|Book_12|Non-Fiction|31.02|  2002-12-31|Author_8|1995-12-31|Australia|
|        1|     14|Book_14|    Fiction|26.84|  2006-12-31|Author_1|1960-12-31|    India|
|        8|     13|Bo

In [None]:
# Stop the Spark session used by this notebook. Cells that use `spark` or the
# DataFrames derived from it must be run before this one.
spark.stop()