In [16]:
from pyspark.sql import SparkSession

# Build (or reuse) the session for this notebook: application name "Spark DF",
# with YARN as the master.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Spark DF")
    .getOrCreate()
)

In [94]:
# Display the session object created above.
spark

### Но при этом ID приложения нельзя получить напрямую из SparkSession — он доступен через SparkContext

In [18]:
# SparkSession has no `applicationId` attribute, so this lookup is expected to fail.
# The error is the point of the demonstration: catch it and print it so that
# "Restart & Run All" does not stop at this cell.
try:
    spark.applicationId
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'SparkSession' object has no attribute 'applicationId'

In [19]:
# The application ID is exposed by the SparkContext, which is reachable from the
# session. Binding `sc` explicitly keeps the notebook runnable on kernels that do
# not pre-define it.
sc = spark.sparkContext
sc.applicationId

'application_1750716381279_1632'

In [20]:
# Display the `sc` context object.
sc

---
### Загрузим данные

In [21]:
%%time
# Read the Yelp review data as JSON. No schema is passed, so Spark derives it from the files.
df = (spark.read.format("json").load("/data/yelp/review"))

                                                                                

CPU times: user 33.1 ms, sys: 21.9 ms, total: 54.9 ms
Wall time: 16.5 s


In [22]:
# First row of the DataFrame, returned as a Row object.
df.head()

Row(business_id='XQfwVwDr-v0ZS3_CbbE5Xw', cool=0, date='2018-07-07 22:09:11', funny=0, review_id='KU_O5udG6zpxOg-VcAEodg', stars=3.0, text="If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", useful=0, user_id='mh_-eMZ6K5RLWhZyISBhwA')

#### Можно посмотреть структуру

In [10]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [13]:
# Show one record vertically (one field per line) without truncating long values.
df.show(1, vertical=True, truncate=False)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 business_id | XQfwVwDr-v0ZS3_CbbE5Xw                                                                                                                                                                                                                                                                                                                                                                                                                                               

### Аналог pandas.DataFrame.describe()

In [15]:
# Summary statistics of the DataFrame, shown vertically and untruncated (at most 10 rows).
# NOTE(review): the recorded traceback below mentions `.cache`, which this cell does not
# call — the saved output looks stale; re-run the cell to confirm.
df.summary().show(10, truncate=False, vertical=True)

[Stage 10:>                                                         (0 + 1) / 1]

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

AttributeError: 'NoneType' object has no attribute 'cache'

---
### Переход от RDD к DataFrames

In [23]:
import json

# Read the review data as lines of text and parse every line from JSON.
rdd = sc.textFile("/data/yelp/review").map(json.loads)

#### Превращаем rdd в таблицу

In [27]:
# Build a DataFrame from the RDD of parsed records. No explicit schema is passed,
# so column names and types are inferred from the data.
df_rdd = spark.createDataFrame(rdd)

In [28]:
# Preview the first row (long values are truncated in the default tabular view).
df_rdd.show(1)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|   0|2018-07-07 22:09:11|    0|KU_O5udG6zpxOg-Vc...|  3.0|If you decide to ...|     0|mh_-eMZ6K5RLWhZyI...|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
only showing top 1 row



In [30]:
# Print the inferred schema to compare it with the one obtained from the JSON reader.
df_rdd.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



#### Аналог pandas `df.columns.to_list()`

In [31]:
# List of column names taken from the schema.
df_rdd.schema.fieldNames()

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id']

#### Слайс колонок

In [36]:
# Select a subset of columns by name and show 5 records vertically, untruncated.
df_rdd.select(["business_id", "stars", "text"]).show(5, truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 business_id | XQfwVwDr-v0ZS3_CbbE5Xw                                                                                                                 

#### Вывод списка колонок через select

In [37]:
# Select every column by unpacking the DataFrame's own list of field names
# (taken from df_rdd itself rather than from `df`, which merely happens to share the schema).
df_rdd.select(*df_rdd.schema.fieldNames())

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: double, text: string, useful: bigint, user_id: string]

---
### Способы фильтрации

In [38]:
# Filter rows with a SQL expression given as a string.
df_rdd.where('stars == 3').show(5)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|   0|2018-07-07 22:09:11|    0|KU_O5udG6zpxOg-Vc...|  3.0|If you decide to ...|     0|mh_-eMZ6K5RLWhZyI...|
|YjUWPpI6HXG530lwP...|   0|2014-02-05 20:30:30|    0|saUsX_uimxRlCVr67...|  3.0|Family diner. Had...|     0|8g_iMtfSiwikVnbP2...|
|B5XSoSG3SfvQGtKEG...|   0|2016-03-30 22:46:33|    1|ZKvDG2sBvHVdF5oBN...|  3.0|This easter inste...|     1|wSTuiTk-sKNdcFypr...|
|gebiRewfieSdtt17P...|   0|2016-07-25 07:31:06|    0|pUycOfUwM8vqX7KjR...|  3.0|Had a party of 6 ...|     0|59MxRhNVhU9MYndMk...|
|ut6fi2W2YaipNOqvi...|   0|2014-11-12 14:12:20|    0|ZVvhc3Go7v5I8XTiV...|  3.0|Upland is 

In [39]:
# Filter rows with a Column expression built through attribute access.
df_rdd.filter(df_rdd.stars > 2).show(3)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|   0|2018-07-07 22:09:11|    0|KU_O5udG6zpxOg-Vc...|  3.0|If you decide to ...|     0|mh_-eMZ6K5RLWhZyI...|
|7ATYjTIgM3jUlt4UM...|   1|2012-01-03 15:28:18|    0|BiTunyQ73aT9WBnpR...|  5.0|I've taken a lot ...|     1|OyoGAe7OKpv6SyGZT...|
|YjUWPpI6HXG530lwP...|   0|2014-02-05 20:30:30|    0|saUsX_uimxRlCVr67...|  3.0|Family diner. Had...|     0|8g_iMtfSiwikVnbP2...|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
only showing top 3 rows



In [41]:
# Several conditions combined inside one SQL expression string.
df_rdd.filter("stars > 4 AND text LIKE '%amazing%'").show(5)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|7ATYjTIgM3jUlt4UM...|   1|2012-01-03 15:28:18|    0|BiTunyQ73aT9WBnpR...|  5.0|I've taken a lot ...|     1|OyoGAe7OKpv6SyGZT...|
|LHSTtnW3YHCeUkRDG...|   0|2015-08-07 02:29:16|    0|_ZeMknuYdlQcUqng_...|  5.0|Amazingly amazing...|     2|yfFzsLmaWF2d4Sr0U...|
|SZU9c8V2GuREDN5Kg...|   0|2016-05-31 02:14:54|    0|4zopEEPqfwm-c_FNp...|  5.0|We were a bit wea...|     0|JYYYKt6TdVA4ng9lL...|
|EpREWeEpmR8f1qLHz...|   0|2011-11-30 06:58:36|    0|-up4mW6WdqzGrRh7t...|  5.0|After living in t...|     0|xbybLiQockAzC4xAl...|
|5Ce3lZksYVkCbrihq...|   0|2014-07-25 17:56:26|    0|ymhbOMW63B_vGaRFR...|  5.0|I just sta

In [42]:
# The same filter written with Column expressions: conditions are joined with `&`,
# and each one is wrapped in parentheses because `&` binds tighter than `>`.
df_rdd.filter((df_rdd.stars > 4) & (df_rdd.text.like("%amazing%"))).show(5)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|7ATYjTIgM3jUlt4UM...|   1|2012-01-03 15:28:18|    0|BiTunyQ73aT9WBnpR...|  5.0|I've taken a lot ...|     1|OyoGAe7OKpv6SyGZT...|
|LHSTtnW3YHCeUkRDG...|   0|2015-08-07 02:29:16|    0|_ZeMknuYdlQcUqng_...|  5.0|Amazingly amazing...|     2|yfFzsLmaWF2d4Sr0U...|
|SZU9c8V2GuREDN5Kg...|   0|2016-05-31 02:14:54|    0|4zopEEPqfwm-c_FNp...|  5.0|We were a bit wea...|     0|JYYYKt6TdVA4ng9lL...|
|EpREWeEpmR8f1qLHz...|   0|2011-11-30 06:58:36|    0|-up4mW6WdqzGrRh7t...|  5.0|After living in t...|     0|xbybLiQockAzC4xAl...|
|5Ce3lZksYVkCbrihq...|   0|2014-07-25 17:56:26|    0|ymhbOMW63B_vGaRFR...|  5.0|I just sta

In [43]:
# Select Column objects and rename one of them in the output with alias().
df_rdd.select(df_rdd.business_id, df_rdd.text.alias("review")).show(5)

+--------------------+--------------------+
|         business_id|              review|
+--------------------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|If you decide to ...|
|7ATYjTIgM3jUlt4UM...|I've taken a lot ...|
|YjUWPpI6HXG530lwP...|Family diner. Had...|
|kxX2SOes4o-D3ZQBk...|Wow!  Yummy, diff...|
|e4Vwtrqf-wpJfwesg...|Cute interior and...|
+--------------------+--------------------+
only showing top 5 rows



---

### К колонке можно обращаться по строковому имени через `f.col()`. Это удобно, когда название столбца сложное или хранится в переменной!

In [95]:
import pyspark.sql.functions as f
# f.col("name") refers to a column by its string name instead of through a DataFrame attribute.
df_rdd.select(df_rdd.business_id, f.col("text").alias("reviews")).show(5)

+--------------------+--------------------+
|         business_id|             reviews|
+--------------------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|If you decide to ...|
|7ATYjTIgM3jUlt4UM...|I've taken a lot ...|
|YjUWPpI6HXG530lwP...|Family diner. Had...|
|kxX2SOes4o-D3ZQBk...|Wow!  Yummy, diff...|
|e4Vwtrqf-wpJfwesg...|Cute interior and...|
+--------------------+--------------------+
only showing top 5 rows



In [54]:
# When the column name is held in a variable
column_name = "text"
df.select(df.business_id, f.col(column_name).alias("review")).show(10)

+--------------------+--------------------+
|         business_id|              review|
+--------------------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|If you decide to ...|
|7ATYjTIgM3jUlt4UM...|I've taken a lot ...|
|YjUWPpI6HXG530lwP...|Family diner. Had...|
|kxX2SOes4o-D3ZQBk...|Wow!  Yummy, diff...|
|e4Vwtrqf-wpJfwesg...|Cute interior and...|
|04UD14gamNjLY0IDY...|I am a long term ...|
|gmjsEdUsKpj9Xxu6p...|Loved this tour! ...|
|LHSTtnW3YHCeUkRDG...|Amazingly amazing...|
|B5XSoSG3SfvQGtKEG...|This easter inste...|
|gebiRewfieSdtt17P...|Had a party of 6 ...|
+--------------------+--------------------+
only showing top 10 rows



In [None]:
# When the column name contains spaces or special characters
# (illustrative only: the review data has no such columns, so the line stays commented out)
#df.select(f.col("business id"), f.col("review text")).show()

In [None]:
# Convenient for chaining transformations on a column (illustrative, left commented out)
# df.select(
#     f.col("stars").cast("integer"),
#     f.col("text").substr(1, 100).alias("preview"),
#     f.upper(f.col("business_id")).alias("business_id_upper")
# ).show()

In [None]:
# When writing a function that has to work with any column (illustrative, left commented out)
# def analyze_column(df, column_name):
#     return df.select(
#         f.col(column_name),
#         f.length(f.col(column_name)).alias("length")
#     )

# analyze_column(df, "text").show()

In [None]:
# When the DataFrame has a column whose name clashes with a DataFrame method
# (illustrative only: the review data has no such columns, so the code stays commented out)
# df.select(
#     f.col("count"),  # the "count" column, not the .count() method
#     f.col("sum")     # the "sum" column
# ).show()

___

In [47]:
# Indexing the DataFrame with several columns (a Column object and a name here)
# returns a DataFrame with just those columns.
df_rdd[df_rdd.business_id, "text"].show(5)

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|If you decide to ...|
|7ATYjTIgM3jUlt4UM...|I've taken a lot ...|
|YjUWPpI6HXG530lwP...|Family diner. Had...|
|kxX2SOes4o-D3ZQBk...|Wow!  Yummy, diff...|
|e4Vwtrqf-wpJfwesg...|Cute interior and...|
+--------------------+--------------------+
only showing top 5 rows



In [48]:
# The same indexing form using column names only.
df_rdd["business_id", "text"].show(5)

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|If you decide to ...|
|7ATYjTIgM3jUlt4UM...|I've taken a lot ...|
|YjUWPpI6HXG530lwP...|Family diner. Had...|
|kxX2SOes4o-D3ZQBk...|Wow!  Yummy, diff...|
|e4Vwtrqf-wpJfwesg...|Cute interior and...|
+--------------------+--------------------+
only showing top 5 rows



---
### Подключим механизм SQL! Для этого нам необходимо зарегистрировать таблицу

In [50]:
# Register df_rdd as a temporary view so it can be queried with SQL.
# The view name "df" is only a SQL identifier; it is unrelated to the Python variable `df`.
df_rdd.createOrReplaceTempView("df")

In [51]:
# SQL text: number of reviews (cnt) per business_id in the "df" view.
query_str = '''
SELECT COUNT(*) cnt, business_id
FROM df
GROUP BY business_id
'''

In [52]:
# Run the query and show the first 5 result rows.
spark.sql(query_str).show(5)



+---+--------------------+
|cnt|         business_id|
+---+--------------------+
| 47|h_6ioAoKNLi01kPho...|
| 53|2OEL_uLdTg90PiFVh...|
|227|BnibjYoTYefJXQ_ZV...|
| 94|L5rH_ypwqJcBByVac...|
|508|P2XJbQZmf1zvWp9L_...|
+---+--------------------+
only showing top 5 rows



                                                                                

___
## *Объединение таблиц в HIVE*

In [55]:
# Load the Yelp business data (JSON) into a DataFrame.
business = spark.read.json("/data/yelp/business")

25/12/08 16:10:05 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [60]:
# Inspect two business records vertically, without truncation.
business.show(2, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 address      | 1616 Chapala St, Ste 2                                                                                                                                                                                                                     
 attributes   | {null, null, null, null, null, null, null, null, null, null, null, True, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null} 
 business_id  | Pns2l4eNsfO8kk83dixA6A                                                                                                                                                                                                              

In [59]:
# Number of rows in the business DataFrame.
business.count()

150346

In [61]:
# Inspect two review records for comparison with the business records.
df_rdd.show(2, vertical=True, truncate=False)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 business_id | XQfwVwDr-v0ZS3_CbbE5Xw                                                                                                                 

In [62]:
%%time
# Inner join of reviews with businesses on business_id. Only the query plan is built here
# (no data is processed yet), which is why the cell returns in milliseconds.
# Both inputs carry a `stars` column, so the joined result contains two columns with that name.
business_review = df_rdd.join(business, on="business_id", how="inner")

CPU times: user 223 µs, sys: 4.08 ms, total: 4.3 ms
Wall time: 48.2 ms


#### Посмотрим на план запроса

In [63]:
# Print the physical plan chosen for the join.
business_review.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [business_id#1854, cool#1855L, date#1856, funny#1857L, review_id#1858, stars#1859, text#1860, useful#1861L, user_id#1862, address#2244, attributes#2245, categories#2247, city#2248, hours#2249, is_open#2250L, latitude#2251, longitude#2252, name#2253, postal_code#2254, review_count#2255L, stars#2256, state#2257]
   +- SortMergeJoin [business_id#1854], [business_id#2246], Inner
      :- Sort [business_id#1854 ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(business_id#1854, 200), ENSURE_REQUIREMENTS, [plan_id=468]
      :     +- Filter isnotnull(business_id#1854)
      :        +- Scan ExistingRDD[business_id#1854,cool#1855L,date#1856,funny#1857L,review_id#1858,stars#1859,text#1860,useful#1861L,user_id#1862]
      +- Sort [business_id#2246 ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(business_id#2246, 200), ENSURE_REQUIREMENTS, [plan_id=469]
            +- Filter isnotnull(business_id#2

In [64]:
# Execute the join and show two joined records vertically, without truncation.
business_review.show(2, vertical=True, truncate=False)



-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 business_id  | ---kPU91CF4Lq2-WlRu9Lw                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
 cool     

                                                                                

Посмотрим на количество партиций в RDD. Под капотом Spark SQL использует RDD

In [66]:
# Number of partitions of the RDD underlying the joined DataFrame.
business_review.rdd.getNumPartitions() 

67

___
#### Супермегабыстрый способ с броадкастом

Поменяем способ JOIN на BroadcastHashJoin

Дроп столбца

In [67]:
# drop() returns a new DataFrame without the column; the result is only displayed here,
# `business` itself is not reassigned.
business.drop("stars")

DataFrame[address: string, attributes: struct<AcceptsInsurance:string,AgesAllowed:string,Alcohol:string,Ambience:string,BYOB:string,BYOBCorkage:string,BestNights:string,BikeParking:string,BusinessAcceptsBitcoin:string,BusinessAcceptsCreditCards:string,BusinessParking:string,ByAppointmentOnly:string,Caters:string,CoatCheck:string,Corkage:string,DietaryRestrictions:string,DogsAllowed:string,DriveThru:string,GoodForDancing:string,GoodForKids:string,GoodForMeal:string,HairSpecializesIn:string,HappyHour:string,HasTV:string,Music:string,NoiseLevel:string,Open24Hours:string,OutdoorSeating:string,RestaurantsAttire:string,RestaurantsCounterService:string,RestaurantsDelivery:string,RestaurantsGoodForGroups:string,RestaurantsPriceRange2:string,RestaurantsReservations:string,RestaurantsTableService:string,RestaurantsTakeOut:string,Smoking:string,WheelchairAccessible:string,WiFi:string>, business_id: string, categories: string, city: string, hours: struct<Friday:string,Monday:string,Saturday:string

In [68]:
# NOTE(review): the threshold is given without a unit. Per the Spark docs this setting is a
# size in bytes, so 10 presumably means 10 bytes, which effectively switches automatic
# broadcasting off for the rest of the session — confirm this is intended (vs. e.g. 10MB).
spark.sql("SET spark.sql.autoBroadcastJoinThreshold = 10")
# The explicit broadcast() hint marks the business side (with its own `stars` column dropped)
# for broadcasting; the plan printed below shows which join strategy was picked.
business_review_new = df_rdd.join(f.broadcast(business.drop("stars")), on="business_id", how="inner")
business_review_new.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [business_id#1854, cool#1855L, date#1856, funny#1857L, review_id#1858, stars#1859, text#1860, useful#1861L, user_id#1862, address#2244, attributes#2245, categories#2247, city#2248, hours#2249, is_open#2250L, latitude#2251, longitude#2252, name#2253, postal_code#2254, review_count#2255L, state#2257]
   +- BroadcastHashJoin [business_id#1854], [business_id#2246], Inner, BuildRight, false
      :- Filter isnotnull(business_id#1854)
      :  +- Scan ExistingRDD[business_id#1854,cool#1855L,date#1856,funny#1857L,review_id#1858,stars#1859,text#1860,useful#1861L,user_id#1862]
      +- BroadcastExchange HashedRelationBroadcastMode(List(input[2, string, false]),false), [plan_id=702]
         +- Filter isnotnull(business_id#2246)
            +- FileScan json [address#2244,attributes#2245,business_id#2246,categories#2247,city#2248,hours#2249,is_open#2250L,latitude#2251,longitude#2252,name#2253,postal_code#2254,review_count#2255L,st

In [69]:
# Partition count of the broadcast-join result, for comparison with the earlier join.
business_review_new.rdd.getNumPartitions() 

20

In [70]:
# Execute the broadcast join and show two joined records vertically, without truncation.
business_review_new.show(2, vertical=True, truncate=False)

[Stage 50:>                                                         (0 + 1) / 1]

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 business_id  | XQfwVwDr-v0ZS3_CbbE5Xw                                                                                                               

                                                                                

#### Посчитаем статистики

In [71]:
# Average review rating per business (the business-level `stars` column was dropped before the join).
business_review_new.groupby("business_id").agg(f.avg("stars")).show(5)



+--------------------+------------------+
|         business_id|        avg(stars)|
+--------------------+------------------+
|WKMJwqnfZKsAae75R...|             3.825|
|sk2lZI4zmuGAccd3D...| 4.757575757575758|
|zJErbOQMKX-MwHs_u...|2.9279279279279278|
|NQhyMw8SOU1HB-V9X...|              2.62|
|neL2xrin-uMJl5ABK...| 3.611111111111111|
+--------------------+------------------+
only showing top 5 rows



                                                                                

Посчитаем количество review по разным регионам

In [73]:
# Number of reviews per state. groupby().count() already yields exactly one row per
# state, so no extra distinct() pass is needed.
business_review_new.groupby('state').count().show(5)



+-----+------+
|state| count|
+-----+------+
|   AZ|431708|
|   LA|761673|
|   NJ|260897|
|   NV|430678|
|   ID|157572|
+-----+------+
only showing top 5 rows



                                                                                

In [75]:
# Number of reviews per city. The grouping itself guarantees one row per city,
# so the result needs no distinct().
result = business_review_new.groupby('city').count()

In [76]:
# Trigger the computation and show the first 10 (city, count) rows.
result.show(10)



+------------------+-----+
|              city|count|
+------------------+-----+
|      Harleysville| 2640|
|       Merion Park|    7|
|        Westampton|  822|
|       Springfield|11686|
|         Frontenac| 1560|
|       Marcus Hook|  500|
|     Pleasant View|  822|
|           Truckee|  198|
|   King Of Prussia| 8470|
|Indian Rocks Beach|10639|
+------------------+-----+
only showing top 10 rows



                                                                                

Запишем результаты в таблицу

In [82]:
# Write the per-city counts as tab-separated text, overwriting any previous output at this path.
# No header option is set for the write.
result.write.csv("business_review_counts.tsv", sep='\t', mode='overwrite')

                                                                                

Посмотрим содержимое записей

In [88]:
# Read the written data back to check it. No header or schema is supplied, so the columns
# come back with the default names _c0 and _c1.
df_check = spark.read.csv("business_review_counts.tsv", sep='\t')
df_check.show(5)

+------------+-----+
|         _c0|  _c1|
+------------+-----+
|Harleysville| 2640|
| Merion Park|    7|
|  Westampton|  822|
| Springfield|11686|
|   Frontenac| 1560|
+------------+-----+
only showing top 5 rows



---
### Оконные функции

In [None]:
# Reference SQL for the window-function example; the DataFrame API version of this
# query is built in the following cells. It is kept in a string so that the cell is
# valid Python. "df" is the temporary view name used by the SQL examples in this notebook.
# Compared with the original sketch, the missing comma after `useful` and the missing
# FROM clause have been added.
window_query_sql = """
SELECT business_id, user_id, useful,
       LEAD(useful) OVER (PARTITION BY business_id ORDER BY useful DESC),
       RANK() OVER (PARTITION BY business_id ORDER BY useful DESC)
FROM df
LIMIT 10
"""

In [103]:
from pyspark.sql import Window

# One window definition shared by both functions: rows are partitioned by business
# and ordered by `useful` in descending order.
window_spec = Window.partitionBy("business_id").orderBy(f.col("useful").desc())

# Rank of the review inside its business partition.
review_rank = f.rank().over(window_spec).alias("rank")
# `useful` value of the next row in the same partition.
following_useful = f.lead("useful").over(window_spec).alias("next_useful")

df.select("business_id", "user_id", "useful", review_rank, following_useful).show(5)

[Stage 82:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------+----+-----------+
|         business_id|             user_id|useful|rank|next_useful|
+--------------------+--------------------+------+----+-----------+
|---kPU91CF4Lq2-Wl...|i48cHEyRBl5g9_npY...|     4|   1|          3|
|---kPU91CF4Lq2-Wl...|Q-ia5eY9smWBTwYOZ...|     3|   2|          2|
|---kPU91CF4Lq2-Wl...|qrCkKrEwQ-q9m1iWS...|     2|   3|          1|
|---kPU91CF4Lq2-Wl...|V8oYXtc0hMuYzG5Hf...|     1|   4|          1|
|---kPU91CF4Lq2-Wl...|UmQDlaIjLiPBZ7M6U...|     1|   4|          1|
+--------------------+--------------------+------+----+-----------+
only showing top 5 rows



                                                                                

In [104]:
# Same projection, followed by where() to keep only rows ranked 3 or better.
ranked_reviews = df.select(
    "business_id",
    "user_id",
    "useful",
    f.rank().over(window_spec).alias("rank"),
    f.lead("useful").over(window_spec).alias("next_useful"),
)
ranked_reviews.where("rank <= 3").show(5)

[Stage 85:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------+----+-----------+
|         business_id|             user_id|useful|rank|next_useful|
+--------------------+--------------------+------+----+-----------+
|---kPU91CF4Lq2-Wl...|i48cHEyRBl5g9_npY...|     4|   1|          3|
|---kPU91CF4Lq2-Wl...|Q-ia5eY9smWBTwYOZ...|     3|   2|          2|
|---kPU91CF4Lq2-Wl...|qrCkKrEwQ-q9m1iWS...|     2|   3|          1|
|--9osgUCSDUWUkoTL...|g2eQ3vyzupM3WdtXl...|     6|   1|          3|
|--9osgUCSDUWUkoTL...|iZPO_Rd5BU-Y72KKi...|     3|   2|          2|
+--------------------+--------------------+------+----+-----------+
only showing top 5 rows



                                                                                

### Итоги
1. Spark SQL использует обертку в виде SparkSession
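2. Spark автоматически выводит схему при чтении JSON; при чтении CSV схему нужно задать явно либо включить `inferSchema=True` — иначе все столбцы читаются как строки (а без `header=True` получают имена `_c0`, `_c1`, …).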
3. Broadcast — аналог Distributed Cache.
4. Можно писать запросы в SQL в Spark
5. Можно оптимизировать Join через BroadcastHashJoin