In [87]:
import sys

from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
# Create (or reuse) the Spark session for this notebook; the master is set to YARN.
spark = (
    SparkSession.builder
    .appName('ikhlebushkin_spark_df_task1')
    .master('yarn')
    .getOrCreate()
)

In [175]:
# Business records: loaded from JSON with every inferred column kept.
business = spark.read.json('/data/yelp/business')

# Reviews: only the business key and the star rating are kept.
reviews = (
    spark.read
    .json('/data/yelp/review')
    .select('business_id', 'stars')
)

                                                                                

In [177]:
# Count the reviews rated below 3 stars for each business, then attach the
# business columns. The inner join drops any business_id that is missing
# from the business table.
negative_counted = (
    reviews
    .filter(f.col('stars') < 3)
    .groupBy('business_id')
    .agg(f.count('stars').alias('stars_count'))
    .join(business, on='business_id', how='inner')
)
    

In [178]:
# Rank businesses inside each city by their negative-review count, highest first.
w = Window.partitionBy('city').orderBy(f.desc('stars_count'))

# Keep ranks 1..10 per city. rank() assigns the same rank to tied counts, so a
# city may contribute more than 10 rows when a tie straddles the cutoff.
negative_tops = (
    negative_counted
    .withColumn('Rank', f.rank().over(w))
    .filter(f.col('Rank') <= 10)
)

In [100]:
# Preview the result as tab-separated business_id / city / negative-review count.
# Only these three fields are printed, so project them before collect(): this
# avoids shipping every other column of the joined business frame to the driver.
# NOTE(review): this still prints every row of the result; consider .show() or
# .limit() if only a sample is needed.
for row in negative_tops.select('business_id', 'city', 'stars_count').collect():
    print(f'{row.business_id}\t{row.city}\t{row.stars_count}')

[Stage 35:>                                                         (0 + 1) / 1]

JmzNw0WCPmZPZdq5nx9brg	Abington	10
C2vKa-eZFVBrfFGYXxvXFw	Abington	8
F-eHPbdh9bl8aeYDRws4BQ	Abington	7
ETZK5PuCsVfQ7ElXckDTPw	Abington	5
5N-93oMmm0MUt8HvyOot5A	Abington	4
TeVMFL7ZmZy8145Qquc2Tg	Abington	3
Pw77mNz6cso9quMp2NwaiA	Abington	3
nXx6nOAqC0DqVCUObuUsMA	Abington	2
zXnFTbBeqWP8A3YvmFme-Q	Abington	2
jUyoND2GoaRRVS_t_HN6SQ	Abington	2
bFln2Z17KfFIcEz4TRaHjw	Abington Township	2
k0hlBqXX-Bt0vf1op7Jr1w	Affton	2
2jw__SxySiZcwsRlyyQiGQ	Affton	1
o-mPNuDkMsO7uxOZ2H8w9w	Affton	1
ugeb5xNeePZrpirbrzWDWg	Affton	1
y3AWxkmUws6cS_BvQ5Ryiw	Affton	1
tjvuAPVe8buQCDvtOsy3CA	Affton	1
Mt7WCoJzgb3rQqPie5nP7A	Affton	1
k2t4FhKQc42DF2_PlKlynQ	Affton	1
fsXxYOQ80doP_QeM_ufgbA	Affton	1
mpf3x-BjTdTEA3yCZrAYPw	Affton	1
PTQNzk4yOzXb95oWan8hgw	Afton	1
auBRFCUG20A_-4Mxot0dgQ	Aldan	1
Z7L0c2jCp5bgUHk2_povfA	Aldan	1
8RQkvk7ktoUwWn7K7lppNA	Aldan	1
oMhG3Ao8BohT7j6D95sDhA	Aldan	1
nChGjLmmNEOjPOWNSOKoHA	Alloway	1
1WRsPhkW7-KZh_M10iVzDw	Almonesson	1
xP5o6PVU3tWOixLJTIdVVA	Alton	5
ljxNT9p0y7YMPx0fcNBGig	Alton	3
WQYiPwm4iH

                                                                                

In [None]:
# Persist the per-city top list as tab-separated text, replacing any previous run.
# negative_tops still carries every column of the joined business frame plus the
# helper 'Rank' column, so project to the three result columns (the same ones the
# preview cell prints) before writing.
# NOTE(review): the business schema is inferred from JSON and presumably contains
# nested struct columns; Spark's CSV writer rejects those, so writing the
# unprojected frame would likely fail -- confirm against business.printSchema().
(negative_tops
 .select('business_id', 'city', 'stars_count')
 .write
 .mode('overwrite')
 .csv('spark_df_task1', sep='\t'))

In [180]:
# Stop the Spark session at the end of the notebook.
spark.stop()