In [1]:
import pyspark
import os
from itertools import chain

from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
os.environ["PYSPARK_PYTHON"]="python3"

sc = SparkContext(master="local[*]", appName= "readJSON")
spark = SparkSession \
    .builder \
    .appName("PySpark") \
    .getOrCreate()

In [2]:
df = spark.read.json("gp.jsonl")
rdd = spark.read.json("gp.jsonl").rdd

# Подсчет колличества приложений автора

In [3]:
%%time
result = rdd.map(lambda x: (x.author, x.author)).groupByKey().mapValues(lambda x: len(x)).sortBy(lambda x: x[1], ascending=False).collect()


CPU times: user 33.5 ms, sys: 13.7 ms, total: 47.3 ms
Wall time: 2.17 s


In [4]:
%%time
result = rdd.map(lambda x: (x.author, x.author)).groupByKey().mapValues(lambda x: len(x)).collect()
# без сортировки

CPU times: user 22 ms, sys: 436 µs, total: 22.4 ms
Wall time: 727 ms


In [5]:
%%time
result = df.groupBy(F.col('author')).count().sort('count', ascending=False).show()

+--------------------+-----+
|              author|count|
+--------------------+-----+
|         iniCall.com|   18|
|              ZT.art|   13|
|CrowdCompass by C...|   13|
|Subsplash Consulting|   11|
|      +HOME by Ateam|   10|
|Difference Games LLC|    9|
|      Big Fish Games|    9|
|            iConnect|    9|
|          MobilePlus|    8|
|         Games2Jolly|    8|
|           Dict.land|    8|
|        Magzter Inc.|    8|
|GO Keyboard Dev Team|    8|
|       Bede Products|    7|
| SK techx for themes|    7|
|Glosbe Parfieniuk...|    7|
|         MobileGames|    7|
|         eChurch App|    7|
|         Nobex Radio|    7|
|         Goodia Inc.|    6|
+--------------------+-----+
only showing top 20 rows

CPU times: user 6.14 ms, sys: 0 ns, total: 6.14 ms
Wall time: 2.48 s


In [6]:
%%time
# без сортировки
result = df.groupBy(F.col('author')).count().show()

+--------------------+-----+
|              author|count|
+--------------------+-----+
|           Peppercon|    1|
|        LightBringer|    1|
|           coulibaly|    1|
|       YDigitalMedia|    1|
|Association of Ch...|    1|
| Android Apps Market|    1|
|          Kraft&Werk|    1|
|         JBR Techies|    1|
|          97 Display|    2|
|Axiata Group Berh...|    1|
|          Foyo Games|    1|
|         Yippee Labs|    1|
|        Gleb Kolyada|    1|
|             bokonon|    1|
|    App iStudio, LLC|    1|
|        Join Visions|    1|
|       rev developer|    1|
|       Streamography|    1|
|Trickster Cards, ...|    1|
|PiXiL Internation...|    1|
+--------------------+-----+
only showing top 20 rows

CPU times: user 3.06 ms, sys: 336 µs, total: 3.4 ms
Wall time: 404 ms


# Распределение игр по жанрам

In [7]:
%%time
result = rdd.map(lambda x: (x.genre, x.genre)).groupByKey().mapValues(lambda x: len(x)).sortBy(lambda x: x[1], ascending=False).collect()


CPU times: user 27.9 ms, sys: 7.07 ms, total: 35 ms
Wall time: 1.17 s


In [8]:
%%time
result = rdd.map(lambda x: (x.genre, x.genre)).groupByKey().mapValues(lambda x: len(x)).collect()
# без сортировки

CPU times: user 14.8 ms, sys: 1.85 ms, total: 16.7 ms
Wall time: 682 ms


In [9]:
%%time
result = df.groupBy(F.col('genre')).count().sort('count', ascending=False).show()

+-----------------+-----+
|            genre|count|
+-----------------+-----+
|    Entertainment| 1640|
|        Education| 1559|
|  Personalisation| 1326|
|            Tools| 1274|
|        Lifestyle| 1230|
|    Music & Audio| 1078|
|Books & Reference| 1053|
|         Business|  956|
|           Puzzle|  832|
|           Casual|  734|
|           Arcade|  690|
|   Travel & Local|  604|
| Health & Fitness|  578|
|     Productivity|  547|
|      Photography|  517|
|           Sports|  483|
| News & Magazines|  461|
|    Communication|  372|
|           Social|  361|
|           Action|  347|
+-----------------+-----+
only showing top 20 rows

CPU times: user 3.33 ms, sys: 2.56 ms, total: 5.89 ms
Wall time: 819 ms


In [10]:
%%time
# без сортировки
result = df.groupBy(F.col('genre')).count().show()

+----------------+-----+
|           genre|count|
+----------------+-----+
|   Music & Audio| 1078|
|       Education| 1559|
|          Trivia|  153|
| Auto & Vehicles|   39|
|   Entertainment| 1640|
|       Adventure|  278|
|          Arcade|  690|
|          Sports|  483|
|  Travel & Local|  604|
|    Food & Drink|  121|
|    Role Playing|   61|
|         Finance|  284|
|          Racing|  146|
|           Tools| 1274|
|     Educational|  288|
|          Comics|   50|
|          Social|  361|
|Libraries & Demo|   69|
|        Shopping|  292|
|Health & Fitness|  578|
+----------------+-----+
only showing top 20 rows

CPU times: user 1.56 ms, sys: 2.24 ms, total: 3.8 ms
Wall time: 472 ms


# Количество ревью на каждый жанр

In [11]:
%%time
result = rdd.map(lambda x: (x.genre, int('0'+x.review_number.replace(',', '')))).groupByKey().mapValues(sum).sortBy(lambda x: x[1], ascending=False).collect()


CPU times: user 40 ms, sys: 5.62 ms, total: 45.6 ms
Wall time: 837 ms


In [12]:
%%time
# без сортировки
result = rdd.map(lambda x: (x.genre, int('0'+x.review_number.replace(',', '')))).groupByKey().mapValues(sum).collect()


CPU times: user 19.1 ms, sys: 1.67 ms, total: 20.8 ms
Wall time: 549 ms


In [13]:
%%time
result = df.select(F.col('genre'),('0'+F.regexp_replace('review_number', ',', '')).alias('review_number')).fillna(0).groupBy('genre').sum().sort('sum(review_number)', ascending=False)
result.show()

+--------------------+------------------+
|               genre|sum(review_number)|
+--------------------+------------------+
|               Tools|        1.042381E7|
|     Personalisation|         6050071.0|
|          Simulation|         2764567.0|
|       Entertainment|         2404774.0|
|   Books & Reference|          987578.0|
|              Casual|          885296.0|
|        Role Playing|          817249.0|
|           Lifestyle|          723174.0|
|              Sports|          709586.0|
|              Puzzle|          673517.0|
|              Action|          669793.0|
|              Arcade|          649696.0|
|            Strategy|          637511.0|
|    Health & Fitness|          474781.0|
|                Word|          436084.0|
|             Weather|          382491.0|
|Video Players & E...|          368956.0|
|           Education|          354611.0|
|       Music & Audio|          333168.0|
|             Finance|          329823.0|
+--------------------+------------

In [14]:
%%time
# без сортировки
result = df.select(F.col('genre'),('0'+F.regexp_replace('review_number', ',', '')).alias('review_number')).fillna(0).groupBy('genre').sum()
result.show()

+----------------+------------------+
|           genre|sum(review_number)|
+----------------+------------------+
|   Music & Audio|          333168.0|
|       Education|          354611.0|
|          Trivia|           25963.0|
| Auto & Vehicles|            2572.0|
|   Entertainment|         2404774.0|
|       Adventure|          219124.0|
|          Arcade|          649696.0|
|          Sports|          709586.0|
|  Travel & Local|           41345.0|
|    Food & Drink|           12604.0|
|    Role Playing|          817249.0|
|         Finance|          329823.0|
|          Racing|          267956.0|
|           Tools|        1.042381E7|
|     Educational|          140062.0|
|          Comics|           62308.0|
|          Social|          205132.0|
|Libraries & Demo|            2394.0|
|        Shopping|           66382.0|
|Health & Fitness|          474781.0|
+----------------+------------------+
only showing top 20 rows

CPU times: user 7.72 ms, sys: 2.4 ms, total: 10.1 ms
Wall time

## Средняя оценка жанра

In [15]:
%%time
result = rdd.map(lambda x: (x.genre, float("0" + x.rating_value))).groupByKey().mapValues(lambda x: sum(x)/len(x)).sortBy(lambda x: x[1], ascending=False).collect()

CPU times: user 42.5 ms, sys: 16.5 ms, total: 59 ms
Wall time: 993 ms


In [16]:
%%time
# без сортировки
result = rdd.map(lambda x: (x.genre, float("0" + x.rating_value))).groupByKey().mapValues(lambda x: sum(x)/len(x)).collect()

CPU times: user 9.59 ms, sys: 14.6 ms, total: 24.2 ms
Wall time: 643 ms


In [17]:
%%time
result = df.select(F.col('genre'),('0'+F.col('rating_value')).alias('rating_value')).fillna(0).groupBy('genre').agg(F.mean('rating_value')).sort('avg(rating_value)', ascending=False)
result.show()

+--------------------+------------------+
|               genre| avg(rating_value)|
+--------------------+------------------+
|              Dating| 4.022222222222222|
|                    |               3.8|
|              Comics|              3.71|
|            Strategy| 3.702597402597403|
|              Arcade| 3.654927536231884|
|              Action|3.6348703170028824|
|                Card| 3.620879120879121|
|Video Players & E...| 3.596482412060301|
|        Role Playing|3.5885245901639347|
|              Casino|3.5870967741935487|
|              Racing| 3.565068493150685|
|              Social| 3.558725761772853|
|               Music|3.5555555555555562|
|              Trivia|3.5503267973856203|
|   Maps & Navigation| 3.539153439153439|
|          Simulation|3.5203846153846152|
|     Auto & Vehicles| 3.512820512820512|
|              Casual| 3.480517711171662|
|              Puzzle|3.4775240384615387|
|             Finance| 3.464788732394366|
+--------------------+------------

In [18]:
%%time
#без сортировки
result = df.select(F.col('genre'),('0'+F.col('rating_value')).alias('rating_value')).fillna(0).groupBy('genre').agg(F.mean('rating_value'))
result.show()

+----------------+------------------+
|           genre| avg(rating_value)|
+----------------+------------------+
|   Music & Audio| 3.291372912801484|
|       Education| 3.289608723540731|
|          Trivia|3.5503267973856203|
| Auto & Vehicles| 3.512820512820512|
|   Entertainment| 3.238109756097561|
|       Adventure|3.3625899280575537|
|          Arcade| 3.654927536231884|
|          Sports|3.3360248447204968|
|  Travel & Local|3.1806291390728485|
|    Food & Drink|2.5132231404958674|
|    Role Playing|3.5885245901639347|
|         Finance| 3.464788732394366|
|          Racing| 3.565068493150685|
|           Tools|3.4037676609105176|
|     Educational| 3.278819444444445|
|          Comics|              3.71|
|          Social| 3.558725761772853|
|Libraries & Demo| 3.391304347826087|
|        Shopping| 3.344863013698631|
|Health & Fitness| 3.030622837370242|
+----------------+------------------+
only showing top 20 rows

CPU times: user 5.52 ms, sys: 894 µs, total: 6.42 ms
Wall time

# Отношение количества ревью к количеству скачиваний

In [19]:
downloads = {'': 1,
 '1 - 5': 5,
 '1,000 - 5,000': 5000,
 '1,000,000 - 5,000,000': 5000000,
 '10 - 50': 50,
 '10,000 - 50,000': 50000,
 '10,000,000 - 50,000,000': 50000000,
 '100 - 500': 500,
 '100,000 - 500,000': 500000,
 '100,000,000 - 500,000,000': 500000000,
 '5 - 10': 10,
 '5,000 - 10,000': 10000,
 '5,000,000 - 10,000,000': 10000000,
 '50 - 100': 100,
 '50,000 - 100,000': 100000,
 '500 - 1,000': 1000,
 '500,000 - 1,000,000': 1000000}

In [20]:
%%time
result = rdd.map(lambda x: (x.item_name, int('0'+x.review_number.replace(',', ''))/downloads[x.downloads])).sortBy(lambda x: x[1], ascending=False).collect()

CPU times: user 48.7 ms, sys: 9.88 ms, total: 58.6 ms
Wall time: 1.11 s


In [21]:
%%time
# без сортировки
result = rdd.map(lambda x: (x.item_name, int('0'+x.review_number.replace(',', ''))/downloads[x.downloads])).collect()

CPU times: user 13.3 ms, sys: 4.72 ms, total: 18 ms
Wall time: 349 ms


In [22]:
%%time
d_mapping = F.create_map([F.lit(x) for x in chain(*downloads.items())])
df.select(F.col('item_name'),(('0'+F.regexp_replace('review_number', ',', ''))/(d_mapping[df['downloads']])).alias('result')).sort('result', ascending=False).show()

+--------------------+------+
|           item_name|result|
+--------------------+------+
|Cheat for Clash R...|  21.0|
|              Soccer|   9.0|
|Battery Saver LED...|   6.0|
|      CashVideo Coin|   5.0|
|You-cam: Beauty m...|   3.0|
|     Modi Yuva Yojna|   3.0|
|        Dance Videos|   2.0|
|English Pronuncia...|   2.0|
| Drawing Dragon Ball|   2.0|
|          Fala Dercy|   2.0|
|Guide Super Mario...|   2.0|
|BIPOLAR SUNSHINE NEW|   2.0|
|Evergreen Hits Co...|   2.0|
|Free DragonSoul G...|   2.0|
|Coloring Book Of ...|   1.0|
|Free Novel - Crus...|   1.0|
|      DSLR HD Camera|   1.0|
|      The Diet Plans|   1.0|
|  Flash Call-SMS pro|   1.0|
|Best Design Roman...|   1.0|
+--------------------+------+
only showing top 20 rows

CPU times: user 19.4 ms, sys: 1.81 ms, total: 21.2 ms
Wall time: 313 ms


In [23]:
%%time
# без сортировки
d_mapping = F.create_map([F.lit(x) for x in chain(*downloads.items())])
df.select(F.col('item_name'),(('0'+F.regexp_replace('review_number', ',', ''))/(d_mapping[df['downloads']])).alias('result')).show()

+--------------------+---------+
|           item_name|   result|
+--------------------+---------+
|กลอน กำลังใจ by ค...|   0.0114|
|Cute Gift Dragon-...|  0.00466|
|Ultimate Screen B...|    0.002|
|    Connect Diamonds|  0.00222|
|  أذكار الصباح متجدد|    0.012|
|  Monster Hill Climb|     0.04|
|AutoRespons™ - Dr...|    0.014|
|               Botad|     null|
|Skin Care Tips in...|    0.001|
|SmartCircle Displ...|    0.001|
|   Islamic Ringtones|  9.42E-4|
|Relationship Tips...|     null|
|قرأن كامل عبد الب...|    0.004|
|              CAMERA|  0.00438|
|collage pip perfe...|     null|
|Wallpapers Dakar ...|    0.001|
|Fideo: Music Vide...|0.0065574|
|         94 To Score|     null|
| Soccer Online Stars|  0.00593|
|Telugu Immortal S...|     null|
+--------------------+---------+
only showing top 20 rows

CPU times: user 10.6 ms, sys: 8.64 ms, total: 19.3 ms
Wall time: 194 ms
