# Agenda - We have google playstore daatset containing information of differents apps installed, rating and versions and other details. We are going to do analysis and find out answers for below questions :
1. find out top 10 reviews given to the apps
2. Top 10 installed apps and distribution of type free or paid.
3. Category wise distribution of installed apps
4. Top paid apps
5. Top rated apps.

In [56]:
import os
import sys
from pyspark.sql import SparkSession
from lib import Utils, DataLoader
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.types import StringType, IntegerType, DoubleType


os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [57]:
spark = Utils.get_spark_session()

In [58]:
google_play_store_df = DataLoader.read_data(spark, "data/googleplaystore.csv")

In [59]:
google_play_store_df.count()

10841

In [60]:
google_play_store_df.show(1)

+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|      Genres|   Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M| 10,000+|Free|    0|      Everyone|Art & Design|January 7, 2018|      1.0.0|4.0.3 and up|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
only showing top 1 row



In [61]:
# check schema
google_play_store_df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [62]:
# data cleaning
google_play_store_df= google_play_store_df.drop("size", "Content Rating", "Last Updated", "Current Ver","Android Ver")

In [63]:
# after cleaning the data or removing the columns which are not required
google_play_store_df.show(2)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|500,000+|Free|    0|Art & Design;Pret...|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
only showing top 2 rows



In [64]:
google_play_store_df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)



In [65]:
# change the data type of columns which are incorrect
google_play_store_df = google_play_store_df.withColumn("Reviews", google_play_store_df['Reviews'].cast(IntegerType())) 
#google_play_store_df = google_play_store_df.withColumn("Installs", regexp_replace(google_play_store_df["Installs"], "[^0-9]", ""))
google_play_store_df = google_play_store_df.withColumn("Installs", regexp_replace(col("Installs"), "[^0-9]", ""))
google_play_store_df = google_play_store_df.withColumn("Installs", google_play_store_df['Installs'].cast(IntegerType())) 
#google_play_store_df = google_play_store_df.withColumn("Price", regexp_replace(google_play_store_df["Price"], "[$]", ""))
google_play_store_df = google_play_store_df.withColumn("Price", regexp_replace(col("Price"), "[$]", ""))
google_play_store_df = google_play_store_df.withColumn("Price", google_play_store_df['Price'].cast(IntegerType())) 
google_play_store_df = google_play_store_df.withColumn("Rating", google_play_store_df['Rating'].cast(DoubleType())) 


In [66]:
google_play_store_df.printSchema()


root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Genres: string (nullable = true)



In [67]:
# after changing the data type
google_play_store_df.show(2)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
only showing top 2 rows



In [68]:
# Now data is cleaned and is ready for analysis
google_play_store_df.createOrReplaceTempView("apps")

In [69]:
sql_df = spark.sql("select * from apps")
sql_df.show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|   50000|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_DESIGN|   4.1|  36815| 1000000

In [70]:
sql_df.count()

10841

In [71]:
# 1. find out top 10 reviews given to the apps
sql_df = spark.sql(" select app, sum(reviews) from apps group by 1 order by 2 desc")
sql_df.show()

+--------------------+------------+
|                 app|sum(reviews)|
+--------------------+------------+
|           Instagram|   266241989|
|  WhatsApp Messenger|   207348304|
|      Clash of Clans|   179558781|
|Messenger – Text ...|   169932272|
|      Subway Surfers|   166331958|
|    Candy Crush Saga|   156993136|
|            Facebook|   156286514|
|         8 Ball Pool|    99386198|
|        Clash Royale|    92530298|
|            Snapchat|    68045010|
|     Viber Messenger|    56675481|
|UC Browser - Fast...|    53140694|
|             YouTube|    51278853|
|        Temple Run 2|    48710930|
|Sniper 3D Gun Sho...|    46022233|
|      My Talking Tom|    44668928|
|Duolingo: Learn L...|    44047832|
|       Google Photos|    43423827|
|Clean Master- Spa...|    42916526|
|                 Pou|    41939801|
+--------------------+------------+
only showing top 20 rows



In [72]:
# 2. Top 10 installed apps and distribution of type free or paid.
sql_df = spark.sql(" select app, type , sum(installs) from apps group by 1,2 order by 3 desc")
sql_df.show()


+--------------------+----+-------------+
|                 app|type|sum(installs)|
+--------------------+----+-------------+
|      Subway Surfers|Free|   6000000000|
|           Instagram|Free|   4000000000|
|        Google Drive|Free|   4000000000|
|            Hangouts|Free|   4000000000|
|       Google Photos|Free|   4000000000|
|         Google News|Free|   4000000000|
|    Candy Crush Saga|Free|   3500000000|
|  WhatsApp Messenger|Free|   3000000000|
|               Gmail|Free|   3000000000|
|        Temple Run 2|Free|   3000000000|
|Skype - free IM &...|Free|   3000000000|
|Google Chrome: Fa...|Free|   3000000000|
|Messenger – Text ...|Free|   3000000000|
|Maps - Navigate &...|Free|   3000000000|
|     Viber Messenger|Free|   2500000000|
|   Google Play Games|Free|   2000000000|
|            Facebook|Free|   2000000000|
|            Snapchat|Free|   2000000000|
|imo free video ca...|Free|   2000000000|
|  Google Street View|Free|   2000000000|
+--------------------+----+-------

In [73]:
# 3. Category wise distribution of installed apps
sql_df = spark.sql(" select category, sum(installs) from apps group by 1 order by 2 desc")
sql_df.show()

+-------------------+-------------+
|           category|sum(installs)|
+-------------------+-------------+
|               GAME|  35086024415|
|      COMMUNICATION|  32647276251|
|       PRODUCTIVITY|  14176091369|
|             SOCIAL|  14069867902|
|              TOOLS|  11452771915|
|             FAMILY|  10258263505|
|        PHOTOGRAPHY|  10088247655|
| NEWS_AND_MAGAZINES|   7496317760|
|   TRAVEL_AND_LOCAL|   6868887146|
|      VIDEO_PLAYERS|   6222002720|
|           SHOPPING|   3247848785|
|      ENTERTAINMENT|   2869160000|
|    PERSONALIZATION|   2325494782|
|BOOKS_AND_REFERENCE|   1921469576|
|             SPORTS|   1751174498|
| HEALTH_AND_FITNESS|   1582072512|
|           BUSINESS|   1001914865|
|            FINANCE|    876648734|
|          EDUCATION|    871452000|
|MAPS_AND_NAVIGATION|    719281890|
+-------------------+-------------+
only showing top 20 rows



In [74]:
# 4. Top paid apps
sql_df = spark.sql(" select app, sum(price)  from apps where type = 'Paid' group by 1 order by 2 desc")
sql_df.show()

+--------------------+----------+
|                 app|sum(price)|
+--------------------+----------+
|I'm Rich - Trump ...|       400|
|   I Am Rich Premium|       399|
|  I AM RICH PRO PLUS|       399|
|I'm Rich/Eu sou R...|       399|
|      I am Rich Plus|       399|
|most expensive ap...|       399|
|       I Am Rich Pro|       399|
|  I am rich(premium)|       399|
|           I am Rich|       399|
|          I am Rich!|       399|
|         💎 I'm rich|       399|
|I am rich (Most e...|       399|
|           I am rich|       399|
|         Eu Sou Rico|       394|
|           I Am Rich|       389|
| I am extremely Rich|       379|
|       I am rich VIP|       299|
|        EP Cook Book|       200|
|Vargo Anesthesia ...|       158|
|       cronometra-br|       154|
+--------------------+----------+
only showing top 20 rows



In [78]:
# 5. Top rated apps.
sql_df = spark.sql("select app, rating from apps")
sql_df.show()

+--------------------+------+
|                 app|rating|
+--------------------+------+
|Photo Editor & Ca...|   4.1|
| Coloring book moana|   3.9|
|U Launcher Lite –...|   4.7|
|Sketch - Draw & P...|   4.5|
|Pixel Draw - Numb...|   4.3|
|Paper flowers ins...|   4.4|
|Smoke Effect Phot...|   3.8|
|    Infinite Painter|   4.1|
|Garden Coloring Book|   4.4|
|Kids Paint Free -...|   4.7|
|Text on Photo - F...|   4.4|
|Name Art Photo Ed...|   4.4|
|Tattoo Name On My...|   4.2|
|Mandala Coloring ...|   4.6|
|3D Color Pixel by...|   4.4|
|Learn To Draw Kaw...|   3.2|
|Photo Designer - ...|   4.7|
|350 Diy Room Deco...|   4.5|
|FlipaClip - Carto...|   4.3|
|        ibis Paint X|   4.6|
+--------------------+------+
only showing top 20 rows

