In [63]:
# PySpark Imports
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Regular Python imports
import os
import sys

In [64]:
# System paths
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Creating a Spark Session
spark = SparkSession.builder.appName('Playstore_Spark').getOrCreate()

### Create a DataFrame

In [65]:
df = spark.read.load('googleplaystore.csv', format='csv', header=True, sep=',', escape='"', inferSchema=True)

### Data cleaning

In [66]:
df = df.drop("size", "Content Rating", "Last Updated", "Android Ver", "Current Ver")

In [67]:
df = df.withColumn("Reviews", F.col("Reviews").cast(T.IntegerType()))\
    .withColumn("Installs",F.regexp_replace(F.col("Installs"), "[^0-9]",""))\
    .withColumn("Installs", F.col("Installs").cast(T.IntegerType()))\
    .withColumn("Price", F.regexp_replace(F.col("Price"), "[$]",""))\
    .withColumn("Price", F.round(F.col("Price").cast(T.FloatType()), 2))

In [68]:
df.createOrReplaceTempView("apps")

### Top reviews given to the apps

In [69]:
spark.sql("select App, sum(Reviews) as total_reviews from apps group by App order by total_reviews desc").show(10)

+--------------------+-------------+
|                 App|total_reviews|
+--------------------+-------------+
|           Instagram|    266241989|
|  WhatsApp Messenger|    207348304|
|      Clash of Clans|    179558781|
|Messenger – Text ...|    169932272|
|      Subway Surfers|    166331958|
|    Candy Crush Saga|    156993136|
|            Facebook|    156286514|
|         8 Ball Pool|     99386198|
|        Clash Royale|     92530298|
|            Snapchat|     68045010|
+--------------------+-------------+
only showing top 10 rows



### Top 10 installs per app

In [70]:
spark.sql("select App, Type, sum(Installs) as total_installs from apps group by App, Type order by total_installs desc").show()

+--------------------+----+--------------+
|                 App|Type|total_installs|
+--------------------+----+--------------+
|      Subway Surfers|Free|    6000000000|
|           Instagram|Free|    4000000000|
|        Google Drive|Free|    4000000000|
|            Hangouts|Free|    4000000000|
|       Google Photos|Free|    4000000000|
|         Google News|Free|    4000000000|
|    Candy Crush Saga|Free|    3500000000|
|  WhatsApp Messenger|Free|    3000000000|
|               Gmail|Free|    3000000000|
|        Temple Run 2|Free|    3000000000|
|Skype - free IM &...|Free|    3000000000|
|Google Chrome: Fa...|Free|    3000000000|
|Messenger – Text ...|Free|    3000000000|
|Maps - Navigate &...|Free|    3000000000|
|     Viber Messenger|Free|    2500000000|
|   Google Play Games|Free|    2000000000|
|            Facebook|Free|    2000000000|
|            Snapchat|Free|    2000000000|
|imo free video ca...|Free|    2000000000|
|  Google Street View|Free|    2000000000|
+----------

### Category wise distribution

In [71]:
spark.sql("""
    select Category, sum(Installs) as total_installs 
    from apps 
    group by Category
    order by total_installs desc"""
).show(10)

+------------------+--------------+
|          Category|total_installs|
+------------------+--------------+
|              GAME|   35086024415|
|     COMMUNICATION|   32647276251|
|      PRODUCTIVITY|   14176091369|
|            SOCIAL|   14069867902|
|             TOOLS|   11452771915|
|            FAMILY|   10258263505|
|       PHOTOGRAPHY|   10088247655|
|NEWS_AND_MAGAZINES|    7496317760|
|  TRAVEL_AND_LOCAL|    6868887146|
|     VIDEO_PLAYERS|    6222002720|
+------------------+--------------+
only showing top 10 rows



### Top paid apps

In [74]:
spark.sql("""
    select App, round(sum(Price),2) as price
    from apps
    where Type = 'Paid'
    group by App
    order by price desc"""
).show(10)

+--------------------+------+
|                 App| price|
+--------------------+------+
|I'm Rich - Trump ...| 400.0|
|most expensive ap...|399.99|
|           I am Rich|399.99|
|  I AM RICH PRO PLUS|399.99|
|  I am rich(premium)|399.99|
|      I am Rich Plus|399.99|
|I'm Rich/Eu sou R...|399.99|
|   I Am Rich Premium|399.99|
|       I Am Rich Pro|399.99|
|          I am Rich!|399.99|
+--------------------+------+
only showing top 10 rows

