In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql import types as T

spark = SparkSession.builder.master("local[*]").appName('ex5_google_apps').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/23 18:01:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
#Define an array of Row objects that map age limits to content ratings.
age_limit_arr = [Row(age_limit=18, Content_Rating='Adults only 18+'),
Row(age_limit=17, Content_Rating='Mature 17+'),
Row(age_limit=12, Content_Rating='Teen'),
Row(age_limit=10, Content_Rating='Everyone 10+'),
Row(age_limit=0, Content_Rating='Everyone')]

print(age_limit_arr)

[Row(age_limit=18, Content_Rating='Adults only 18+'), Row(age_limit=17, Content_Rating='Mature 17+'), Row(age_limit=12, Content_Rating='Teen'), Row(age_limit=10, Content_Rating='Everyone 10+'), Row(age_limit=0, Content_Rating='Everyone')]


+---------+---------------+
|age_limit| Content Rating|
+---------+---------------+
|       18|Adults only 18+|
|       17|     Mature 17+|
|       12|           Teen|
|       10|   Everyone 10+|
|        0|       Everyone|
+---------+---------------+



In [3]:
#Read the Google Play Store data CSV into a DataFrame .
google_apps_df = spark.read.csv('s3a://spark/data/raw/google_apps/', header=True)
google_apps_df.printSchema()
google_apps_df.show

24/09/23 18:02:16 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



<bound method DataFrame.show of DataFrame[App: string, Category: string, Rating: string, Reviews: string, Size: string, Installs: string, Type: string, Price: string, Content Rating: string, Genres: string, Last Updated: string, Current Ver: string, Android Ver: string]>

In [4]:
#Convert the age limit mapping array to a DataFrame.
age_limit_df = spark.createDataFrame(age_limit_arr).withColumnRenamed('Content_Rating', 'Content Rating')
age_limit_df.show()



+---------+---------------+
|age_limit| Content Rating|
+---------+---------------+
|       18|Adults only 18+|
|       17|     Mature 17+|
|       12|           Teen|
|       10|   Everyone 10+|
|        0|       Everyone|
+---------+---------------+



                                                                                

In [5]:
google_apps_df.show(5)

+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+----------------+------------------+------------+
|                 App|      Category|Rating|Reviews|Size|   Installs|Type|Price|Content Rating|              Genres|    Last Updated|       Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+----------------+------------------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|    10,000+|Free|    0|      Everyone|        Art & Design| January 7, 2018|             1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|   500,000+|Free|    0|      Everyone|Art & Design;Pret...|January 15, 2018|             2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M| 5,000,000+|Free|    0|      Everyone|        Art & Design|  August 1, 2018|             1.2.4|4.0.3 and up|
|Ske

In [6]:
# Join the age_limit_df with the main DataFrame based on the 'Content Rating' column .
joined_df = google_apps_df.join(F.broadcast(age_limit_df), ['Content Rating'])

In [13]:
# Data Transformation & Cleaning:
# Relevant columns are selected and renamed.
# Non-numeric characters are removed from the 'Installs' column.
# The 'Price' column is cast to DoubleType.
# Missing ratings are replaced with -1.


selected_df = joined_df \
. select(F.col('App').alias('application_name'),
F.col('Category').alias('category') ,
F.col('Rating').alias('rating') ,
F.col('Reviews').cast(T.FloatType()).alias('reviews') ,
F.col('Size').alias('size'),
F.regexp_replace(F.col('Installs'), '[^0-9]', '').cast(T.DoubleType()).alias('num_of_installs'),
F.col('Price').cast(T.DoubleType()).alias('price') ,
F.col('age_limit') ,
F.col('Genres').alias('genres') ,
F.col('Current Ver').alias('version'))\
.fillna(-1, 'Rating')

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql import types as T
spark = SparkSession.builder.master("local").appName('ex5_google_apps').getOrCreate()
age_limit_arr = [Row(age_limit=18, Content_Rating='Adults only 18+'),
Row(age_limit=17, Content_Rating='Mature 17+'),
Row(age_limit=12, Content_Rating='Teen'),
Row(age_limit=10, Content_Rating='Everyone 10+'),
Row(age_limit=0, Content_Rating='Everyone')]
print(age_limit_arr)
google_apps_df = spark.read.csv('s3a://spark/data/raw/google_apps/', header=True)
google_apps_df.printSchema()
age_limit_df = spark.createDataFrame(age_limit_arr).withColumnRenamed('Content_Rating', 'Content Rating')
age_limit_df.show()
joined_df = google_apps_df.join(F.broadcast(age_limit_df), ['Content Rating'])
selected_df = joined_df \
.select(F.col('App').alias('application_name'),
F.col('Category').alias('category'),
F.col('Rating').alias('rating'),
F.col('Reviews').cast(T.FloatType()).alias('reviews'),
F.col('Size').alias('size'),
F.regexp_replace(F.col('Installs'), '[^0-9]', '').cast(T.DoubleType()).alias('num_of_installs'),
F.col('Price').cast(T.DoubleType()).alias('price'),
F.col('age_limit'),
F.col('Genres').alias('genres'),
F.col('Current Ver').alias('version')) \
.fillna(-1, 'Rating')
selected_df.show(6)
selected_df.printSchema()
selected_df.write.parquet('s3a://spark/data/source/google_apps', mode='overwrite')
spark.stop()