# Star Wars - Data Quality Project

In [1]:
!pip install pyspark;




In [2]:
import pyspark

### Know your machine resources

In [3]:
# CPU Cores
!sysctl -n hw.logicalcpu

# Memory in Bytes
!sysctl -n hw.memsize

8
17179869184


In [4]:
# Memory in GB
17179869184/(1024**3)

16.0

In [5]:
MAX_MEMORY = '4g'
MAX_MEMORY_OVERHEAD = '512m'
MAX_DRIVER_MEMORY = '1g'

### Setup Config

In [6]:
conf = pyspark.SparkConf().setMaster("local[2]") \
        .set('spark.executor.memory', MAX_MEMORY) \
        .set('spark.executor.memoryOverhead', MAX_MEMORY_OVERHEAD) \
        .set('spark.driver.memory', MAX_DRIVER_MEMORY) \
        .set("spark.driver.extraJavaOptions", "-XX:ReservedCodeCacheSize=256m") # code cache stores compiled code, and it can be full some times
        # .set('spark.executor.heartbeatInterval', 10000) \
        # .set('spark.network.timeout', 10000) \
        # .set('spark.core.connection.ack.wait.timeout', '3600') \

In [7]:
from pyspark.sql import SparkSession

### Initialize Spark

In [8]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("StarWars - Data Quality Project") \
        .config(conf=conf) \
        .getOrCreate()
    return spark

In [9]:
spark = init_spark()

25/04/28 09:02:08 WARN Utils: Your hostname, Gowthams-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.4.246 instead (on interface en0)
25/04/28 09:02:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/28 09:02:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
df = spark.read.csv('star_wars_reviews.csv',header=True)

In [11]:
df.show(5)

+---------+----------------+--------------+--------------------+--------------------+-----------------+----------+------------+
|review_id|       fav_heroe|   fav_villain|            fav_film|      fav_soundtrack|    fav_spaceship|fav_planet|   fav_robot|
+---------+----------------+--------------+--------------------+--------------------+-----------------+----------+------------+
|        0|Anakin Skywalker|    Darth Maul|Episode IV - A Ne...|   Accross the Stars|Naboo Starfighter|  Tatooine|       R2-D2|
|        1|Anakin Skywalker|    Darth Maul|Episode IV - A Ne...|     The Throne Room|Naboo Starfighter|  Tatooine|Battle Droid|
|        2|  Luke Skywalker|   Count Dooku|Episode V - The E...|Star Wars (Main T...|Millennium Falcon|     Endor|       R2-D2|
|        3|Anakin Skywalker|Wilhuff Tarkin|Episode VI - Retu...|     The Throne Room|Millennium Falcon|  Tatooine|       R2-D2|
|        4|            Yoda|     Palpatine|Episode IV - A Ne...|Star Wars (Main T...|Millennium Falcon| 

## Data Profiling
Understand the structure, missing values, distinct favorites, anomalies.

In [23]:
# Structure
df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- fav_heroe: string (nullable = true)
 |-- fav_villain: string (nullable = true)
 |-- fav_film: string (nullable = true)
 |-- fav_soundtrack: string (nullable = true)
 |-- fav_spaceship: string (nullable = true)
 |-- fav_planet: string (nullable = true)
 |-- fav_robot: string (nullable = true)



In [24]:
# Basic Statistics
df.describe().show()

25/04/28 09:09:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+------------------+----------------+--------------+--------------------+-----------------+-------------+----------+------------+
|summary|         review_id|       fav_heroe|   fav_villain|            fav_film|   fav_soundtrack|fav_spaceship|fav_planet|   fav_robot|
+-------+------------------+----------------+--------------+--------------------+-----------------+-------------+----------+------------+
|  count|             36927|           36927|         36927|               36927|            36927|        36927|     36927|       36927|
|   mean|           18463.0|            NULL|          NULL|                NULL|             NULL|         NULL|      NULL|        NULL|
| stddev|10660.051031772795|            NULL|          NULL|                NULL|             NULL|         NULL|      NULL|        NULL|
|    min|                 0|Anakin Skywalker|   Count Dooku|Episode I - The P...|Accross the Stars|   Death Star|  Alderaan|Battle Droid|
|    max|              9999|      

                                                                                

In [26]:
# 3. Missing/Null Counts
from pyspark.sql import functions as F

df.select([F.count(F.when(F.col(column).isNull(), column)).alias(column) for column in df.columns]).show()

+---------+---------+-----------+--------+--------------+-------------+----------+---------+
|review_id|fav_heroe|fav_villain|fav_film|fav_soundtrack|fav_spaceship|fav_planet|fav_robot|
+---------+---------+-----------+--------+--------------+-------------+----------+---------+
|        0|        0|          0|       0|             0|            0|         0|        0|
+---------+---------+-----------+--------+--------------+-------------+----------+---------+



In [28]:
# Distinct Values in Fav Columns
for col_name in ['fav_heroe',
 'fav_villain',
 'fav_film',
 'fav_soundtrack',
 'fav_spaceship',
 'fav_planet',
 'fav_robot']:
    df.select(col_name).distinct().show()

+----------------+
|       fav_heroe|
+----------------+
|        Han Solo|
|            Leia|
|    Qui-Gon Jinn|
|   Jar Jar Binks|
|            Yoda|
|  Luke Skywalker|
|  Obi-Wan Kenobi|
|Anakin Skywalker|
|       Chewbacca|
+----------------+

+----------------+
|     fav_villain|
+----------------+
|  Wilhuff Tarkin|
|General Grievous|
|       Palpatine|
|     Count Dooku|
|      Darth Maul|
|     Darth Vader|
+----------------+

+--------------------+
|            fav_film|
+--------------------+
|Episode I - The P...|
|Episode III - Rev...|
|Episode VI - Retu...|
|Episode II - Atta...|
|Episode IV - A Ne...|
|Episode V - The E...|
+--------------------+

+--------------------+
|      fav_soundtrack|
+--------------------+
|Star Wars (Main T...|
|      Imperial March|
|     The Throne Room|
|   Accross the Stars|
|  Anakin vs. Obi-Wan|
+--------------------+

+-----------------+
|    fav_spaceship|
+-----------------+
|       Death Star|
|Millennium Falcon|
|      TIE Fighter|
|N

In [30]:
# 5. Count distinct values in all columns
df.agg(*[F.countDistinct(column).alias(column) for column in df.columns]).show()

+---------+---------+-----------+--------+--------------+-------------+----------+---------+
|review_id|fav_heroe|fav_villain|fav_film|fav_soundtrack|fav_spaceship|fav_planet|fav_robot|
+---------+---------+-----------+--------+--------------+-------------+----------+---------+
|    36927|        9|          6|       6|             5|            4|         5|        4|
+---------+---------+-----------+--------+--------------+-------------+----------+---------+



In [13]:
df.rdd.getNumPartitions()


1

In [27]:
df.columns

['review_id',
 'fav_heroe',
 'fav_villain',
 'fav_film',
 'fav_soundtrack',
 'fav_spaceship',
 'fav_planet',
 'fav_robot']

In [14]:
df = df.repartition(4)

In [15]:
df.rdd.getNumPartitions()

CodeCache: size=262144Kb used=22089Kb max_used=22105Kb free=240054Kb
 bounds [0x000000010b1f8000, 0x000000010c7b8000, 0x000000011b1f8000]
 total_blobs=7964 nmethods=7044 adapters=832
 compilation: disabled (not enough contiguous free space left)




4

In [16]:
def print_partition_rows(partition):
    print("Partition Starts ------------")
    c = 0
    for row in partition:
        c+=1
        print(row)
        if c==4:
            break

df.foreachPartition(print_partition_rows)


Partition Starts ------------
Row(review_id='19100', fav_heroe='Yoda', fav_villain='Palpatine', fav_film='Episode V - The Empire Strikes Back', fav_soundtrack='Star Wars (Main Theme)', fav_spaceship='Millennium Falcon', fav_planet='Tatooine', fav_robot='R2-D2')
Partition Starts ------------
Row(review_id='28930', fav_heroe='Anakin Skywalker', fav_villain='Darth Vader', fav_film='Episode VI - Return of the Jedi', fav_soundtrack='Imperial March', fav_spaceship='Millennium Falcon', fav_planet='Endor', fav_robot='R2-D2')
Row(review_id='25133', fav_heroe='Yoda', fav_villain='Wilhuff Tarkin', fav_film='Episode V - The Empire Strikes Back', fav_soundtrack='Star Wars (Main Theme)', fav_spaceship='Naboo Starfighter', fav_planet='Tatooine', fav_robot='C-3PO')
Row(review_id='17821', fav_heroe='Han Solo', fav_villain='Wilhuff Tarkin', fav_film='Episode VI - Return of the Jedi', fav_soundtrack='Star Wars (Main Theme)', fav_spaceship='Millennium Falcon', fav_planet='Tatooine', fav_robot='R2-D2')
Row

In [17]:
import time

start = time.time()
df.groupBy("fav_heroe").count().collect()
print("Time taken:", time.time() - start)


Time taken: 0.4172990322113037


In [18]:
df = df.coalesce(1)

In [19]:
import time

start = time.time()
df.groupBy("fav_heroe").count().collect()
print("Time taken:", time.time() - start)


Time taken: 0.1858351230621338


In [20]:
df = df.repartition(4)

In [21]:
import time

start = time.time()
df.groupBy("fav_heroe").count().collect()
print("Time taken:", time.time() - start)


Time taken: 0.173295259475708


In [22]:
df.count()

36927

# Data Quality Checks