# Star Wars - Data Quality Project

In [1]:
!pip install pyspark;




In [1]:
import pyspark

### Know your machine resources

In [2]:
# CPU Cores
!sysctl -n hw.logicalcpu

# Memory in Bytes
!sysctl -n hw.memsize

8
17179869184


In [3]:
# Memory in GB
17179869184/(1024**3)

16.0

In [4]:
MAX_MEMORY = '4g'
MAX_MEMORY_OVERHEAD = '512m'
MAX_DRIVER_MEMORY = '1g'

### Setup Config

In [12]:
conf = pyspark.SparkConf().setMaster("local[2]") \
        .set('spark.executor.memory', MAX_MEMORY) \
        .set('spark.executor.memoryOverhead', MAX_MEMORY_OVERHEAD) \
        .set('spark.driver.memory', MAX_DRIVER_MEMORY) \
        .set("spark.driver.extraJavaOptions", "-XX:ReservedCodeCacheSize=256m") # code cache stores compiled code, and it can be full some times
        # .set('spark.executor.heartbeatInterval', 10000) \
        # .set('spark.network.timeout', 10000) \
        # .set('spark.core.connection.ack.wait.timeout', '3600') \

In [13]:
from pyspark.sql import SparkSession

### Initialize Spark

In [14]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("StarWars - Data Quality Project") \
        .config(conf=conf) \
        .getOrCreate()
    return spark

In [15]:
spark = init_spark()

In [16]:
df = spark.read.csv('star_wars_reviews.csv',header=True)

In [19]:
df.show(5)

+---------+----------------+--------------+--------------------+--------------------+-----------------+----------+------------+
|review_id|       fav_heroe|   fav_villain|            fav_film|      fav_soundtrack|    fav_spaceship|fav_planet|   fav_robot|
+---------+----------------+--------------+--------------------+--------------------+-----------------+----------+------------+
|        0|Anakin Skywalker|    Darth Maul|Episode IV - A Ne...|   Accross the Stars|Naboo Starfighter|  Tatooine|       R2-D2|
|        1|Anakin Skywalker|    Darth Maul|Episode IV - A Ne...|     The Throne Room|Naboo Starfighter|  Tatooine|Battle Droid|
|        2|  Luke Skywalker|   Count Dooku|Episode V - The E...|Star Wars (Main T...|Millennium Falcon|     Endor|       R2-D2|
|        3|Anakin Skywalker|Wilhuff Tarkin|Episode VI - Retu...|     The Throne Room|Millennium Falcon|  Tatooine|       R2-D2|
|        4|            Yoda|     Palpatine|Episode IV - A Ne...|Star Wars (Main T...|Millennium Falcon| 

### Dataset Overview

In [18]:
df.describe()

DataFrame[summary: string, review_id: string, fav_heroe: string, fav_villain: string, fav_film: string, fav_soundtrack: string, fav_spaceship: string, fav_planet: string, fav_robot: string]

In [20]:
df.rdd.getNumPartitions()


1

In [21]:
df = df.repartition(4)

In [22]:
df.rdd.getNumPartitions()

CodeCache: size=131072Kb used=28157Kb max_used=28159Kb free=102914Kb
 bounds [0x000000010a1f8000, 0x000000010bda8000, 0x00000001121f8000]
 total_blobs=11111 nmethods=10131 adapters=892
 compilation: disabled (not enough contiguous free space left)




4

In [25]:
def print_partition_rows(partition):
    print("Partition Starts ------------")
    c = 0
    for row in partition:
        c+=1
        print(row)
        if c==4:
            break

df.foreachPartition(print_partition_rows)


Partition Starts ------------
Row(review_id='13475', fav_heroe='Obi-Wan Kenobi', fav_villain='Darth Maul', fav_film='Episode VI - Return of the Jedi', fav_soundtrack='The Throne Room', fav_spaceship='Millennium Falcon', fav_planet='Dagobah', fav_robot='C-3PO')
Row(review_id='35284', fav_heroe='Anakin Skywalker', fav_villain='Count Dooku', fav_film='Episode V - The Empire Strikes Back', fav_soundtrack='Star Wars (Main Theme)', fav_spaceship='Millennium Falcon', fav_planet='Endor', fav_robot='R2-D2')
Row(review_id='11702', fav_heroe='Luke Skywalker', fav_villain='Wilhuff Tarkin', fav_film='Episode II - Attack of the Clones', fav_soundtrack='Star Wars (Main Theme)', fav_spaceship='Naboo Starfighter', fav_planet='Naboo', fav_robot='C-3PO')
Row(review_id='18327', fav_heroe='Chewbacca', fav_villain='Count Dooku', fav_film='Episode IV - A New Hope', fav_soundtrack='Star Wars (Main Theme)', fav_spaceship='Naboo Starfighter', fav_planet='Tatooine', fav_robot='R2-D2')
Partition Starts ----------

In [26]:
import time

start = time.time()
df.groupBy("fav_heroe").count().collect()
print("Time taken:", time.time() - start)


Time taken: 0.5376410484313965


In [27]:
df = df.coalesce(1)

In [28]:
import time

start = time.time()
df.groupBy("fav_heroe").count().collect()
print("Time taken:", time.time() - start)


Time taken: 0.18422389030456543


In [29]:
df = df.repartition(4)

In [30]:
import time

start = time.time()
df.groupBy("fav_heroe").count().collect()
print("Time taken:", time.time() - start)


Time taken: 0.17882966995239258


In [31]:
df.count()

36927