# >> imports

In [2]:
import pyspark.sql.dataframe
import pandas as pd 
from pyspark.sql import SparkSession

# Remove pandas' display limits on column width and column count so that
# wide frames and long cell values are rendered without truncation.
for _display_option in ("display.max_colwidth", "display.max_columns"):
    pd.set_option(_display_option, None)
# The row limit is left at its default; to lift it as well, enable:
# pd.set_option("display.max_rows", None)

def to_panda(spark_df: "pyspark.sql.dataframe.DataFrame") -> pd.DataFrame:
    """
    Convert a Spark DataFrame to a Pandas DataFrame.

    Every row is collected to the driver first, so this is only suitable
    for results that fit in driver memory.

    The column names are taken from ``spark_df.columns`` so that an empty
    Spark DataFrame still yields a pandas DataFrame carrying the same
    columns (with zero rows) instead of a frame with no columns at all.

    :param spark_df: the Spark DataFrame to convert
    :return: the pandas DataFrame, one row per collected Spark row
    """
    collected_rows = spark_df.collect()
    return pd.DataFrame(
        [row.asDict() for row in collected_rows],
        # Explicit columns keep the schema when there are no rows to infer it from.
        columns=spark_df.columns,
    )

In [4]:
# Name passed to the session builder's appName().
THE_SPARK_APP_NAME = "spark_internals_practice"

# Start from the shared builder, then apply each SQL setting in turn.
_session_builder = SparkSession.builder.appName(THE_SPARK_APP_NAME)
for _conf_key, _conf_value in (
    ("spark.sql.adaptive.enabled", "true"),
    # ("spark.sql.shuffle.partitions", 4),
    ("spark.sql.session.timeZone", "UTC"),
    ("spark.sql.broadcastTimeout", 12000),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)

# getOrCreate() hands back the session used by every later cell.
spark = _session_builder.getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

In [5]:
# Read the web UI address exposed by the session's SparkContext and print it
# so it can be opened in a browser while the cells below run.
web_ui_url = spark.sparkContext.uiWebUrl
print(f"Spark Web UI URL: {web_ui_url}")

Spark Web UI URL: http://juliuss-imac.lan:4040


In [13]:
# Load movies.json through the SparkContext text reader (an RDD of text lines,
# no JSON parsing happens here), then show the Python type of the result.
rdd1 = spark.sparkContext.textFile("movies.json")
type(rdd1)

pyspark.rdd.RDD

In [14]:
# Print the file line by line on the driver.
# foreach() would run print() on the executors, where the output is not
# guaranteed to reach the notebook or to arrive in file order; collect()
# brings the lines back to the driver first. Fine here because the file is small.
for line in rdd1.collect():
    print(line)

[
  {
    "_id": "Titanic_2001",
    "name": "Titanic",
    "year": 2001,
    "genre": [
      "romantic",
      "drama"
    ],
    "rating": 4,
    "description": "A seventeen-year-old aristocrat falls in love with a kind but poor artist aboard the luxurious, ill-fated R.M.S. Titanic"
  },
  {
    "_id": "Sunshine_2009",
    "name": "Sunshine",
    "year": 2004,
    "genre": [
      "sci-fi",
      "thriller"
    ],
    "rating": 3.5,
    "description": "A team of international astronauts are sent on a dangerous mission to reignite the dying Sun with a nuclear fission bomb in 2057."
  },
  {
    "_id": "Forest_2005",
    "name": "Forest Gump",
    "year": 2005,
    "genre": [
      "drama",
      "crime",
      "thriller"
    ],
    "rating": 3,
    "description": "A team of international astronauts are sent on a dangerous mission to reignite the dying Sun with a nuclear fission bomb in 2057."
  },
  {
    "_id": "Idiots_2004",
    "name": "3 Idiots",
    "year": 2004,
    "genre": [


In [16]:
# Parse movies.json into a DataFrame. The file is a pretty-printed JSON array
# whose records span several lines, so the reader is told to treat it as
# multi-line JSON rather than one record per line.
movies_df = spark.read.json('movies.json', multiLine=True)
movies_df.show()

+--------------+--------------------+--------------------+-------------------+------+----+
|           _id|         description|               genre|               name|rating|year|
+--------------+--------------------+--------------------+-------------------+------+----+
|  Titanic_2001|A seventeen-year-...|   [romantic, drama]|            Titanic|   4.0|2001|
| Sunshine_2009|A team of interna...|  [sci-fi, thriller]|           Sunshine|   3.5|2004|
|   Forest_2005|A team of interna...|[drama, crime, th...|        Forest Gump|   3.0|2005|
|   Idiots_2004|Two friends are s...|[drama, comedy, r...|           3 Idiots|   4.5|2004|
|Inception_2006|A thief who steal...|[action, adventur...|          Inception|   4.0|2001|
|     Wolf_2009|Based on the true...|     [action, drama]|Wolf of wall street|   3.5|2001|
+--------------+--------------------+--------------------+-------------------+------+----+


In [17]:
# Keep only the movies whose year is strictly after 2001.
df2 = movies_df.where('year>2001')

In [18]:
# Display the rows that passed the year filter.
df2.show()

+-------------+--------------------+--------------------+-----------+------+----+
|          _id|         description|               genre|       name|rating|year|
+-------------+--------------------+--------------------+-----------+------+----+
|Sunshine_2009|A team of interna...|  [sci-fi, thriller]|   Sunshine|   3.5|2004|
|  Forest_2005|A team of interna...|[drama, crime, th...|Forest Gump|   3.0|2005|
|  Idiots_2004|Two friends are s...|[drama, comedy, r...|   3 Idiots|   4.5|2004|
+-------------+--------------------+--------------------+-----------+------+----+


In [25]:
# Count, per year, the movies rated strictly above 3.0.
df3 = movies_df.filter('rating>3.0').groupby('year').count()
# Write the counts out as JSON. The default save mode raises if the target
# path already exists, which breaks any re-run of this cell; overwrite mode
# makes the cell repeatable.
df3.write.mode('overwrite').json('movies_rating_gt_3.json')