In [1]:
!pip install pyspark==3.5.0 findspark

Collecting pyspark==3.5.0
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 3.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=030392da4d852e5e114ba190ad13500bc8734313a912a60d44525a5749905419
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: findspark, pyspark
Successfully installed findspark-2.0.1 pyspark-3.5.0


In [2]:
# Run findspark's setup before any `pyspark` import so the Spark installation
# is discoverable from this kernel.
# NOTE(review): pyspark is pip-installed in this notebook, so this step may be
# redundant — confirm before removing.
import findspark
findspark.init()

In [3]:
# Movie catalogue: one record per movie, movie_id 1 through 6.
movies_data = [
    dict(movie_id=1, title="The Galaxy Quest", genre="Sci-Fi/Comedy", release_date="1999-12-25"),
    dict(movie_id=2, title="Cosmic Wars: Episode X", genre="Sci-Fi", release_date="2023-05-15"),
    dict(movie_id=3, title="Love & Time Travel", genre="Romance/Sci-Fi", release_date="2022-02-14"),
    dict(movie_id=4, title="Dino Park Adventures", genre="Action/Family", release_date="2021-08-20"),
    dict(movie_id=5, title="Interstellar Dreams", genre="Sci-Fi/Drama", release_date="2018-11-02"),
    dict(movie_id=6, title="The Cosmic Comedy Classic", genre="Comedy/Sci-Fi", release_date="1985-09-20"),
]

# Reviews reference movies through movie_id; movies 4 and 5 have no review here.
reviews_data = [
    dict(review_id=1, movie_id=1, rating=4.5, review_text="A hilarious and surprisingly heartfelt space romp!"),
    dict(review_id=2, movie_id=3, rating=3.0, review_text="Decent effects, but the plot was predictable."),
    dict(review_id=3, movie_id=2, rating=4.0, review_text="Visually stunning, but lacked depth."),
    # Points at movie_id 6, the last entry of movies_data.
    dict(review_id=4, movie_id=6, rating=2.5, review_text="Some funny moments, but overall felt dated."),
]

# Box-office figures keyed by movie_id. movie_id 7 has no entry in movies_data,
# and movies 1, 5 and 6 have no figures, so the two sets only partly overlap.
box_office_data = [
    dict(movie_id=2, domestic_gross=350000000, international_gross=580000000),
    dict(movie_id=3, domestic_gross=180000000, international_gross=120000000),
    dict(movie_id=4, domestic_gross=220000000, international_gross=300000000),
    dict(movie_id=7, domestic_gross=5500000, international_gross=12000000),
]


In [4]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one named "MovieExample".
spark = (
    SparkSession.builder
    .appName("MovieExample")
    .getOrCreate()
)

# Build one DataFrame per in-memory dataset, in the same order as before;
# the columns come from the keys of the dict records.
movies_df, reviews_df, box_office_df = (
    spark.createDataFrame(records)
    for records in (movies_data, reviews_data, box_office_data)
)


In [12]:
# Inner join (the default `how`, spelled out here): keeps only movie_ids that
# appear in both frames — 2, 3 and 4 for this sample data.
joined_df = movies_df.join(
    box_office_df,
    on="movie_id",
    how="inner",
)

In [13]:
# Print the inner-join result; truncate=False keeps long titles unabbreviated.
joined_df.show(truncate=False)

+--------+--------------+------------+----------------------+--------------+-------------------+
|movie_id|genre         |release_date|title                 |domestic_gross|international_gross|
+--------+--------------+------------+----------------------+--------------+-------------------+
|2       |Sci-Fi        |2023-05-15  |Cosmic Wars: Episode X|350000000     |580000000          |
|3       |Romance/Sci-Fi|2022-02-14  |Love & Time Travel    |180000000     |120000000          |
|4       |Action/Family |2021-08-20  |Dino Park Adventures  |220000000     |300000000          |
+--------+--------------+------------+----------------------+--------------+-------------------+



In [9]:
# Left join: every movie is kept; movies without box-office figures
# (ids 1, 5 and 6 here) get NULL in the gross columns.
left_join_df = movies_df.join(
    box_office_df,
    on="movie_id",
    how="left",
)

In [10]:
# Print the left-join result with full-width column values.
left_join_df.show(truncate=False)

+--------+--------------+------------+-------------------------+--------------+-------------------+
|movie_id|genre         |release_date|title                    |domestic_gross|international_gross|
+--------+--------------+------------+-------------------------+--------------+-------------------+
|1       |Sci-Fi/Comedy |1999-12-25  |The Galaxy Quest         |NULL          |NULL               |
|3       |Romance/Sci-Fi|2022-02-14  |Love & Time Travel       |180000000     |120000000          |
|2       |Sci-Fi        |2023-05-15  |Cosmic Wars: Episode X   |350000000     |580000000          |
|6       |Comedy/Sci-Fi |1985-09-20  |The Cosmic Comedy Classic|NULL          |NULL               |
|5       |Sci-Fi/Drama  |2018-11-02  |Interstellar Dreams      |NULL          |NULL               |
|4       |Action/Family |2021-08-20  |Dino Park Adventures     |220000000     |300000000          |
+--------+--------------+------------+-------------------------+--------------+-------------------+


In [7]:
# Full outer join: keeps rows from both sides. Unmatched movies get NULL gross
# columns, and the unmatched box-office row (movie_id 7) gets NULL movie columns.
full_join_df = movies_df.join(
    box_office_df,
    on="movie_id",
    how="full",
)

In [8]:
# Print the full-outer-join result with full-width column values.
full_join_df.show(truncate=False)

+--------+--------------+------------+-------------------------+--------------+-------------------+
|movie_id|genre         |release_date|title                    |domestic_gross|international_gross|
+--------+--------------+------------+-------------------------+--------------+-------------------+
|1       |Sci-Fi/Comedy |1999-12-25  |The Galaxy Quest         |NULL          |NULL               |
|2       |Sci-Fi        |2023-05-15  |Cosmic Wars: Episode X   |350000000     |580000000          |
|3       |Romance/Sci-Fi|2022-02-14  |Love & Time Travel       |180000000     |120000000          |
|4       |Action/Family |2021-08-20  |Dino Park Adventures     |220000000     |300000000          |
|5       |Sci-Fi/Drama  |2018-11-02  |Interstellar Dreams      |NULL          |NULL               |
|6       |Comedy/Sci-Fi |1985-09-20  |The Cosmic Comedy Classic|NULL          |NULL               |
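|7       |NULL          |NULL        |NULL                     |5500000       |12000000           |
+--------+--------------+------------+-------------------------+--------------+-------------------+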


In [5]:
# Right join: every box-office row is kept; movie_id 7 has no matching movie,
# so its genre, release_date and title come back as NULL.
right_join_df = movies_df.join(
    box_office_df,
    on="movie_id",
    how="right",
)

In [6]:
# Print the right-join result with full-width column values.
right_join_df.show(truncate=False)

+--------+--------------+------------+----------------------+--------------+-------------------+
|movie_id|genre         |release_date|title                 |domestic_gross|international_gross|
+--------+--------------+------------+----------------------+--------------+-------------------+
|3       |Romance/Sci-Fi|2022-02-14  |Love & Time Travel    |180000000     |120000000          |
|2       |Sci-Fi        |2023-05-15  |Cosmic Wars: Episode X|350000000     |580000000          |
|7       |NULL          |NULL        |NULL                  |5500000       |12000000           |
|4       |Action/Family |2021-08-20  |Dino Park Adventures  |220000000     |300000000          |
+--------+--------------+------------+----------------------+--------------+-------------------+

