In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

conf = SparkConf().setAppName("spark_join")\
                  .setMaster("spark://workspace:7077")
sparkContext = SparkContext(conf=conf)
spark = SparkSession(sparkContext=sparkContext)

23/12/23 14:19:58 WARN Utils: Your hostname, workspace resolves to a loopback address: 127.0.0.1; using 220.118.158.128 instead (on interface eno1)
23/12/23 14:19:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/23 14:20:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
csv_dir = "file:///home/hooniegit/git/study/rdd-manufacture/data/movielens/ml-latest"
links = spark.read.csv(f"{csv_dir}/links.csv", header=True, inferSchema=True)
movies = spark.read.csv(f"{csv_dir}/movies.csv", header=True, inferSchema=True)
ratings = spark.read.csv(f"{csv_dir}/ratings.csv", header=True, inferSchema=True)
tags = spark.read.csv(f"{csv_dir}/tags.csv", header=True, inferSchema=True)

links.createOrReplaceTempView("links")
movies.createOrReplaceTempView("movies")
ratings.createOrReplaceTempView("ratings")
tags.createOrReplaceTempView("tags")

                                                                                

In [9]:
tags.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [7]:
df_1 = spark.sql(
    """
        SELECT
            *
        FROM
            movies m
        LEFT JOIN
            ratings r
        ON
            m.movieId = r.movieId
        ORDER BY
            m.movieId DESC  
    """
)
df_1.show()



+-------+--------------------+--------------------+------+-------+------+----------+
|movieId|               title|              genres|userId|movieId|rating| timestamp|
+-------+--------------------+--------------------+------+-------+------+----------+
| 288983|UNZIPPED: An Auto...|         Documentary|254114| 288983|   3.0|1689834886|
| 288977|Skinford: Death S...|      Crime|Thriller|291389| 288977|   3.0|1689815902|
| 288975|The Men Who Made ...|         Documentary|154483| 288975|   4.0|1689812351|
| 288971|  Ouija Japan (2021)|       Action|Horror| 98408| 288971|   0.5|1689798322|
| 288967|State of Siege: T...|        Action|Drama| 47791| 288967|   3.5|1689748357|
| 288965|     Камертон (1979)|             Romance|167321| 288965|   2.5|1689747309|
| 288959|Letters Of Happin...|      Children|Drama|180878| 288959|   2.0|1689723318|
| 288957|Ballet Of Blood (...|              Horror| 11969| 288957|   1.0|1689719936|
| 288955|Agata's Friends (...|               Drama|308174| 288955

                                                                                

In [6]:
df_1.count()

                                                                                

33832162

In [19]:
df_2 = spark.sql(
    """
        SELECT
            m.movieId,
            m.title,
            m.genres,
            l.imdbId,
            l.tmdbId,
            r.userId,
            r.rating,
            r.timestamp,
            t.tag
        FROM
            movies m
        LEFT JOIN
            links l
        ON
            m.movieId = l.movieId
        LEFT JOIN
            ratings r
        ON
            m.movieId = r.movieId
        INNER JOIN
            tags t
        ON
            r.userId = t.userId
            AND
            r.movieId = t.movieId
    """
)
df_2.show()

[Stage 121:>                                                        (0 + 1) / 1]

+-------+--------------------+--------------------+-------+------+------+------+----------+------------------+
|movieId|               title|              genres| imdbId|tmdbId|userId|rating| timestamp|               tag|
+-------+--------------------+--------------------+-------+------+------+------+----------+------------------+
| 111759|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX|1631867|137113|    37|   5.0|1578167954|            action|
| 111759|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX|1631867|137113|    37|   5.0|1578167954|            comedy|
| 111759|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX|1631867|137113|    37|   5.0|1578167954|            sci-fi|
|  30810|Life Aquatic with...|Adventure|Comedy|...| 362270|   421|   137|   4.0|1529288440|       Bill Murray|
|  30810|Life Aquatic with...|Adventure|Comedy|...| 362270|   421|   137|   4.0|1529288440|    Cate Blanchett|
|  30810|Life Aquatic with...|Adventure|Comedy|...| 362270|   421|   137|   4.0|1529288440|  great soundtrack|
|

                                                                                

In [20]:
df_2.count()

                                                                                

1729292

In [21]:
df_2.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- tag: string (nullable = true)



In [22]:
df_2.createOrReplaceTempView("df")

In [23]:
df_3 = spark.sql(
    """
        SELECT
            SUBSTRING(tag FROM 1 FOR 2) AS extracted_chars
        FROM
            df
    """
)
df_3.show()

[Stage 150:>                                                        (0 + 1) / 1]

+---------------+
|extracted_chars|
+---------------+
|             ac|
|             co|
|             sc|
|             Bi|
|             Ca|
|             gr|
|             qu|
|             vi|
|             We|
|             Wi|
|             ci|
|             Bi|
|             ch|
|             lo|
|             lo|
|             ro|
|             ti|
|             to|
|             un|
|             bl|
+---------------+
only showing top 20 rows



                                                                                

In [26]:
df_4 = spark.sql(
    """
        SELECT DISTINCT
            title,
            SPLIT_PART(genres, '|', 1) AS genre
        FROM
            df
    """
)
df_4.show()



+--------------------+------------------+
|               title|             genre|
+--------------------+------------------+
|    Firm, The (1993)|             Drama|
|The Duke of Burgu...|             Drama|
|         Cube (1997)|            Horror|
|Erin Brockovich (...|             Drama|
|Irreversible (Irr...|             Crime|
|Millennium Actres...|         Animation|
|About Schmidt (2002)|            Comedy|
|Bill & Ted's Exce...|         Adventure|
|     Spectral (2016)|(no genres listed)|
|The Accountant (2...|             Crime|
|Rosemary's Baby (...|             Drama|
|Sixth Sense, The ...|             Drama|
|       Saw II (2005)|            Horror|
|   Magic Mike (2012)|             Drama|
|New World (Shin-s...|          Thriller|
|The Lighthouse (2...|             Drama|
|A Man Called Ove ...|            Comedy|
|    Desperado (1995)|            Action|
|      Oddball (2015)|          Children|
|Xiu Xiu: The Sent...|             Drama|
+--------------------+------------

                                                                                

In [28]:
df_5 = spark.sql(
    """
        SELECT DISTINCT
            CONCAT(m.column1, ' ', m.column2) AS concat_result
        FROM 
            (
                SELECT
                    SPLIT_PART(genres, "|", 1) column1,
                    SPLIT_PART(genres, "|", 2) column2
                FROM
                    df
            ) m
    """
)
df_5.show()



+-------------------+
|      concat_result|
+-------------------+
| Comedy Documentary|
|   Fantasy Thriller|
|    Action Children|
|Documentary Mystery|
|           Musical |
|    Animation Crime|
|         Film-Noir |
|         Action War|
|         Adventure |
|  Adventure Mystery|
|    Fantasy Romance|
|   Children Fantasy|
|   Adventure Horror|
|  Animation Mystery|
|    Children Horror|
|      Drama Musical|
|    Horror Thriller|
|       Drama Sci-Fi|
|Documentary Fantasy|
|      Action Comedy|
+-------------------+
only showing top 20 rows



                                                                                

In [32]:
df_6 = spark.sql(
    """
        SELECT
            title,
            genres,
            CASE
                WHEN genres LIKE '%Action%' THEN REPLACE(genres, 'Action', 'UNKNOWN')
                ELSE genres
            END updated_genres
        FROM df;
    """
)
df_6.show()

[Stage 271:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|               title|              genres|      updated_genres|
+--------------------+--------------------+--------------------+
|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX| UNKNOWN|Sci-Fi|IMAX|
|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX| UNKNOWN|Sci-Fi|IMAX|
|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX| UNKNOWN|Sci-Fi|IMAX|
|Life Aquatic with...|Adventure|Comedy|...|Adventure|Comedy|...|
|Life Aquatic with...|Adventure|Comedy|...|Adventure|Comedy|...|
|Life Aquatic with...|Adventure|Comedy|...|Adventure|Comedy|...|
|Life Aquatic with...|Adventure|Comedy|...|Adventure|Comedy|...|
|Life Aquatic with...|Adventure|Comedy|...|Adventure|Comedy|...|
|Life Aquatic with...|Adventure|Comedy|...|Adventure|Comedy|...|
|Life Aquatic with...|Adventure|Comedy|...|Adventure|Comedy|...|
|Killing Them Soft...|Crime|Drama|Thriller|Crime|Drama|Thriller|
|   About Time (2013)|Drama|Fantasy|Rom...|Drama|Fantasy|Rom...|
|   About Time (2013)|Dra

                                                                                

In [35]:
df_7 = spark.sql(
    """
        SELECT
            title,
            CASE
                WHEN genre = "Romance" THEN REPLACE(genre, "Romance", "UNKNOWN")
                ELSE genre
            END genre_fixed
        FROM
            (
                SELECT
                    title,
                    SPLIT_PART(genres, "|", 1) genre
                FROM
                    df
            ) m
    """
)
df_7.show()

[Stage 283:>                                                        (0 + 1) / 1]

+--------------------+-----------+
|               title|genre_fixed|
+--------------------+-----------+
|Edge of Tomorrow ...|     Action|
|Edge of Tomorrow ...|     Action|
|Edge of Tomorrow ...|     Action|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Killing Them Soft...|      Crime|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|Manchester by the...|      Drama|
+--------------------+-----------+
only showing top 20 rows



                                                                                

In [36]:
df_8 = spark.sql(
    """
        SELECT
            title,
            REPLACE(genre, 'Romance', 'UNKNOWN') genre_fixed
        FROM (
                SELECT
                    title,
                    SPLIT_PART(genres, '|', 1) AS genre
                FROM
                    df
            ) m
    """
)
df_8.show()

[Stage 295:>                                                        (0 + 1) / 1]

+--------------------+-----------+
|               title|genre_fixed|
+--------------------+-----------+
|Edge of Tomorrow ...|     Action|
|Edge of Tomorrow ...|     Action|
|Edge of Tomorrow ...|     Action|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Life Aquatic with...|  Adventure|
|Killing Them Soft...|      Crime|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|   About Time (2013)|      Drama|
|Manchester by the...|      Drama|
+--------------------+-----------+
only showing top 20 rows



                                                                                

In [51]:
df_9 = spark.sql(
    """
        SELECT
            timestamp,
            CAST(timestamp AS TIMESTAMP) datetime
        FROM
            df
    """
)
df_9.show()

[Stage 343:>                                                        (0 + 1) / 1]

+----------+-------------------+
| timestamp|           datetime|
+----------+-------------------+
|1578167954|2020-01-05 04:59:14|
|1578167954|2020-01-05 04:59:14|
|1578167954|2020-01-05 04:59:14|
|1529288440|2018-06-18 11:20:40|
|1529288440|2018-06-18 11:20:40|
|1529288440|2018-06-18 11:20:40|
|1529288440|2018-06-18 11:20:40|
|1529288440|2018-06-18 11:20:40|
|1529288440|2018-06-18 11:20:40|
|1529288440|2018-06-18 11:20:40|
|1617154740|2021-03-31 10:39:00|
|1595825008|2020-07-27 13:43:28|
|1595825008|2020-07-27 13:43:28|
|1595825008|2020-07-27 13:43:28|
|1595825008|2020-07-27 13:43:28|
|1595825008|2020-07-27 13:43:28|
|1595825008|2020-07-27 13:43:28|
|1595825008|2020-07-27 13:43:28|
|1595825008|2020-07-27 13:43:28|
|1603769528|2020-10-27 12:32:08|
+----------+-------------------+
only showing top 20 rows



                                                                                

In [49]:
df_9.printSchema()

root
 |-- timestamp: integer (nullable = true)
 |-- datetime: timestamp (nullable = true)



In [61]:
df_new = spark.sql(
    """
        SELECT
            title,
            FROM_UNIXTIME(timestamp) datetime,
            DATE(FROM_UNIXTIME(timestamp)) date,
            YEAR(FROM_UNIXTIME(timestamp)) year,
            MONTH(FROM_UNIXTIME(timestamp)) month,
            DAY(FROM_UNIXTIME(timestamp)) day
        FROM
            df
    """
)
df_new.show()

[Stage 391:>                                                        (0 + 1) / 1]

+--------------------+-------------------+----------+----+-----+---+
|               title|           datetime|      date|year|month|day|
+--------------------+-------------------+----------+----+-----+---+
|Edge of Tomorrow ...|2020-01-05 04:59:14|2020-01-05|2020|    1|  5|
|Edge of Tomorrow ...|2020-01-05 04:59:14|2020-01-05|2020|    1|  5|
|Edge of Tomorrow ...|2020-01-05 04:59:14|2020-01-05|2020|    1|  5|
|Life Aquatic with...|2018-06-18 11:20:40|2018-06-18|2018|    6| 18|
|Life Aquatic with...|2018-06-18 11:20:40|2018-06-18|2018|    6| 18|
|Life Aquatic with...|2018-06-18 11:20:40|2018-06-18|2018|    6| 18|
|Life Aquatic with...|2018-06-18 11:20:40|2018-06-18|2018|    6| 18|
|Life Aquatic with...|2018-06-18 11:20:40|2018-06-18|2018|    6| 18|
|Life Aquatic with...|2018-06-18 11:20:40|2018-06-18|2018|    6| 18|
|Life Aquatic with...|2018-06-18 11:20:40|2018-06-18|2018|    6| 18|
|Killing Them Soft...|2021-03-31 10:39:00|2021-03-31|2021|    3| 31|
|   About Time (2013)|2020-07-27 1

                                                                                

In [62]:
df_new.printSchema()

root
 |-- title: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [64]:
df_10 = spark.sql(
    """
        SELECT 
            *,
            NULL new_column
        FROM
            df
    """
)
df_10.show()

[Stage 403:>                                                        (0 + 1) / 1]

+-------+--------------------+--------------------+-------+------+------+------+----------+------------------+----------+
|movieId|               title|              genres| imdbId|tmdbId|userId|rating| timestamp|               tag|new_column|
+-------+--------------------+--------------------+-------+------+------+------+----------+------------------+----------+
| 111759|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX|1631867|137113|    37|   5.0|1578167954|            action|      NULL|
| 111759|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX|1631867|137113|    37|   5.0|1578167954|            comedy|      NULL|
| 111759|Edge of Tomorrow ...|  Action|Sci-Fi|IMAX|1631867|137113|    37|   5.0|1578167954|            sci-fi|      NULL|
|  30810|Life Aquatic with...|Adventure|Comedy|...| 362270|   421|   137|   4.0|1529288440|       Bill Murray|      NULL|
|  30810|Life Aquatic with...|Adventure|Comedy|...| 362270|   421|   137|   4.0|1529288440|    Cate Blanchett|      NULL|
|  30810|Life Aquatic wi

                                                                                

In [73]:
df_11 = spark.sql(
    """
        SELECT
            userId,
            movieId,
            AVG(rating)
        FROM
            df
        GROUP BY
            CUBE(userId, movieId)
    """
)
df_11.show()



+------+-------+-----------+
|userId|movieId|avg(rating)|
+------+-------+-----------+
|  4776|   4487|        2.0|
|  9138| 105504|        3.0|
| 15869|  58490|        3.5|
| 23936|   1979|        4.0|
| 33191| 103228|        2.0|
| 48392|   1198|        5.0|
| 54964|   4448|        3.5|
| 54976|    318|        4.0|
| 58115| 157645|        3.0|
| 64903|    260|        5.0|
| 65508|   1148|        5.0|
| 67371|  42723|        4.0|
| 68714|   2918|        2.5|
| 74305|   6870|        4.0|
| 75656|  40614|        3.0|
| 97841|   6016|        5.0|
|102081|   2772|        3.5|
|106986|   2321|        5.0|
|109540|    322|        3.5|
|110878|  93991|        4.0|
+------+-------+-----------+
only showing top 20 rows



                                                                                

In [76]:
df_11_2 = spark.sql(
    """
        SELECT
            userId,
            movieId,
            AVG(rating)
        FROM
            df
        WHERE
            movieId IS NULL
        GROUP BY
            CUBE(userId, movieId)
    """
)
df_11_2.show()

[Stage 566:(18 + 12) / 30][Stage 568:(10 + 11) / 21][Stage 569:>  (0 + 1) / 1]1]

+------+-------+-----------+
|userId|movieId|avg(rating)|
+------+-------+-----------+
+------+-------+-----------+



                                                                                

In [78]:
df_11_3 = spark.sql(
    """
        SELECT
            userId,
            movieId,
            AVG(rating)
        FROM
            df
        GROUP BY
            CUBE(userId, movieId)
        HAVING
            movieId IS NULL
    """
)
df_11_3.show()



+------+-------+------------------+
|userId|movieId|       avg(rating)|
+------+-------+------------------+
|  2612|   NULL| 3.745798319327731|
|  7570|   NULL|3.8206140350877194|
|  9138|   NULL|3.6265380778184237|
| 22760|   NULL|              4.76|
| 24124|   NULL|3.7107438016528924|
| 40914|   NULL|3.9656549520766773|
| 53440|   NULL|3.5621783876500857|
| 64075|   NULL|              4.75|
|142955|   NULL|4.3052631578947365|
|153618|   NULL|3.5315315315315314|
|154895|   NULL|               3.5|
|158762|   NULL| 4.642857142857143|
|173607|   NULL|3.7916666666666665|
|186899|   NULL|               5.0|
|190361|   NULL|              3.75|
|191380|   NULL|3.4761904761904763|
|210820|   NULL|             3.875|
|219386|   NULL|3.9004065040650406|
|232216|   NULL|            4.5125|
|247014|   NULL|            4.0625|
+------+-------+------------------+
only showing top 20 rows



                                                                                

In [80]:
df_11_4 = spark.sql(
    """
        SELECT
            *
        FROM
            (
                SELECT
                    userId,
                    movieId,
                    AVG(rating)
                FROM
                    df
                GROUP BY
                    CUBE(userId, movieId)
            )
        WHERE
            userId IS NULL
    """
)
df_11_4.show()



+------+-------+------------------+
|userId|movieId|       avg(rating)|
+------+-------+------------------+
|  NULL|  45950| 2.369688385269122|
|  NULL| 149334| 3.959051724137931|
|  NULL|  87232| 3.065356004250797|
|  NULL|   3101| 3.378640776699029|
|  NULL| 217465|3.5713101160862353|
|  NULL|  47518|3.7976190476190474|
|  NULL|  70336|1.3766114180478821|
|  NULL|  77328| 2.783333333333333|
|  NULL|    741| 4.334033613445378|
|  NULL| 103042| 3.388246628131021|
|  NULL|    342|3.8897637795275593|
|  NULL|   3696|3.1066666666666665|
|  NULL| 177867| 3.607142857142857|
|  NULL|   7790|              3.75|
|  NULL|   2353|3.9385245901639343|
|  NULL|   2183| 3.686991869918699|
|  NULL|    616|3.6370967741935485|
|  NULL|  71322|3.3666666666666667|
|  NULL| 216671|               3.0|
|  NULL| 131714|               3.3|
+------+-------+------------------+
only showing top 20 rows



                                                                                

In [79]:
df_12 = spark.sql(
    """
        SELECT
            userId,
            movieId,
            AVG(rating)
        FROM
            df
        GROUP BY
            ROLLUP(userId, movieId)
        HAVING
            movieId IS NULL
    """
)
df_12.show()



+------+-------+------------------+
|userId|movieId|       avg(rating)|
+------+-------+------------------+
|  2612|   NULL| 3.745798319327731|
|  7570|   NULL|3.8206140350877194|
|  9138|   NULL|3.6265380778184237|
| 14960|   NULL|               4.5|
| 15830|   NULL|              4.75|
| 24124|   NULL|3.7107438016528924|
| 40914|   NULL|3.9656549520766773|
| 53440|   NULL|3.5621783876500857|
|102058|   NULL|            4.8125|
|117314|   NULL|3.6914893617021276|
|124416|   NULL|            3.6875|
|142221|   NULL|               5.0|
|153618|   NULL|3.5315315315315314|
|158762|   NULL| 4.642857142857143|
|168788|   NULL|               4.1|
|172736|   NULL| 4.815384615384615|
|173640|   NULL|               4.5|
|175141|   NULL| 3.923076923076923|
|195858|   NULL|  4.87719298245614|
|210820|   NULL|             3.875|
+------+-------+------------------+
only showing top 20 rows



                                                                                

In [84]:
# OUTER JOIN 사용 시, AVG(r.rating) 값이 없을 수도 있으므로 userId 기준으로 movieId를 GroupBy를 해 주어야 함
df_13 = spark.sql(
    """
        SELECT
            r.userId,
            m.movieId,
            AVG(r.rating)
        FROM
            movies m
        LEFT JOIN
            ratings r
        ON
            m.movieId = r.movieId
        GROUP BY
            r.userId, m.movieId
    """
)
df_13.show()

[Stage 642:>                                                        (0 + 1) / 1]

+------+-------+-----------+
|userId|movieId|avg(rating)|
+------+-------+-----------+
|114125|     31|        3.0|
|114151|     31|        4.0|
|114172|     31|        3.0|
|114282|     31|        3.0|
|114283|     31|        3.5|
|114299|     31|        4.0|
|114301|     31|        4.0|
|114352|     31|        3.5|
|114398|     31|        2.0|
|114406|     31|        3.0|
|114411|     31|        3.0|
|114449|     31|        5.0|
|114451|     31|        3.0|
|114493|     31|        0.5|
|114517|     31|        3.0|
|114550|     31|        3.0|
|114684|     31|        3.0|
|114688|     31|        3.0|
|114694|     31|        5.0|
|114812|     31|        4.0|
+------+-------+-----------+
only showing top 20 rows



                                                                                

In [87]:
# OUTER JOIN 사용 시, AVG(r.rating) 값이 없을 수도 있으므로 GroupBy를 해 주어야 함
df_13_2 = spark.sql(
    """
        SELECT
            r.userId,
            m.movieId,
            r.rating
        FROM
            movies m
        LEFT JOIN
            ratings r
        ON
            m.movieId = r.movieId
    """
)
df_13.show()

[Stage 652:>                                                        (0 + 1) / 1]

+------+-------+-----------+
|userId|movieId|avg(rating)|
+------+-------+-----------+
| 23231|     31|        4.0|
| 23250|     31|        4.0|
| 23302|     31|        5.0|
| 23323|     31|        3.5|
| 23329|     31|        3.0|
| 23331|     31|        3.0|
| 23366|     31|        4.0|
| 23404|     31|        3.0|
| 23480|     31|        3.0|
| 23511|     31|        4.0|
| 23535|     31|        4.0|
| 23537|     31|        4.5|
| 23543|     31|        3.0|
| 23544|     31|        4.0|
| 23561|     31|        3.0|
| 23567|     31|        3.5|
| 23571|     31|        3.0|
| 23595|     31|        5.0|
| 23699|     31|        4.0|
| 23768|     31|        4.5|
+------+-------+-----------+
only showing top 20 rows



                                                                                

In [88]:
spark.stop()