Data from here https://grouplens.org/datasets/movielens/

In [1]:
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

In [2]:
val genres_df = spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", true)
    .load("/home/jovyan/data/ml-20m/movies.csv")

genres_df = [movieId: int, title: string ... 1 more field]


[movieId: int, title: string ... 1 more field]

In [3]:
genres_df.show(3)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



In [4]:
def genre_to_lowercase(genres: String): Array[String] = genres.toLowerCase().split("\\|")

// we use the method name followed by a "_" to indicate we want a reference
// to the method, not call it
val genre_words_to_lowercaseUdf = udf(genre_to_lowercase _)

val genres_df1 = genres_df.withColumn("genre_words_lc", genre_words_to_lowercaseUdf('genres))

genre_words_to_lowercaseUdf = UserDefinedFunction(<function1>,ArrayType(StringType,true),Some(List(StringType)))
genres_df1 = [movieId: int, title: string ... 2 more fields]


genre_to_lowercase: (genres: String)Array[String]


[movieId: int, title: string ... 2 more fields]

In [5]:
genres_df1.show(3)

+-------+--------------------+--------------------+--------------------+
|movieId|               title|              genres|      genre_words_lc|
+-------+--------------------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|[adventure, anima...|
|      2|      Jumanji (1995)|Adventure|Childre...|[adventure, child...|
|      3|Grumpier Old Men ...|      Comedy|Romance|   [comedy, romance]|
+-------+--------------------+--------------------+--------------------+
only showing top 3 rows



We will think in terms of NLP where we will create a vector of the movie titles. This vector is basically a mapping of the movies to the genre vector.

In [6]:
val all_words = genres_df1.select(explode(genres_df1("genre_words_lc")))
val distinct_words = all_words.filter(_ != "").select(all_words("col")).distinct
val total_distinct_words = distinct_words.count.toInt

all_words = [col: string]
distinct_words = [col: string]
total_distinct_words = 20


20

In [7]:
distinct_words.take(5)

0
crime
imax
fantasy
documentary
action


In [8]:
// fit a CountVectorizerModel from the corpus
val cvModel: CountVectorizerModel = new CountVectorizer()
  .setInputCol("genre_words_lc")
  .setOutputCol("features")
  .setVocabSize(total_distinct_words)
  .setMinDF(2)
  .fit(genres_df1)

cvModel = cntVec_41f781ad9121


cntVec_41f781ad9121

In [9]:
val genres_df2 = cvModel.transform(genres_df1)

genres_df2 = [movieId: int, title: string ... 3 more fields]


[movieId: int, title: string ... 3 more fields]

In [10]:
genres_df2.show(3)

+-------+--------------------+--------------------+--------------------+--------------------+
|movieId|               title|              genres|      genre_words_lc|            features|
+-------+--------------------+--------------------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|[adventure, anima...|(20,[1,8,11,13,15...|
|      2|      Jumanji (1995)|Adventure|Childre...|[adventure, child...|(20,[8,11,13],[1....|
|      3|Grumpier Old Men ...|      Comedy|Romance|   [comedy, romance]|(20,[1,3],[1.0,1.0])|
+-------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [43]:
distinct_words.collect.map(_.toSeq)

[WrappedArray(crime), WrappedArray(imax), WrappedArray(fantasy), WrappedArray(documentary), WrappedArray(action), WrappedArray(animation), WrappedArray(mystery), WrappedArray(horror), WrappedArray(film-noir), WrappedArray(musical), WrappedArray(adventure), WrappedArray(drama), WrappedArray((no genres listed)), WrappedArray(western), WrappedArray(children), WrappedArray(war), WrappedArray(romance), WrappedArray(thriller), WrappedArray(sci-fi), WrappedArray(comedy)]

In [42]:
val oneHotDf = distinct_words.collect.map(_.toSeq)
    .foldLeft(genres_df2)(
        (genres_df2, category) => 
            category
//             genres_df2.withColumn("genres_" + category, when(instr(lower(col("genres")), category) === 0, 0).otherwise(1))
    )

Name: Unknown Error
Message: <console>:51: error: type mismatch;
 found   : Seq[Any]
 required: org.apache.spark.sql.DataFrame
    (which expands to)  org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
                   category
                   ^

StackTrace: 

In [None]:
def check_presence(genres: Array[String], category: String):  = genres.toLowerCase().split("\\|")

// we use the method name followed by a "_" to indicate we want a reference
// to the method, not call it
val genre_words_to_lowercaseUdf = udf(genre_to_lowercase _)

val genres_df1 = genres_df.withColumn("genre_words_lc", genre_words_to_lowercaseUdf('genres))

In [30]:
val category = "adventure"
genres_df2.withColumn("genres_" + category, instr(lower(col("genres")), category))
    .select("genres", "genres_adventure")
    .show(10)

+--------------------+----------------+
|              genres|genres_adventure|
+--------------------+----------------+
|Adventure|Animati...|               1|
|Adventure|Childre...|               1|
|      Comedy|Romance|               0|
|Comedy|Drama|Romance|               0|
|              Comedy|               0|
|Action|Crime|Thri...|               0|
|      Comedy|Romance|               0|
|  Adventure|Children|               1|
|              Action|               0|
|Action|Adventure|...|               8|
+--------------------+----------------+
only showing top 10 rows



category = adventure


adventure

In [32]:
val category = "adventure"
genres_df2.withColumn("genres_" + category, when(instr(lower(col("genres")), category) === 0, 0).otherwise(1)).show(10)

+-------+--------------------+--------------------+--------------------+--------------------+----------------+
|movieId|               title|              genres|      genre_words_lc|            features|genres_adventure|
+-------+--------------------+--------------------+--------------------+--------------------+----------------+
|      1|    Toy Story (1995)|Adventure|Animati...|[adventure, anima...|(20,[1,8,11,13,15...|               1|
|      2|      Jumanji (1995)|Adventure|Childre...|[adventure, child...|(20,[8,11,13],[1....|               1|
|      3|Grumpier Old Men ...|      Comedy|Romance|   [comedy, romance]|(20,[1,3],[1.0,1.0])|               0|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|[comedy, drama, r...|(20,[0,1,3],[1.0,...|               0|
|      5|Father of the Bri...|              Comedy|            [comedy]|      (20,[1],[1.0])|               0|
|      6|         Heat (1995)|Action|Crime|Thri...|[action, crime, t...|(20,[2,4,5],[1.0,...|               0|
|

category = adventure


adventure

In [45]:
val prices = List(1.5, 2.0, 2.5)
val sum = prices.foldLeft(0.0)(_ + _)

prices = List(1.5, 2.0, 2.5)
sum = 6.0


6.0

In [46]:
val categories = List("adventure", "animation")
val oneHotDf = categories
    .foldLeft(genres_df2)(
        (genres_df2, category) => 
//             category
            genres_df2.withColumn("genres_" + category, when(instr(lower(col("genres")), category) === 0, 0).otherwise(1))
    )

categories = List(adventure, animation)
oneHotDf = [movieId: int, title: string ... 5 more fields]


[movieId: int, title: string ... 5 more fields]

In [47]:
oneHotDf.show(3)

+-------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+
|movieId|               title|              genres|      genre_words_lc|            features|genres_adventure|genres_animation|
+-------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+
|      1|    Toy Story (1995)|Adventure|Animati...|[adventure, anima...|(20,[1,8,11,13,15...|               1|               1|
|      2|      Jumanji (1995)|Adventure|Childre...|[adventure, child...|(20,[8,11,13],[1....|               1|               0|
|      3|Grumpier Old Men ...|      Comedy|Romance|   [comedy, romance]|(20,[1,3],[1.0,1.0])|               0|               0|
+-------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+
only showing top 3 rows



In [58]:
// distinct_words.select("col")
// //     .filter(r => r(0).asInstanceOf[String])
//     .rdd.map(r => r(0).asInstanceOf[String]).collect()
val distinct_words_list = distinct_words.select("col")
    .map(r => r.getString(0))
    .filter(r => r != "(no genres listed)")
    .collect.toList

distinct_words_list = List(crime, imax, fantasy, documentary, action, animation, mystery, horror, film-noir, musical, adventure, drama, western, children, war, romance, thriller, sci-fi, comedy)


List(crime, imax, fantasy, documentary, action, animation, mystery, horror, film-noir, musical, adventure, drama, western, children, war, romance, thriller, sci-fi, comedy)

In [59]:
val categories = List("adventure", "animation")
val oneHotDf = distinct_words_list
    .foldLeft(genres_df2)(
        (genres_df2, category) => 
//             category
            genres_df2.withColumn("genres_" + category, when(instr(lower(col("genres")), category) === 0, 0).otherwise(1))
    )

categories = List(adventure, animation)
oneHotDf = [movieId: int, title: string ... 22 more fields]


[movieId: int, title: string ... 22 more fields]

In [60]:
oneHotDf.show(3)

+-------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------+------------------+-------------+----------------+--------------+-------------+----------------+--------------+----------------+------------+--------------+---------------+----------+--------------+---------------+-------------+-------------+
|movieId|               title|              genres|      genre_words_lc|            features|genres_crime|genres_imax|genres_fantasy|genres_documentary|genres_action|genres_animation|genres_mystery|genres_horror|genres_film-noir|genres_musical|genres_adventure|genres_drama|genres_western|genres_children|genres_war|genres_romance|genres_thriller|genres_sci-fi|genres_comedy|
+-------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------+------------------+-------------+----------------+--------------+-------------+----------------+--------------+----