In [1]:
import unittest

class TestLoadMovies(unittest.TestCase):
  """Test load_movies notebook"""
  
  # ---------------------
  # Define test case constants

  movies_path = "dbfs:/FileStore/movielens/test-data/movies.json"
  ratings_path = "dbfs:/FileStore/movielens/test-data/ratings.json"
  target_table = "test_load_movies"
  
  # ---------------------
  # Define input fixtures
  
  # Movies with zero, one and many ratings.
  fixture_movies_json = """
        {"movieId": 9999, "title": "Unknow within", "genres": "Mistery|Thriller"}
        {"movieId": 6548, "title": "Bad Boys II (2003)", "genres": "Action|Comedy|Crime|Thriller"}
        {"movieId": 145, "title": "Bad Boys (1995)", "genres": "Action|Comedy|Crime|Drama|Thriller"}
        """
  # Three ratings with corresponding movies and one rating without movie
  fixture_ratings_json = """
        {"userId": 6, "movieId": 145, "rating": 4.0, "timestamp": 845553966}
        {"userId": 21, "movieId": 145, "rating": 3.0, "timestamp": 1376823316}
        {"userId": 18, "movieId": 6548, "rating": 2.0, "timestamp": 1455050951}
        {"userId": 92, "movieId": 999999, "rating": 5.0, "timestamp": 1455053951}
        """
  # ---------------------
  # Define helpers

  def get_loaded_movies(self, movies_table):
    "Load content of the movies table"
    spark.catalog.refreshTable(movies_table)
    movies_df = table(movies_table).cache()
    return movies_df
  
  # The setUp method is called before each test.
  # Use it to setup sandbox.
  def setUp(self):
    spark.sql(f"DROP TABLE IF EXISTS {self.target_table}")
    
  # Define helpers for setting up test data 
  def given_test_data(self):
    "Prepare test data for load_movies notebook"
    dbutils.fs.put(self.movies_path, self.fixture_movies_json, overwrite=True)
    dbutils.fs.put(self.ratings_path, self.fixture_ratings_json, overwrite=True)

  # Stimulus (actions) can be defined as methods.
  # This is the place where we run the notebook.
  def when_load_movies(self):
    "Run the load_movies notebook"
    dbutils.notebook.run("load_movies", timeout_seconds=180, arguments={
       "movies_data": self.movies_path,
       "ratings_data": self.ratings_path,
       "target_table": self.target_table,
    })
    
  # All methods starting with "test" are tests, executed by unittest.
  # use the test method to build your test scenario:
  #   GIVEN ... (setup)
  #   WHEN  ... (action or stimulus)
  #   THEN  ... (validation or assert)
  def test_movies_loaded(self):
    "Happy path scenario for load_movies notebook"
    
    # GIVEN test data is present 
    
    # WHEN load_movies notebook is executed
    self.when_load_movies()
    
    # THEN ...
    actual_df = self.get_loaded_movies(self.target_table)
    self.assertEqual(actual_df.count(), 4, 'Each movie has a record')
    
    self.assertEqual(actual_df.where("movieId = 6548").count(), 1, "movie with one ratings")
    self.assertEqual(actual_df.where("movieId = 145").count(), 1, "movie with two ratings")
    self.assertEqual(actual_df.where("movieId = 9999").count(), 1, "movie without ratings")
    self.assertEqual(actual_df.where("movieId IS NULL").count(), 0, "rating mapped to zero movies should fill movieId")
    self.assertEqual(actual_df.where("title IS NULL").count(), 1, "rating mapped to zero movies should not fill title")

    self.assertEqual(actual_df.where("movieId = 145").select("num_ratings").collect()[0][0], 2, "num_ratings for movie with two ratings")
    self.assertEqual(actual_df.where("movieId = 145").select("min_rating").collect()[0][0], 3, "min_rating for movie with one two ratings")
    self.assertEqual(actual_df.where("movieId = 145").select("max_rating").collect()[0][0], 4, "max_rating for movie with one two ratings")
    self.assertEqual(actual_df.where("movieId = 145").select("average_rating").collect()[0][0], 3.5, "average_rating for movie with one two ratings is rounded")

    self.assertEqual(actual_df.where("movieId = 6548").select("num_ratings").collect()[0][0], 1, "num_ratings for movie with one ratings")
    self.assertEqual(actual_df.where("movieId = 6548").select("min_rating").collect()[0][0], 2, "min_rating for movie with one ratings")
    self.assertEqual(actual_df.where("movieId = 6548").select("max_rating").collect()[0][0], 2, "max_rating for movie with one ratings")
    self.assertEqual(actual_df.where("movieId = 6548").select("average_rating").collect()[0][0], 2, "average_rating for movie with one")

    self.assertEqual(actual_df.where("movieId = 9999").select("num_ratings").collect()[0][0], 0, "num_ratings for movie with zero ratings")
    self.assertEqual(actual_df.where("movieId = 9999").select("min_rating").collect()[0][0], None, "min_rating for movie with zero ratings")
    self.assertEqual(actual_df.where("movieId = 9999").select("max_rating").collect()[0][0], None, "max_rating for movie with zero ratings")
    self.assertEqual(actual_df.where("movieId = 9999").select("average_rating").collect()[0][0], None, "average_rating for movie with zero ratings")

    self.assertEqual(actual_df.where("movieId = 999999").select("num_ratings").collect()[0][0], 1, "num_ratings for rating without movie")
    self.assertEqual(actual_df.where("movieId = 999999").select("min_rating").collect()[0][0], 5, "min_rating for rating without movie")
    self.assertEqual(actual_df.where("movieId = 999999").select("max_rating").collect()[0][0], 5, "max_rating for rating without movie")
    self.assertEqual(actual_df.where("movieId = 999999").select("average_rating").collect()[0][0], 5, "average_rating for rating without movie")


In [2]:
# Run every test method of TestLoadMovies and report each one by name
# (verbosity=2 makes the runner print one line per test).
runner = unittest.TextTestRunner(verbosity=2)
suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestLoadMovies)
runner.run(suite)