In [None]:
import pandas as pd
import numpy as np
import random

### Absolutely random
Good for generating dense matrices.

In [None]:
def generate_data(n_users:int, n_movies:int, nan=True):
  """
  Build a random user-by-movie rating matrix for the Movie Suggestion problem.

  Input:
    n_users: int, the number of users (rows of the result)
    n_movies: int, the number of movies (columns of the result)
    nan: when truthy (the default), NaN is one of the equally likely
      outcomes for every cell, next to the ratings 1..10; when falsy
      every cell holds a rating.

  Return:
    A float numpy array of shape (n_users, n_movies). Entry [i][j] is the
    rating user i gave movie j: a whole number from 1 to 10 (both
    inclusive), or NaN when the user has not watched that movie.

  Raises:
    TypeError: if n_users or n_movies is not an int.

  Note:
    Values are drawn with the standard `random` module, one cell at a time
    in row-major order, so seeding `random` makes the output reproducible.
  """
  if not isinstance(n_users, int):
    raise TypeError("The input for n_users must be an integer")
  if not isinstance(n_movies, int):
    raise TypeError("The input for n_movies must be an integer")

  # Pool of equally likely outcomes; NaN goes first when it is allowed.
  outcomes = list(range(1, 11))
  if nan:
    outcomes.insert(0, np.nan)

  ratings = np.empty((n_users, n_movies))
  # Each `row` is a view into `ratings`, so the slice assignment fills the
  # matrix in place, one user at a time.
  for row in ratings:
    row[:] = [random.choice(outcomes) for _ in range(n_movies)]

  return ratings


In [None]:
# Demo: a fully rated (NaN-free) matrix for 7 users and 5 movies.
generate_data(n_users=7, n_movies=5, nan=False)

array([[ 2.,  2.,  9.,  1.,  6.],
       [ 2.,  7.,  8.,  6.,  1.],
       [ 5., 10.,  2.,  2.,  6.],
       [ 3.,  2.,  5.,  8.,  1.],
       [ 9.,  9.,  4.,  9.,  2.],
       [10.,  3.,  9.,  1.,  6.],
       [ 4., 10.,  2.,  8., 10.]])

### Generate matrix with specific density
Good for generating sparse matrices. Using this function to generate a dense matrix is very time-consuming.

In [None]:
def generate_with_density(n_users: int, n_movies: int, density=0.5):
  '''
  Generate a random rating matrix of a given shape and density.

  A matrix with a density of 30% has 30% numerical entries (ratings) and
  70% NaN values.

  Input:
    n_users, n_movies: int
      the shape of the matrix, n_users rows x n_movies columns

    density (optional, default is 0.5): float
      must be between 0 and 1 (inclusive). A density of 0.3 means 30% of
      the matrix holds a rating; a density of 1 means the matrix has no
      NaN value at all.

  Output:
    A tuple (matrix, actual_density):
      matrix: float numpy array of shape (n_users, n_movies) in which
        round(density * n_users * n_movies) distinct cells hold a whole
        rating from 1 to 10 and every other cell is NaN.
      actual_density: the fraction of non-NaN cells in `matrix`, rounded to
        3 decimals. It can differ from `density` only because the number
        of filled cells has to be a whole number. It is 0.0 for a matrix
        with no cells.

  Raises:
    TypeError: if density is not within [0, 1] (NaN included). The
      exception type is kept for backward compatibility.
    ValueError: if n_users or n_movies is negative.
  '''
  # `not (0 <= x <= 1)` also rejects NaN, which slips past `x < 0 or x > 1`.
  if not (0 <= density <= 1):
    raise TypeError("Density must be a floating point value from 0 to 1 (inclusive)")
  if n_users < 0 or n_movies < 0:
    raise ValueError("n_users and n_movies must be non-negative")

  total_cells = n_users * n_movies

  # number of non-NaN values
  number_entry = int(round(density * total_cells))

  # start from an all-NaN matrix of the requested shape
  matrix = np.full((n_users, n_movies), np.nan)

  # Pick distinct cells (sampling without replacement): drawing cells
  # independently would hit the same cell repeatedly and leave the matrix
  # noticeably sparser than requested, especially at high densities.
  ratings = [*range(1, 11)]
  for flat_idx in random.sample(range(total_cells), number_entry):
    # flat index -> (row, col) in row-major order
    matrix[divmod(flat_idx, n_movies)] = random.choice(ratings)

  # measure the density that was actually achieved
  num_number = int(np.count_nonzero(~np.isnan(matrix)))
  # an empty matrix has no cells to fill; report 0.0 instead of dividing by 0
  actual_density = round(num_number / total_cells, 3) if total_cells else 0.0

  return matrix, actual_density