In [None]:
#@title import packages

import pandas as pd
import numpy as np
import random

In [None]:
#@title generate test cases
def generate_data(n_users: int, n_movies: int):
  """
  Build a random rating matrix for the Movie Suggestion problem.

  Input:
  - n_users: int, how many users (rows) the matrix has
  - n_movies: int, how many movies (columns) the matrix has

  Return:
  - A float numpy array of shape (n_users, n_movies).

  Raises:
  - TypeError if n_users or n_movies is not an int.

  Note:
  - Entry [i][j] is the rating user i gave to movie j.
  - Every entry is drawn with random.choice from the whole numbers
  1 through 10 (both included) plus np.nan, where np.nan stands for
  "this user has not watched this movie".
  """
  if not isinstance(n_users, int):
    raise TypeError("The input for n_users must be an integer")
  if not isinstance(n_movies, int):
    raise TypeError("The input for n_movies must be an integer")

  # possible cell values: "not watched" (nan) followed by the ratings 1..10
  rating_pool = [np.nan] + list(range(1, 11))
  ratings = np.empty((n_users, n_movies))

  # np.ndindex visits the cells row by row (last index fastest), i.e. in the
  # same order as a nested "for user / for movie" loop
  for cell in np.ndindex(*ratings.shape):
    ratings[cell] = random.choice(rating_pool)

  return ratings

In [None]:
# Build a sample 10-user x 10-movie rating matrix. No random seed is set in
# the visible cells, so the values differ from run to run. The bare name on
# the last line makes the notebook cell display the matrix.
test_matrix = generate_data(n_users=10, n_movies=10)
test_matrix

array([[ 7.,  2.,  7.,  3.,  1.,  1.,  8., nan,  7.,  9.],
       [10.,  8., nan, nan,  5.,  4.,  1., nan,  5.,  5.],
       [ 5., nan,  7., nan,  9.,  9.,  7., 10.,  1.,  3.],
       [ 4., 10., 10.,  6.,  4.,  1.,  7.,  9., 10.,  3.],
       [ 5.,  7.,  5.,  4.,  1.,  8.,  9.,  4.,  2., 10.],
       [ 6.,  4.,  5.,  9.,  4.,  1., nan, 10.,  1.,  8.],
       [ 7., 10.,  3.,  1.,  7.,  9.,  7.,  8., 10.,  1.],
       [ 9.,  5.,  8.,  5.,  7.,  8.,  3.,  2.,  2.,  6.],
       [ 5.,  9.,  2., nan,  2.,  1.,  6.,  3.,  5.,  8.],
       [ 1.,  1.,  7.,  2.,  1.,  7.,  1.,  9.,  4.,  2.]])

In [None]:
#@title generate subset of given size

def get_subsets(arr):
  '''
  This function generates all subsets (the power set) of a given sequence.

  Input:
    arr:  finite iterable to generate subsets from (list, tuple, range, ...).
          It is not modified.

  Output:
    a list of 2**len(arr) lists, one per subset. Subsets that contain the
    first item come first, and the same rule applies recursively to the
    remaining items, so the empty subset is always last.
    Example: get_subsets([1, 2]) -> [[1, 2], [1], [2], []]
  '''
  subsets = [[]]
  # Walk the items back to front: at each step every subset built so far is
  # emitted once with the current item prepended and once without it. This
  # reproduces the order of the classic "first item + subsets of the rest"
  # recursion without needing an `arr == []` base case (which only matches
  # real lists) and without hitting the recursion limit.
  for item in reversed(list(arr)):
    subsets = [[item] + tail for tail in subsets] + subsets
  return subsets

# size-k subsets, produced directly instead of filtering the full power set
def get_subset_of_size(arr, k):
  '''
  This function generates all subsets of a given size of an input array.

  Input:
    arr:  input array (any finite iterable) to generate subsets from
    k:    the size of the subsets (integer)

  Output:
    a list of all subsets of size k, each subset being a list. Items keep
    their original relative order inside a subset, and subsets are listed in
    lexicographic order of the chosen positions, e.g.
    get_subset_of_size([0, 1, 2], 2) -> [[0, 1], [0, 2], [1, 2]].
    Returns [] when k is negative or larger than len(arr), and [[]] when
    k is 0.
  '''
  # local import: itertools is not imported at the top of the file
  from itertools import combinations

  # combinations() raises ValueError for a negative size; there are simply
  # no subsets of negative size, so report an empty result instead
  if k < 0:
    return []

  # combinations() yields only the C(n, k) subsets of the requested size, so
  # the 2**n power set is never materialised
  return [list(combo) for combo in combinations(arr, k)]

In [None]:
#@title checking pairwise correlation
def check_correlation(matrix, sigma):
  '''
  This function tells whether every pair of rows of a matrix is correlated
  at least as strongly as a given threshold.

  The coefficients come from pandas DataFrame.corr() with its default
  settings, applied to the transposed matrix so that rows (not columns) are
  compared with each other.

  The matrix is expected to be free of np.nan. This function does not check
  for np.nan itself; if some are present the coefficients pandas computes
  may be inaccurate.

  Input:
    matrix: 2-D input, one row per series to correlate, ideally without
            np.nan
    sigma:  lowest acceptable pairwise correlation coefficient

  Output:
    True if every entry of the row-by-row correlation matrix is >= sigma.
    False otherwise. An entry that is itself nan never satisfies >= sigma,
    so it makes the result False.
  '''
  # transpose first: DataFrame.corr() correlates columns, we want rows
  rows_as_columns = pd.DataFrame(matrix).T
  pairwise = rows_as_columns.corr().to_numpy()
  return np.all(pairwise >= sigma)

In [None]:
#@title Main naive algorithm
def naive_movie_suggestion(matrix, m0=1, sig=0):
  '''
  This is the naive (brute-force) approach to solve the movie
  recommendation problem.

  Every combination of at least m0 columns (movies) is paired with every
  combination of at least 2 rows (users). A candidate submatrix is accepted
  when it contains no np.nan and check_correlation() reports that all
  pairwise row correlations are >= sig. The accepted candidate with the most
  cells wins; on ties the one met first in enumeration order (fewer columns
  first, then fewer rows) is kept.

  The number of candidates grows exponentially with both dimensions, so this
  is only usable on small matrices.

  Input:
    matrix: input matrix with rows representing users ratings, and columns
            representing movies (2-D numpy array, or anything np.asarray
            turns into one)
    m0:     minimum number of movies, default m0=1
    sig:    minimum pairwise correlation coefficient value, as computed by
            check_correlation(), default sig=0

  Output:
    a tuple (size, submatrix):
      size:      number of cells (rows * columns) of the best submatrix as a
                 Python int, 0 if there is none
      submatrix: the best submatrix as a numpy array (a copy, not a view of
                 the input), or [] if no submatrix qualifies

  Raises:
    ValueError if matrix is non-empty but not two-dimensional.
  '''
  matrix = np.asarray(matrix)  # also accept plain nested lists
  if matrix.size == 0:
    # nothing to choose from
    return 0, []
  if matrix.ndim != 2:
    raise ValueError("matrix must be two-dimensional (users x movies)")

  n_users, n_movies = matrix.shape
  column_idx = [*range(n_movies)]
  row_idx = [*range(n_users)]

  # Row combinations (at least 2 rows) do not depend on the chosen columns,
  # so they are enumerated once instead of once per column combination.
  row_combos = [
      row_combo
      for row_size in range(2, n_users + 1)
      for row_combo in get_subset_of_size(row_idx, row_size)
  ]

  largest = []
  max_size = 0

  # Candidates are examined one at a time, so memory stays bounded by the
  # list of row combinations instead of by the list of all submatrices.
  for col_size in range(m0, n_movies + 1):
    for col_combo in get_subset_of_size(column_idx, col_size):
      by_columns = matrix[:, col_combo]

      for row_combo in row_combos:
        candidate_size = len(row_combo) * len(col_combo)
        # Only a strictly larger candidate can replace the current best, so
        # anything else is skipped before the costly correlation check.
        if candidate_size <= max_size:
          continue

        # indexing with a list of positions yields a fresh array, not a view
        candidate = by_columns[row_combo, :]

        # users who have not rated all of the chosen movies are not allowed
        if np.isnan(candidate).any():
          continue

        if check_correlation(candidate, sig):
          max_size = candidate_size
          largest = candidate

  return max_size, largest

In [None]:
# Run the brute-force search on the sample matrix with m0=1 (at least one
# movie) and sig=0 (every pair of users correlated at >= 0). The cell
# displays the returned (size, submatrix) tuple.
naive_movie_suggestion(test_matrix, 1, 0)

(30, array([[ 4., 10., 10.,  6.,  4.,  1.,  7.,  9., 10.,  3.],
        [ 7., 10.,  3.,  1.,  7.,  9.,  7.,  8., 10.,  1.],
        [ 1.,  1.,  7.,  2.,  1.,  7.,  1.,  9.,  4.,  2.]]))