# PySpark Setup

In [None]:
# Install a headless Java 8 JDK (apt output is discarded); JAVA_HOME is pointed at it in the next cell
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Copy the Spark 3.1.1 / Hadoop 3.2 tarball into the working directory and unpack it.
# NOTE(review): the source path presumably lives on a mounted Google Drive — confirm the drive is mounted first
!cp drive/MyDrive/MMDS-data/spark-3.1.1-bin-hadoop3.2.tgz .
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# findspark is initialised in a later cell, before pyspark is imported
!pip install -q findspark

In [None]:
import os

# Tell Spark where the JDK and the unpacked Spark distribution live
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [None]:
import findspark
# Must run after SPARK_HOME is set (previous cell) and before the pyspark import in the next cell
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise start a new one named 'Task3'
spark = (
    SparkSession.builder
    .appName('Task3')
    .getOrCreate()
)

# Main

## Read data

In [None]:
# Ratings CSV, resolved relative to the current working directory
data_path = 'ratings2k.csv'

In [None]:
# Load the ratings file: the first line is treated as the header and column types are inferred
df = spark.read.csv(data_path, header=True, inferSchema=True)

## Required Libraries

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import SparseVector
from pyspark.ml.stat import Correlation
from pyspark.sql.types import DoubleType
from pyspark.sql import Window
import random

## Collaborative Filtering

In [None]:
class CollaborativeFiltering:
  """User-based collaborative filtering on a Spark item x user utility matrix.

  For a target user (given as a rating vector over the items) the N users
  whose rating columns have the highest Pearson correlation with the target
  are selected, and every item's rating is predicted as the correlation-weighted
  average of those users' ratings.

  Note: unrated cells are stored as 0.0 and take part in both the correlation
  and the weighted average, so a neighbour who did not rate an item pulls its
  prediction towards 0.
  """

  # A coefficient at or above this value is treated as "the target user is
  # already in the data". A small tolerance is required because the Pearson
  # coefficient of two identical columns can come out as 0.9999999999999998.
  _SELF_MATCH_THRESHOLD = 1.0 - 1e-9

  def __init__(self, N, dataframe):
    """Store the ratings and build the utility matrix.

    Args:
      N: number of most similar users used for a prediction.
      dataframe: Spark DataFrame with (at least) the columns
        'user', 'item' and 'rating'.
    """
    self.N = N # Number of similar users
    self.df = dataframe
    self.utility_matrix = self.to_utility_matrix(self.df)

  def to_utility_matrix(self, dataframe):
    """Pivot long-format ratings into an item x user matrix.

    Args:
      dataframe: Spark DataFrame with the columns 'user', 'item', 'rating'
        (any other column, e.g. 'index', is ignored).

    Returns:
      DataFrame sorted by 'item' with one row per item, the column 'item'
      first and one column per distinct user. Missing ratings are 0.0.
    """
    # Exclude column 'index'
    # Group by 'item' to prepare for pivoting
    # Pivot the DataFrame to turn unique users into columns and their ratings into values
    # Aggregate to handle multiple ratings from the same user for an item (if any);
    # F.first keeps an arbitrary one of the duplicates
    # Fill in missing values with 0 to indicate the absence of a rating
    um = dataframe.select('user', 'item', 'rating') \
                  .groupBy('item') \
                  .pivot('user') \
                  .agg(F.first('rating')) \
                  .na.fill(0.0) \
                  .sort('item')
    return um

  @staticmethod
  def _select_neighbours(coefficients, n):
    """Pick the n best-correlated users from the target's correlation row.

    Args:
      coefficients: Pearson coefficients between the target and each user;
        coefficients[i] belongs to the user at position i + 1 of the feature
        vector (position 0 is the target itself).
      n: maximum number of users to keep.

    Returns:
      List of (feature position, coefficient) pairs sorted by position, ready
      to be passed to SparseVector. NaN coefficients (a column with zero
      variance) are dropped. If the best coefficient is a perfect match, that
      single entry is skipped because it is assumed to be the target user.
    """
    import math

    ranked = sorted(
        ((position, coefficient)
         for position, coefficient in enumerate(coefficients, start=1)
         if not math.isnan(coefficient)),
        key=lambda pair: pair[1],
        reverse=True)

    # If the input user exists already, exclude it
    if ranked and ranked[0][1] >= CollaborativeFiltering._SELF_MATCH_THRESHOLD:
      ranked = ranked[1:]

    # SparseVector requires its indices in increasing order
    return sorted(ranked[:n], key=lambda pair: pair[0])

  def predict(self, user_vector, k):
    """Recommend the k items with the highest predicted rating.

    Args:
      user_vector: pyspark.ml.linalg vector with the target user's ratings.
        Entry i is matched with the i-th item of the utility matrix in
        ascending 'item' order; 0 means "not rated". If its size differs from
        the number of items, the inner join silently keeps only the positions
        that exist on both sides.
      k: number of items to return.

    Returns:
      DataFrame with the columns 'item' and 'predictions', sorted by
      'predictions' in descending order and limited to k rows. When no usable
      neighbour exists (all weights are zero) every prediction is 0.0.

    Relies on the notebook-level `spark` session.
    """
    # One row per item position; the 1-based key lines up with row_number() below
    target_rows = [(position, float(rating))
                   for position, rating in enumerate(user_vector.toArray(), start=1)]
    target_df = spark.createDataFrame(target_rows, ['key', 'target'])

    # Number the items in explicit 'item' order. A constant ordering key would
    # leave the row order (and therefore the alignment with the target vector)
    # up to Spark.
    keyed_matrix = self.utility_matrix \
                       .withColumn('key', F.row_number().over(Window.orderBy('item')))

    user_columns = [name for name in self.utility_matrix.columns if name != 'item']
    feature_columns = ['target'] + user_columns

    um = target_df.join(keyed_matrix, on='key', how='inner') \
                  .select('item', *feature_columns)

    # Merge user columns (include 'target') into a vector column
    um = VectorAssembler(inputCols=feature_columns,
                         outputCol='features').transform(um)

    # Find correlation matrix of the 'features' column
    correlation_matrix = Correlation.corr(um, 'features', 'pearson').head()[0].toArray()

    # Row 0 is 'target'; drop its self-correlation in column 0
    coefficients = [float(coefficient) for coefficient in correlation_matrix[0, 1:]]

    neighbours = self._select_neighbours(coefficients, self.N)

    # Weight vector over the feature positions; position 0 ('target') stays 0
    pearson_sparse_vector = SparseVector(len(feature_columns), neighbours)
    sum_of_coefficients = float(pearson_sparse_vector.norm(1))

    def predict_value(features):
      # No neighbour with a non-zero weight: nothing to average
      if sum_of_coefficients == 0.0:
        return 0.0
      return float(features.dot(pearson_sparse_vector)) / sum_of_coefficients

    predict_udf = F.udf(predict_value, DoubleType())

    res_df = um.withColumn('predictions', predict_udf(F.col('features'))) \
                .select('item', 'predictions') \
                .sort('predictions', ascending=False) \
                .limit(k)

    return res_df

## Testing

In [None]:
def generate(size=467, num_non_zero=25, seed=None):
  """Build a random sparse rating vector for a synthetic user.

  Args:
    size: length of the vector. NOTE(review): the default 467 presumably
      equals the number of items in the ratings file — confirm.
    num_non_zero: number of positions that receive a rating; random.sample
      raises ValueError if it exceeds `size`.
    seed: optional seed for a private random generator, which makes the
      vector reproducible. With None (the default) the module-level `random`
      state is used.

  Returns:
    SparseVector of length `size` whose `num_non_zero` ratings are drawn
    from 0.5, 1.0, ..., 5.0.
  """
  # A private generator keeps a seeded call from disturbing the global random state
  rng = random if seed is None else random.Random(seed)

  # List of possible values from 0.5 to 5.0 with a step of 0.5
  possible_values = [round(i * 0.5, 1) for i in range(1, 11)]  # [0.5, 1.0, ..., 5.0]

  # Randomly select indices for non-zero entries
  non_zero_indices = rng.sample(range(size), num_non_zero)

  # Assign random values to these indices from the possible_values list
  non_zero_values = [rng.choice(possible_values) for _ in range(num_non_zero)]

  # SparseVector accepts an {index: value} dictionary
  return SparseVector(size, dict(zip(non_zero_indices, non_zero_values)))

In [None]:
# Draw a random sparse rating vector for a hypothetical user.
# No seed is set, so the vector (and the predictions further down) differ on every run.
user_vector = generate()
print(user_vector)

(467,[6,17,25,37,75,82,90,96,107,121,159,186,253,265,317,319,322,329,360,368,376,380,421,449,466],[4.0,3.5,1.0,3.5,2.0,3.0,1.0,5.0,1.5,2.0,3.0,1.5,5.0,1.0,2.5,4.5,4.5,5.0,5.0,3.0,3.0,3.0,2.5,1.5,1.5])


In [None]:
# Build the recommender with N = 5 similar users; the utility matrix is derived from `df` in the constructor
cf = CollaborativeFiltering(5, df)

In [None]:
# Show the 8 items with the highest predicted rating for the random user
cf.predict(user_vector, 8).show()

+----+------------------+
|item|       predictions|
+----+------------------+
| 144| 3.178234998692067|
| 199| 3.107416922111042|
| 288| 2.867221354370774|
| 322|2.8243941327437434|
| 176| 2.604340838725143|
| 216|2.3222602467952767|
|  36| 2.135240187352537|
| 440|1.9354102995159885|
+----+------------------+

