In [None]:
!pip install pyspark
!pip install implicit
!pip install -U cupy-cuda11x
!pip install lightfm
!pip install implicit lightfm



In [46]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from typing import Tuple, List
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import implicit
from pathlib import Path
from implicit.nearest_neighbours import bm25_weight
from sklearn.model_selection import train_test_split
from implicit.evaluation import precision_at_k
from lightfm import LightFM
from lightfm.evaluation import precision_at_k as lightfm_precision_at_k

# Initialize Spark session
spark = SparkSession.builder.master("local").appName("CollaborativeFiltering").getOrCreate()

class CollaborativeFiltering:
    def __init__(self, user_artists_file, artists_file):
        self.user_artists_file = user_artists_file
        self.artists_file = artists_file
        self.user_artists = None
        self.artist_names = None
        self.load_data()

    def load_data(self):
        # Read CSV files with Spark
        user_artists_df = spark.read.csv(self.user_artists_file, header=True, inferSchema=True)
        artists_df = spark.read.csv(self.artists_file, header=True, inferSchema=True)

        # Convert Spark DataFrames to Pandas DataFrames
        user_artists_df = user_artists_df.toPandas()
        artists_df = artists_df.toPandas()

        # Process data as before
        self.user_artists = self.create_user_artists_matrix(user_artists_df)
        self.artist_names = dict(zip(artists_df['id'], artists_df['name']))

        # Export user_artist_matrix to DataFrame
        self.user_artist_df = pd.DataFrame(self.user_artists.toarray(), columns=user_artists_df['artistID'].unique(), index=user_artists_df['userID'].unique())

    def create_user_artists_matrix(self, user_artists_df):
        # Process data as before
        unique_users = user_artists_df['userID'].unique()
        unique_artists = user_artists_df['artistID'].unique()

        user_artist_matrix = np.zeros((len(unique_users), len(unique_artists)))

        for _, row in user_artists_df.iterrows():
            user_idx = np.where(unique_users == row['userID'])[0][0]
            artist_idx = np.where(unique_artists == row['artistID'])[0][0]
            user_artist_matrix[user_idx, artist_idx] = row['weight']

        return csr_matrix(user_artist_matrix)

    def get_artist_name_from_id(self, artist_id):
        return self.artist_names.get(artist_id, "Unknown")

class ImplicitRecommender:
    def __init__(self, collaborative_filtering: CollaborativeFiltering, implicit_model: implicit.recommender_base.RecommenderBase):
        self.collaborative_filtering = collaborative_filtering
        self.implicit_model = implicit_model

    def fit(self, user_artists_matrix: csr_matrix) -> None:
        self.implicit_model.fit(user_artists_matrix)

    def recommend(self, user_id: int, user_artists_matrix: csr_matrix, n: int = 5) -> Tuple[List[str], List[float]]:
        artist_ids, scores = self.implicit_model.recommend(user_id, user_artists_matrix.getrow(user_id), N=n)

        # Normalize scores between 0 and 1 using min-max scaling
        min_score, max_score = min(scores), max(scores)
        normalized_scores = [(score - min_score) / (max_score - min_score) for score in scores]

        artists = [self.collaborative_filtering.get_artist_name_from_id(artist_id) for artist_id in artist_ids]
        return artists, normalized_scores

class LightFMRecommender:
    def __init__(self, collaborative_filtering: CollaborativeFiltering, lightfm_model: LightFM):
        self.collaborative_filtering = collaborative_filtering
        self.lightfm_model = lightfm_model

    def fit(self, user_artists_matrix: csr_matrix, epochs: int = 10, learning_rate: float = 0.05, no_components: int = 20, item_alpha: float = 0.1, user_alpha: float = 0.1) -> None:
        # Convert the user-artist matrix to a LightFM dataset
        dataset = LightFM.Dataset()
        dataset.fit(np.arange(user_artists_matrix.shape[0]), np.arange(user_artists_matrix.shape[1]))
        interactions, weights = dataset.build_interactions(user_artists_matrix.T)

        # Fit the LightFM model
        self.lightfm_model.fit(interactions, epochs=epochs, learning_rate=learning_rate, no_components=no_components, item_alpha=item_alpha, user_alpha=user_alpha)

    def recommend(self, user_id: int, user_artists_matrix: csr_matrix, n: int = 5) -> Tuple[List[str], List[float]]:
        # Get recommendations using LightFM model
        lightfm_artists = [self.collaborative_filtering.get_artist_name_from_id(artist) for artist in self.lightfm_model.predict(user_id, np.arange(user_artists_matrix.shape[1]))]

        # Get the ground truth (true positive items)
        true_positives = set(np.where(user_artists_matrix[user_id, :].toarray()[0] > 0)[0])

        # Calculate precision at k
        precision = len(set(lightfm_artists[:n]) & true_positives) / n

        return lightfm_artists[:n], precision

def replace_unknown_with_artist_names(recommendations, collaborative_filtering):
    return [collaborative_filtering.get_artist_name_from_id(artist_id) if artist_id != "Unknown" else "Unknown" for artist_id in recommendations]

def load_and_process_data(user_artists_file, artists_file):
    # Load data
    collaborative_filtering = CollaborativeFiltering(
        user_artists_file=user_artists_file,
        artists_file=artists_file
    )

    # Load user artists data
    user_artists_df = pd.read_csv(collaborative_filtering.user_artists_file)

    # Process data as before
    user_artists_matrix = collaborative_filtering.create_user_artists_matrix(user_artists_df)

    # Use BM25 weighting to normalize the user-artist matrix
    normalized_train_matrix = bm25_weight(user_artists_matrix)

    return collaborative_filtering, user_artists_matrix, normalized_train_matrix

def run_implicit_recommender(collaborative_filtering, normalized_train_matrix, user_id_to_recommend, factors=30, iterations=30, regularization=0.1):
    # Instantiate ALS using implicit
    implicit_model = implicit.als.AlternatingLeastSquares(factors=factors, iterations=iterations, regularization=regularization)

    # Instantiate recommender, fit, and recommend
    recommender = ImplicitRecommender(collaborative_filtering, implicit_model)
    recommender.fit(normalized_train_matrix)

    # Recommend for user
    artists, scores = recommender.recommend(user_id_to_recommend, normalized_train_matrix, n=10)

    # Print results
    print(f"Top {len(artists)} Recommendations for User {user_id_to_recommend}:")
    for rank, (artist, score) in enumerate(zip(artists, scores), start=1):
        print(f"{rank}. {artist}: {score}")

def run_lightfm_recommender(collaborative_filtering, normalized_train_matrix, user_id_to_recommend, epochs=30, learning_rate=0.05, no_components=30, item_alpha=0.1, user_alpha=0.1):
    # Instantiate LightFM model
    lightfm_model = LightFM(loss='warp', learning_rate=learning_rate, no_components=no_components, random_state=42)

    # Convert the user-artist matrix to a LightFM interactions matrix
    lightfm_interactions = csr_matrix(normalized_train_matrix.T)

    # Fit the LightFM model
    lightfm_model.fit(lightfm_interactions, epochs=epochs, num_threads=2)

    # Get recommendations using LightFM model
    num_features_to_use = min(normalized_train_matrix.shape[0], normalized_train_matrix.shape[1])
    lightfm_artists = [collaborative_filtering.get_artist_name_from_id(artist) for artist in lightfm_model.predict(user_id_to_recommend, np.arange(num_features_to_use), num_threads=2)]
    lightfm_scores = lightfm_model.predict(user_id_to_recommend, np.arange(num_features_to_use), num_threads=2)

    # Print top 10 recommendations
    print(f"Top 10 Recommendations for User {user_id_to_recommend}:")
    for rank, (artist, score) in enumerate(zip(lightfm_artists[:10], lightfm_scores[:10]), start=1):
        print(f"{rank}. {artist}: {score}")

    # Calculate precision at k for LightFM
    precision = lightfm_precision_at_k(lightfm_model, lightfm_interactions, k=10, num_threads=2).mean()
    print(f"Precision at 10 for LightFM: {precision}")

def main():
    # Load and process data
    collaborative_filtering, user_artists_matrix, normalized_train_matrix = load_and_process_data(
        user_artists_file="/content/user_artists.csv",
        artists_file="/content/artists.csv"
    )

    # Run implicit recommender for User
    user_id_to_recommend_implicit = 50
    run_implicit_recommender(collaborative_filtering, normalized_train_matrix, user_id_to_recommend_implicit)

    # Run implicit recommender for User with hyperparameter tuning
    user_id_to_recommend_implicit_tuned = 50
    run_implicit_recommender(collaborative_filtering, normalized_train_matrix, user_id_to_recommend_implicit_tuned, regularization=0.01)

    # Run LightFM recommender for User
    user_id_to_recommend_lightfm = 50
    run_lightfm_recommender(collaborative_filtering, normalized_train_matrix, user_id_to_recommend_lightfm)

if __name__ == "__main__":
    main()



  0%|          | 0/30 [00:00<?, ?it/s]

Top 10 Recommendations for User 50:
1. Pentagram: 1.0
2. Greenwheel: 0.5178558826446533
3. Revis: 0.4367997646331787
4. Flagelo Urbano: 0.40846508741378784
5. The Strokes: 0.3661291301250458
6. Aloha From Hell: 0.3069870173931122
7. Unknown: 0.24904806911945343
8. The Rasmus: 0.18815554678440094
9. Zombina and the Skeletones: 0.14770500361919403
10. Жанна Фриске: 0.0




  0%|          | 0/30 [00:00<?, ?it/s]

Top 10 Recommendations for User 50:
1. Jane's Addiction: 1.0
2. Bruce Dickinson: 0.9425491094589233
3. Kirlian Camera: 0.7083753347396851
4. Queen: 0.7080047130584717
5. 2Pac: 0.5496578216552734
6. Calvin Harris: 0.46479931473731995
7. Finch: 0.343287855386734
8. Matryoshka: 0.19423015415668488
9. Greenwheel: 0.14898698031902313
10. Mundo Livre S/A: 0.0
Top 10 Recommendations for User 50:
1. Unknown: -0.6220918297767639
2. Unknown: 1.7737451791763306
3. Unknown: -1.4134398698806763
4. Unknown: -1.09298574924469
5. Unknown: -0.7120912671089172
6. Unknown: -1.926808476448059
7. Unknown: -0.5731458067893982
8. Unknown: -0.0792166218161583
9. Unknown: -0.6854122877120972
10. Unknown: -1.0271462202072144
Precision at 10 for LightFM: 0.19978448748588562
