<a href="https://colab.research.google.com/github/khodozzz/portfolio/blob/main/7_Recommendation_System_using_KNN_Collab_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from tqdm.notebook import tqdm

# Data Wrangling

In [None]:
# Load the movie metadata table from the local CSV file.
movies = pd.read_csv('movies.csv')
# Bare trailing expression: the notebook renders the frame as the cell output.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
# Load the raw ratings table from the local CSV file.
ratings = pd.read_csv('ratings.csv')
# Bare trailing expression: the notebook renders the frame as the cell output.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


Filter out movies with fewer than 10,000 ratings to reduce computation.

In [None]:
# Minimum number of ratings a movie must have to be kept for modelling.
MIN_RATINGS_PER_MOVIE = 10_000

# Number of ratings per movie, indexed by movieId.
movie_ratings_count = ratings['movieId'].value_counts()

# IDs of the movies that fall below the threshold and will be dropped.
# NOTE: the variable name is kept because other cells use it, but the cut-off
# is MIN_RATINGS_PER_MOVIE (10,000), not 10.
less_than_10_ratings = movie_ratings_count[movie_ratings_count < MIN_RATINGS_PER_MOVIE].index

In [None]:
# Report how many distinct movies exist and how many survive the filter.
n_movies_total = len(movie_ratings_count)
n_movies_kept = n_movies_total - len(less_than_10_ratings)

print(f'Movies total: {n_movies_total}')
print(f'Movies after filter: {n_movies_kept}')

Movies total: 59047
Movies after filter: 588


In [None]:
# Keep only the ratings whose movie is NOT in the below-threshold set.
# The original row labels of `ratings` are preserved (no reset_index).
filtered_ratings = ratings[~ratings['movieId'].isin(less_than_10_ratings)]
# Bare trailing expression: the notebook renders the frame as the cell output.
filtered_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
9,1,1250,4.0,1147868414
11,1,1653,4.0,1147868097
...,...,...,...,...
25000086,162541,31658,4.5,1240953287
25000087,162541,33794,4.0,1240951792
25000088,162541,41566,4.0,1240952749
25000090,162541,50872,4.5,1240953372


# User-Item Matrix

In [None]:
class UserItemMatrix:
    """
    Sparse rating matrix built from a long-format ratings table.

    Raw user/item IDs are mapped to contiguous "inner" indices in order of
    first appearance. Note the orientation of `values`: rows are items and
    columns are users, i.e. `values[i, u]` is the rating inner user `u` gave
    inner item `i`.
    """

    def __init__(self, item_id_column='movieId', user_id_column='userId', rating_column='rating'):
        """
        User-Item Matrix class.

        Args:
        - item_id_column: str, column name for item IDs
        - user_id_column: str, column name for user IDs
        - rating_column: str, column name for ratings
        """
        self.item_id_column = item_id_column
        self.user_id_column = user_id_column
        self.rating_column = rating_column

    def fit(self, ratings):
        """
        Create the user-item matrix based on ratings data.

        Sets `unique_users_`, `unique_items_`, `n_ratings_`, `n_users_`,
        `n_items_`, the four ID mappers and the sparse matrix `values`
        (shape: n_items_ x n_users_).

        Args:
        - ratings: DataFrame, ratings data containing the configured columns

        Raises:
        - KeyError: if one of the configured columns is missing from `ratings`
        """
        # Use the column names configured on the instance; the bare names are
        # not defined inside this method.
        user_ids = ratings[self.user_id_column]
        item_ids = ratings[self.item_id_column]

        # Get unique users and items (order of first appearance)
        self.unique_users_ = user_ids.unique()
        self.unique_items_ = item_ids.unique()

        self.n_ratings_ = len(ratings)
        self.n_users_ = len(self.unique_users_)
        self.n_items_ = len(self.unique_items_)

        # Map IDs to indices
        self.user_mapper_ = {raw: inner for inner, raw in enumerate(self.unique_users_)}
        self.item_mapper_ = {raw: inner for inner, raw in enumerate(self.unique_items_)}

        # Map indices to IDs
        self.user_inv_mapper_ = dict(enumerate(self.unique_users_))
        self.item_inv_mapper_ = dict(enumerate(self.unique_items_))

        # Translate every rating row to inner indices in one vectorised pass
        # instead of a Python-level loop over all rows.
        user_index = user_ids.map(self.user_mapper_).to_numpy()
        item_index = item_ids.map(self.item_mapper_).to_numpy()

        # Create the matrix in sparse CSR format, items as rows and users as
        # columns. With this constructor scipy sums duplicate (item, user)
        # entries, so the input is expected to hold one rating per pair.
        self.values = csr_matrix(
            (ratings[self.rating_column].to_numpy(), (item_index, user_index)),
            shape=(self.n_items_, self.n_users_),
        )

    def to_inner_iid(self, raw_iid):
        """
        Convert raw item ID to inner item ID.

        Args:
        - raw_iid: int, raw item ID

        Returns:
        - inner_iid: int, inner item ID

        Raises:
        - KeyError: if the raw ID was not present in the fitted data
        """
        return self.item_mapper_[raw_iid]

    def to_inner_uid(self, raw_uid):
        """
        Convert raw user ID to inner user ID.

        Args:
        - raw_uid: int, raw user ID

        Returns:
        - inner_uid: int, inner user ID

        Raises:
        - KeyError: if the raw ID was not present in the fitted data
        """
        return self.user_mapper_[raw_uid]

    def to_raw_iid(self, inner_iid):
        """
        Convert inner item ID to raw item ID.

        Args:
        - inner_iid: int, inner item ID

        Returns:
        - raw_iid: int, raw item ID

        Raises:
        - KeyError: if the inner ID is outside 0..n_items_-1
        """
        return self.item_inv_mapper_[inner_iid]

    def to_raw_uid(self, inner_uid):
        """
        Convert inner user ID to raw user ID.

        Args:
        - inner_uid: int, inner user ID

        Returns:
        - raw_uid: int, raw user ID

        Raises:
        - KeyError: if the inner ID is outside 0..n_users_-1
        """
        return self.user_inv_mapper_[inner_uid]

In [None]:
# Build the rating matrix from the filtered ratings, using the default
# column names ('movieId', 'userId', 'rating').
matrix = UserItemMatrix()
matrix.fit(filtered_ratings)

In [None]:
# Inspect the sparse matrix; its shape is (n_items, n_users).
matrix.values

<588x162109 sparse matrix of type '<class 'numpy.float64'>'
	with 11877943 stored elements in Compressed Sparse Row format>

# K-Nearest Neighbors Collaborative Filtering

In [None]:
class KNNCollaborativeFiltering:
    """
    Neighborhood-based collaborative filtering on top of a sparse rating matrix.

    The matrix passed to `fit` is indexed as `X[item_id, user_id]` (items as
    rows, users as columns); all IDs used by this class are inner indices
    into that matrix.
    """

    def __init__(self, k=20, metric='cosine'):
        """
        K-Nearest Neighbors Collaborative Filtering.

        Args:
        - k: int, number of neighbors to consider
        - metric: str, distance metric for nearest neighbors
        """
        self.k = k
        self.metric = metric
        # One extra neighbor is requested because the query row is normally
        # among its own nearest neighbors and is removed from the result.
        self.model = NearestNeighbors(n_neighbors=self.k + 1,
                                      algorithm='brute',
                                      metric=self.metric)

    def fit(self, X):
        """
        Store the rating matrix used by all later queries.

        The underlying nearest-neighbors model is (re)fitted lazily inside
        `find_similar_items` / `find_similar_users`, because it has to be
        fitted on `X` for item queries and on `X.T` for user queries.

        Args:
        - X: sparse matrix, rating matrix with items as rows and users as columns
        """
        self.X = X

    def _resolve_k(self, k):
        """Clamp the requested neighbor count to the configured maximum `self.k`."""
        if k is None or k > self.k:
            return self.k
        return k

    @staticmethod
    def _drop_query(neighbors, query_id, k):
        """Return up to `k` neighbor IDs (as Python ints), excluding the query itself."""
        # The query is removed by ID rather than by position: when another row
        # is at distance 0 from the query, the query is not guaranteed to be
        # the first entry returned by the model.
        candidates = [int(j) for j in neighbors.ravel() if j != query_id]
        return candidates[:max(k, 0)]

    def _neighbor_mean(self, item_id, neighbor_ids):
        """Mean rating of `item_id` over the neighbors that rated it, or None if none did."""
        if not neighbor_ids:
            return None
        rankings = self.X[item_id, neighbor_ids]
        n_rated = rankings.count_nonzero()
        if n_rated == 0:
            return None
        return rankings.sum() / n_rated

    def find_similar_items(self, item_id, k=None):
        """
        Find similar items based on item ID.

        Args:
        - item_id: int, item ID
        - k: int, number of similar items to retrieve (capped at `self.k`)

        Returns:
        - similar_items: list of int, similar item IDs, nearest first
        """
        k = self._resolve_k(k)

        self.model.fit(self.X)
        item_vec = self.X[item_id]
        neighbors = self.model.kneighbors(item_vec, return_distance=False)
        return self._drop_query(neighbors, item_id, k)

    def find_similar_users(self, user_id, k=None):
        """
        Find similar users based on user ID.

        Args:
        - user_id: int, user ID
        - k: int, number of similar users to retrieve (capped at `self.k`)

        Returns:
        - similar_users: list of int, similar user IDs, nearest first
        """
        k = self._resolve_k(k)

        # Users are columns of X, so the model is fitted on the transpose.
        self.model.fit(self.X.T)
        user_vec = self.X[:, user_id].T
        neighbors = self.model.kneighbors(user_vec, return_distance=False)
        return self._drop_query(neighbors, user_id, k)

    def predict_item_ranking(self, user_id, item_id):
        """
        Predict the ranking of an item for a given user.

        The prediction is the mean rating given to the item by those of the
        user's nearest neighbors who rated it.

        Args:
        - user_id: int, user ID
        - item_id: int, item ID

        Returns:
        - item_ranking: float, predicted item ranking; NaN when none of the
          neighbors rated the item
        """
        similar_users_indices = self.find_similar_users(user_id)
        mean_rating = self._neighbor_mean(item_id, similar_users_indices)
        # Return NaN explicitly instead of computing 0 / 0, which yields the
        # same value but emits a RuntimeWarning.
        return float('nan') if mean_rating is None else mean_rating

    def make_recommendations(self, user_id, n=5):
        """
        Make item recommendations for a given user.

        Items the user already rated, and items that none of the user's
        neighbors rated (no score can be computed), are not recommended, so
        fewer than `n` items may be returned.

        Args:
        - user_id: int, user ID
        - n: int, number of recommendations to make

        Returns:
        - recommendations: list of int, recommended item IDs, best first
        """
        # The neighbor set depends only on the user, so it is computed once
        # rather than once per candidate item.
        similar_users_indices = self.find_similar_users(user_id)

        item_scores = []
        for item_id in tqdm(range(self.X.shape[0])):
            if self.X[item_id, user_id] != 0:  # Exclude items already rated by the user
                continue
            score = self._neighbor_mean(item_id, similar_users_indices)
            if score is None:  # Unscorable items would otherwise poison the sort with NaN
                continue
            item_scores.append((item_id, score))

        item_scores.sort(key=lambda pair: pair[1], reverse=True)
        return [item_id for item_id, _ in item_scores[:n]]

In [None]:
# Create the recommender with its defaults (k=20, cosine metric) and hand it
# the sparse rating matrix (items as rows, users as columns).
filtering = KNNCollaborativeFiltering()
filtering.fit(matrix.values)

# Usage

## Similar Movies

In [None]:
# Title lookup keyed by raw movie ID.
movie_titles = dict(zip(movies['movieId'], movies['title']))
# The ten movies with the most ratings among those that survived the filter.
popular_movies = filtered_ratings['movieId'].value_counts().head(10).index

for movie_id in popular_movies:
    # The recommender works on inner indices, so translate in and back out.
    neighbour_ids = filtering.find_similar_items(matrix.to_inner_iid(movie_id), k=5)
    neighbour_titles = ", ".join(movie_titles[matrix.to_raw_iid(i)] for i in neighbour_ids)

    print(f'{movie_titles[movie_id]} : {neighbour_titles}\n')

Forrest Gump (1994) : Jurassic Park (1993), Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Braveheart (1995)

Shawshank Redemption, The (1994) : Pulp Fiction (1994), Silence of the Lambs, The (1991), Forrest Gump (1994), Schindler's List (1993), Usual Suspects, The (1995)

Pulp Fiction (1994) : Silence of the Lambs, The (1991), Shawshank Redemption, The (1994), Seven (a.k.a. Se7en) (1995), Usual Suspects, The (1995), Forrest Gump (1994)

Silence of the Lambs, The (1991) : Pulp Fiction (1994), Shawshank Redemption, The (1994), Forrest Gump (1994), Seven (a.k.a. Se7en) (1995), Usual Suspects, The (1995)

Matrix, The (1999) : Fight Club (1999), Lord of the Rings: The Fellowship of the Ring, The (2001), Lord of the Rings: The Return of the King, The (2003), Star Wars: Episode V - The Empire Strikes Back (1980), Lord of the Rings: The Two Towers, The (2002)

Star Wars: Episode IV - A New Hope (1977) : Star Wars: Episode V - The Empire Strikes Back (

## Recommendations

In [None]:
# Pick the user at inner index 0 and recover the matching raw userId, which
# is needed to look the user up in the ratings frame.
inner_user_id = 0
user_id = matrix.to_raw_uid(inner_user_id)

In [None]:
# Score the items this user has not rated and keep the best ones
# (n defaults to 5). The result is a list of inner item IDs.
recommendations = filtering.make_recommendations(inner_user_id)

  0%|          | 0/588 [00:00<?, ?it/s]

  return rankings.sum() / rankings.count_nonzero()


In [None]:
# Recommendations are inner item IDs, so convert them to raw IDs for the title lookup.
recommendations_titles = [movie_titles[matrix.to_raw_iid(i)] for i in recommendations]

# Raw movie IDs of the user's five highest-rated films. The 'movieId' column
# is used rather than the frame's index: the index holds row labels of the
# ratings table, which are unrelated to item IDs.
top_rated_movies = filtered_ratings[filtered_ratings['userId'] == user_id] \
                        .sort_values('rating', ascending=False).head(5)['movieId']
# These are already raw IDs, so they key into movie_titles directly.
top_rated_movies_titles = [movie_titles[i] for i in top_rated_movies]

print(f'Top Rated Films : {", ".join(top_rated_movies_titles)}')
print(f'Recommendations : {", ".join(recommendations_titles)}')

Top Rated Films : Pulp Fiction (1994), Shrek 2 (2004), Star Wars: Episode V - The Empire Strikes Back (1980), Dragonheart (1996), Schindler's List (1993)
Recommendations : Fifth Element, The (1997), Social Network, The (2010), Toy Story (1995), Mr. Holland's Opus (1995), Braveheart (1995)
