### Import the libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Import the dataset

In [3]:
# Read the ratings table from disk and preview its first rows.
ratings_csv = r'C:\Users\jagan\Documents\ML\datasets\4. recommendation system\ratings.csv'
ratings = pd.read_csv(ratings_csv)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Read the movie metadata table from disk and preview its first rows.
movies_csv = r'C:\Users\jagan\Documents\ML\datasets\4. recommendation system\movies.csv'
movies = pd.read_csv(movies_csv)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Statistical analysis of ratings

In [5]:
# Headline counts for the ratings data; computed once and reused below.
total_ratings = len(ratings)
unique_users = len(ratings['userId'].unique())
rated_movies = len(ratings['movieId'].unique())

print(f"Number of ratings : {total_ratings}")
# Counted from the movies table, so it can exceed the number of movies that
# actually appear in `ratings` (the denominator of the per-movie average).
print(f"Number of unique movie's Ids : {len(movies['movieId'].unique())}")
print(f"Number of unique users : {unique_users}")
# These are ratings *counts* per id (total ratings / distinct ids), not mean
# rating values, so the labels say "number of ratings".
print(f"Average number of ratings per user : {round(total_ratings / unique_users, 2)}")
print(f"Average number of ratings per movie : {round(total_ratings / rated_movies, 2)}")

Number of ratings : 100836
Number of unique movie's Ids : 9742
Number of unique users : 610
Average rating per user : 165.3
Average rating per movie : 10.37


### User rating frequency

In [6]:
# Count the rated movies per user; reset_index turns userId back into a column.
user_freq = (
    ratings
    .groupby('userId')[['movieId']]
    .count()
    .reset_index()
)
user_freq.head()

Unnamed: 0,userId,movieId
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [7]:
# Inspect the column labels of user_freq before renaming them.
user_freq.columns

Index(['userId', 'movieId'], dtype='object')

In [8]:
# The count column still carries the name 'movieId'; give it a descriptive
# label and show the resulting column index.
user_freq = user_freq.rename(columns={'movieId': 'n_ratings'})
user_freq.columns

Index(['userId', 'n_ratings'], dtype='object')

In [9]:
# Preview user_freq with its renamed count column.
user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


### Movie Rating Analysis

In [10]:
# Find the lowest- and highest-rated movies by plain mean rating.
mean_rating = ratings.groupby('movieId')[['rating']].mean()

# idxmin / idxmax return the movieId (the group key) of the extreme mean.
lowest_rated = mean_rating['rating'].idxmin()
highest_rated = mean_rating['rating'].idxmax()

# Only the last expression of a cell is rendered automatically, and this cell
# ends with an assignment, so the lookups are shown explicitly with display()
# (the builtin provided by the IPython/Jupyter kernel).
display(
    movies.loc[movies['movieId'] == lowest_rated],    # lowest rated movie
    movies.loc[movies['movieId'] == highest_rated],   # highest rated movie
    ratings[ratings['movieId'] == highest_rated],     # who rated the highest rated movie
    ratings[ratings['movieId'] == lowest_rated],      # who rated the lowest rated movie
)

# The movies above have very few ratings, so a Bayesian average is a better
# ranking than the plain mean. Collect the per-movie rating count and mean it
# needs (the Bayesian average itself is not computed in this cell).
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])


In [11]:
# Preview the per-movie rating count and mean.
movie_stats.head()

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,3.92093
2,110,3.431818
3,52,3.259615
4,7,2.357143
5,49,3.071429


In [18]:
# Map every distinct userId (in the order np.unique returns them) to a
# zero-based position.
{user_id: position for position, user_id in enumerate(np.unique(ratings['userId']))}

{1: 0,
 2: 1,
 3: 2,
 4: 3,
 5: 4,
 6: 5,
 7: 6,
 8: 7,
 9: 8,
 10: 9,
 11: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18,
 20: 19,
 21: 20,
 22: 21,
 23: 22,
 24: 23,
 25: 24,
 26: 25,
 27: 26,
 28: 27,
 29: 28,
 30: 29,
 31: 30,
 32: 31,
 33: 32,
 34: 33,
 35: 34,
 36: 35,
 37: 36,
 38: 37,
 39: 38,
 40: 39,
 41: 40,
 42: 41,
 43: 42,
 44: 43,
 45: 44,
 46: 45,
 47: 46,
 48: 47,
 49: 48,
 50: 49,
 51: 50,
 52: 51,
 53: 52,
 54: 53,
 55: 54,
 56: 55,
 57: 56,
 58: 57,
 59: 58,
 60: 59,
 61: 60,
 62: 61,
 63: 62,
 64: 63,
 65: 64,
 66: 65,
 67: 66,
 68: 67,
 69: 68,
 70: 69,
 71: 70,
 72: 71,
 73: 72,
 74: 73,
 75: 74,
 76: 75,
 77: 76,
 78: 77,
 79: 78,
 80: 79,
 81: 80,
 82: 81,
 83: 82,
 84: 83,
 85: 84,
 86: 85,
 87: 86,
 88: 87,
 89: 88,
 90: 89,
 91: 90,
 92: 91,
 93: 92,
 94: 93,
 95: 94,
 96: 95,
 97: 96,
 98: 97,
 99: 98,
 100: 99,
 101: 100,
 102: 101,
 103: 102,
 104: 103,
 105: 104,
 106: 105,
 107: 106,
 108: 107,
 109: 108,
 110: 109,
 111: 11

### User item matrix creation

In [19]:
# Now we create the user-item matrix as a SciPy sparse matrix

from scipy.sparse import csr_matrix

def create_matrix(df):
    """
    Build a sparse movie-by-user rating matrix plus id/index lookup tables.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns:
        X: scipy.sparse.csr_matrix of shape (M, N); rows are movies, columns
            are users, values are ratings. M and N are the numbers of
            distinct movieIds and userIds in df.
        user_mapper: dict mapping userId -> column index of X.
        movie_mapper: dict mapping movieId -> row index of X.
        user_inv_mapper: dict mapping column index of X -> userId.
        movie_inv_mapper: dict mapping row index of X -> movieId.
    """
    # np.unique returns the sorted distinct ids and, with return_inverse, the
    # position of every row's id within that sorted array. Those positions are
    # exactly the matrix indices, so no per-row dict lookup is needed.
    user_ids, user_index = np.unique(df['userId'].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df['movieId'].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    # map ids to indices
    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    # map indices to ids
    user_inv_mapper = dict(zip(range(N), user_ids))
    # Must span M (movies), not N (users): zipping against range(N) silently
    # drops every movie whose index is >= N when there are more movies than users.
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    X = csr_matrix(
        (df['rating'].to_numpy(), (movie_index, user_index)),
        shape=(M, N),
    )

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Build the sparse rating matrix and the id <-> index lookup tables from the
# full ratings table.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

- The user-item matrix is the basic data structure of a recommendation system.
- M is the number of distinct movies (the rows of `X`).
- N is the number of distinct users (the columns of `X`).

This structured representation of the users' movie ratings makes it easy to run
calculations on the data and to build a recommendation system on top of it.

### Movie similarity analysis

In [59]:
# find the similar movies by using KNN

from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, k, metric = 'cosine', show_distance = False):
    """
    Return the k movies whose rating vectors are nearest to that of movie_id.

    Reads the module-level `movie_mapper` and `movie_inv_mapper` (as returned
    by create_matrix) to translate between movie ids and rows of X.

    Args:
        movie_id: id of the query movie; must be a key of `movie_mapper`.
        X: rating matrix with one row per movie, indexed consistently with
            `movie_mapper`.
        k: number of similar movies to return.
        metric: distance metric handed to NearestNeighbors.
        show_distance: when False (default) return a list of movie ids; when
            True return a list of (movie_id, distance) tuples.

    Returns:
        Up to k neighbours, nearest first, never including movie_id itself.

    Raises:
        KeyError: if movie_id is not in `movie_mapper`, or a neighbour index
            is missing from `movie_inv_mapper`.
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # The query movie is normally returned as its own nearest neighbour, so ask
    # for one extra; never request more neighbours than there are rows.
    n_neighbors = min(k + 1, X.shape[0])

    knn = NearestNeighbors(n_neighbors = n_neighbors, algorithm = 'brute', metric = metric)
    knn.fit(X)

    # Always fetch distances: with return_distance=True kneighbors returns a
    # (distances, indices) tuple, which the caller may or may not want.
    distances, indices = knn.kneighbors(movie_vec, return_distance = True)

    # Drop the query by index rather than by position: when other movies have
    # an identical vector the query is not guaranteed to come first.
    neighbours = [
        (movie_inv_mapper[int(ind)], float(dist))
        for ind, dist in zip(indices[0], distances[0])
        if int(ind) != movie_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]


# Lookup table from movieId to title, used to print readable names.
movie_titles = {
    mid: title
    for mid, title in zip(movies['movieId'], movies['title'])
}

# Query movie for the demo.
movie_id = 2
movie_title = movie_titles[movie_id]
similar_ids = find_similar_movies(movie_id, X, k=10)

print(f"Since you watched {movie_title}")

for similar_id in similar_ids:
    print(movie_titles[similar_id])
    

Since you watched Jumanji (1995)
Lion King, The (1994)
Mrs. Doubtfire (1993)
Mask, The (1994)
Jurassic Park (1993)
Home Alone (1990)
Nightmare Before Christmas, The (1993)
Aladdin (1992)
Beauty and the Beast (1991)
Ace Ventura: When Nature Calls (1995)
Santa Clause, The (1994)


### Movie recommendation with respect to user preference

In [63]:
# Function that recommends movies to a user based on their top-rated movie

def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    """
    Print k movies similar to the first movie the user rated highest.

    Reads the module-level `ratings` and `movies` frames and delegates the
    neighbour search to find_similar_movies.

    Args:
        user_id: id to look up in ratings['userId'].
        X: rating matrix forwarded to find_similar_movies.
        user_mapper, movie_mapper, movie_inv_mapper: accepted for interface
            compatibility but not read here; find_similar_movies is called
            without them and uses the module-level mappers instead.
        k: number of similar movies to print.

    Returns:
        None. Results (or a "does not exist" / "not found" notice) are printed.
    """
    user_ratings = ratings[ratings['userId'] == user_id]

    if user_ratings.empty:
        print(f"User with ID {user_id} does not exist")
        return

    # Series.max() skips NaN ratings; the builtin max() gives an
    # order-dependent result when a NaN is present.
    top_rating = user_ratings['rating'].max()
    # Seed the recommendations with the first movie that got the top rating.
    movie_id = user_ratings.loc[user_ratings['rating'] == top_rating, 'movieId'].iloc[0]

    movie_titles = dict(zip(movies['movieId'], movies['title']))

    # Check the title before the neighbour search so no work is wasted, and
    # test for None instead of comparing against a sentinel title string.
    movie_title = movie_titles.get(movie_id)
    if movie_title is None:
        print(f"Movie with ID {movie_id} not found")
        return

    similar_ids = find_similar_movies(movie_id, X, k)

    print(f"Since you watched {movie_title}, you might also like.")
    for similar_id in similar_ids:
        print(movie_titles.get(similar_id, 'Movie not found'))

In [66]:
### Recommend the movies

user_id = 170 # replace with desired user id

# Print recommendations seeded by this user's top-rated movie.
recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10)


Since you watched Babe (1995), you might also like.
Lion King, The (1994)
Jurassic Park (1993)
Aladdin (1992)
Fugitive, The (1993)
Speed (1994)
Mrs. Doubtfire (1993)
Mask, The (1994)
Beauty and the Beast (1991)
Toy Story (1995)
Ghost (1990)


In [73]:
# Try a different user id; when it has no rows in `ratings` the function
# prints a "does not exist" notice and returns.
user_id = 3022

recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10)

User with ID 3022 does not exist
