## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline

## Mount Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read Data

In [None]:
movies = pd.read_csv('/content/drive/MyDrive/MovieLens/movies.csv')
ratings = pd.read_csv('/content/drive/MyDrive/MovieLens/ratings.csv')

### Movies

In [None]:
movies.head() # Display movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Movies Data Shape

In [None]:
movies.shape

(87585, 3)

#### Checking Null

In [None]:
movies.isnull().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0


#### Check Movies Data

In [None]:
movies.describe()

Unnamed: 0,movieId
count,87585.0
mean,157651.365519
std,79013.402099
min,1.0
25%,112657.0
50%,165741.0
75%,213203.0
max,292757.0


### Ratings

In [None]:
ratings.head() # Display ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


#### Ratings Data Shape

In [None]:
ratings.shape

(32000204, 4)

#### Checking Null

In [None]:
ratings.isnull().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


#### Check Ratings Data

In [None]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,32000200.0,32000200.0,32000200.0,32000200.0
mean,100278.5,29318.61,3.540396,1275241000.0
std,57949.05,50958.16,1.058986,256163000.0
min,1.0,1.0,0.5,789652000.0
25%,50053.0,1233.0,3.0,1051012000.0
50%,100297.0,3452.0,3.5,1272622000.0
75%,150451.0,44199.0,4.0,1503158000.0
max,200948.0,292757.0,5.0,1697164000.0


## Data Preprocessing

### Make Year Coloumn From Title

In [None]:
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
movies['year'] = movies.year.str.extract('(\d\d\d\d)',expand=False)

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


### Deleting Year in Title

In [None]:
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


### Droping Unecessary Column

#### Drop Genres

In [None]:
movies.drop(columns=['genres'], inplace=True) # Drop Genres in Movies
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


#### Drop Timestamps

In [None]:
ratings.drop(columns=['timestamp'], inplace=True) # Drop Timestamp in Ratings
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


The process for creating a user-based recommendation system is as follows :

* Select Users: Choose users who have watched movies.
* Retrieve Viewing Records: Obtain records of the movies watched by these users.
* Calculate Similarity Scores: Compute similarity scores between users based on their movie viewing histories.
* Recommend Items: Recommend movies to users based on the highest similarity scores.

## Modelling & Evaluating

### Select User

In [None]:
# Representing user input data.
user = [
            {'title':'Shutter Island', 'rating':4},
            {'title':'Toy Story', 'rating':2.5},
            {'title':'Home Alone 3', 'rating':3},
            {'title':"Pulp Fiction", 'rating':4.5},
            {'title':'Your Name.', 'rating':5}
         ]
inputMovie = pd.DataFrame(user)
inputMovie

Unnamed: 0,title,rating
0,Shutter Island,4.0
1,Toy Story,2.5
2,Home Alone 3,3.0
3,Pulp Fiction,4.5
4,Your Name.,5.0


### Filter Movie Based on Title

In [None]:
# Filter The Input Based on Title
Id = movies[movies['title'].isin(inputMovie['title'].tolist())]
inputMovie = pd.merge(Id, inputMovie)
inputMovie = inputMovie.drop(labels= 'year',axis = 1)
inputMovie

Unnamed: 0,movieId,title,rating
0,1,Toy Story,2.5
1,296,Pulp Fiction,4.5
2,1707,Home Alone 3,3.0
3,74458,Shutter Island,4.0
4,163134,Your Name.,5.0


### Filter User Based on Movies

In [None]:
users = ratings[ratings['movieId'].isin(inputMovie['movieId'].tolist())]
users.head()

Unnamed: 0,userId,movieId,rating
160,2,296,1.0
380,5,296,1.0
441,7,296,5.0
473,8,296,4.5
559,10,1,2.5


### Users Shape

In [None]:
users.shape

(203901, 3)

### Sub Dataframes

#### Group a DataFrame by userId

In [None]:
userSubsetGroup = users.groupby(['userId'])

In [None]:
userSubsetGroup.get_group(1030) # Example of a group by getting all users of a particular userId

Unnamed: 0,userId,movieId,rating
157668,1030,296,4.0


### Sort users by similarity to prioritize those with the most similar films

#### Sort a list of user groups based on the number of items each group contains.

In [None]:
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

#### Retrieves the first 3 elements from the userSubsetGroup list

In [None]:
userSubsetGroup[0:3]

[((4392,),
          userId  movieId  rating
  668671    4392        1     4.0
  668724    4392      296     3.5
  669003    4392     1707     1.0
  670744    4392    74458     2.5
  671655    4392   163134     4.0),
 ((7419,),
           userId  movieId  rating
  1149327    7419        1     4.0
  1149334    7419      296     3.0
  1149378    7419     1707     2.5
  1149568    7419    74458     4.5
  1149782    7419   163134     4.5),
 ((8307,),
           userId  movieId  rating
  1288046    8307        1     4.0
  1288177    8307      296     5.0
  1288640    8307     1707     2.0
  1291331    8307    74458     4.0
  1292625    8307   163134     4.0)]

#### Limit the size of the userSubsetGroup list to its first 100 elements

In [None]:
userSubsetGroup = userSubsetGroup[0:100]

### Pearson Correlation

* Store Pearson Correlation in dictionary, where key is user Id and value is coefficient
* Sort the current input and user group so that the values are not mixed up later
* Get review scores for movies
* Store in a deep temporary buffer variable to facilitate future calculations
* Put the current user group reviews in list format
* Calculate pearson correlation between two users, called, x and y

In [None]:
# Initialize an empty dictionary to store Pearson correlation coefficients
pearsonCorDict = {}

# Iterate over groups of users and their movie ratings
for name, group in userSubsetGroup:
    # Sort the group and the input movie data by 'movieId'
    group = group.sort_values(by='movieId')
    inputMovie = inputMovie.sort_values(by='movieId')

    # Get the number of movies in the group
    n = len(group)

    # Filter the input movie data to include only movies that are also in the user's group
    temp = inputMovie[inputMovie['movieId'].isin(group['movieId'].tolist())]

    # Get the list of ratings for the filtered movies
    tempRatingList = temp['rating'].tolist()
    tempGroupList = group['rating'].tolist()

    # Calculate the components of Pearson correlation
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList), 2) / float(n)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList), 2) / float(n)
    Sxy = sum(i * j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList) * sum(tempGroupList) / float(n)

    # Compute the Pearson correlation coefficient
    # Ensure that neither Sxx nor Syy is zero to avoid division by zero
    if Sxx != 0 and Syy != 0:
        pearsonCorDict[name] = Sxy / sqrt(Sxx * Syy)
    else:
        pearsonCorDict[name] = 0


### Retrieve Items From Dictionary

In [None]:
pearsonCorDict.items()

dict_items([((4392,), 0.33101459468767136), ((7419,), 0.31855912392114033), ((8307,), 0.5062592711372335), ((8905,), 0.46502883443982873), ((10280,), 0.6628489803598699), ((12042,), 0.6241272976892847), ((12335,), 0.3509885473947315), ((12667,), 0.3375061807581553), ((13467,), 0.0), ((14425,), -0.44670257372078165), ((14760,), 0.16956177236623018), ((15188,), 0.7167580288225649), ((15875,), 0.7372609114626852), ((16751,), 0.7613188819880647), ((18171,), 0.2067599042433905), ((19347,), 0.7587779803199636), ((21576,), 0.5640416251096729), ((22744,), 0.26413527189768793), ((24072,), 0.48240142859541213), ((25909,), 0.7994259492812168), ((30117,), 0.36589645615870564), ((30162,), 0.6200639257301724), ((34883,), 0.9243367255520292), ((36178,), 0.29531221160930926), ((36270,), 0.14118624160050217), ((36683,), -0.04876920665717847), ((36901,), 0.44670257372078165), ((37678,), 0.5209334529093272), ((39321,), 0.30499714066520944), ((40603,), 0.3774147062120368), ((43639,), 0.8453329153910596), 

### Creates a Dataframe from Dictionary and Reset Index

In [None]:
# Convert to DataFrame
pearsonDF = pd.DataFrame.from_dict(pearsonCorDict, orient='index', columns=['similarityIndex'])

# Reset index
pearsonDF['userId'] = pearsonDF.index.map(lambda x: x[0] if isinstance(x, tuple) else x)
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.331015,4392
1,0.318559,7419
2,0.506259,8307
3,0.465029,8905
4,0.662849,10280


### Identify and view the top 50 users with the highest similarity scores from the pearsonDF DataFrame.

In [None]:
# Sort the DataFrame by 'similarityIndex' in descending order
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]

# Display the first 5 rows of the top 50 sorted users
topUsers.head()


Unnamed: 0,similarityIndex,userId
43,0.970495,58800
22,0.924337,34883
44,0.883506,62722
59,0.849491,79290
30,0.845333,43639


Take a weighted average of movie ratings using Pearson Correlation as the weight. But to do this, Get the movies watched by the user in our pearsonDF from ratings dataframe and then store the correlation in a new column called _similarityIndex".

### Combines the user similarity information with their ratings data, allowing you to see which movies the top similar users have rated.

In [None]:
# Merge the top 50 users with the ratings DataFrame on the 'userId' column
# This combines user similarity data with their respective ratings
topUsersRating = topUsers.merge(ratings, left_on='userId', right_on='userId', how='inner')

# Display the first 5 rows of the merged DataFrame to preview the combined data
topUsersRating.head()


Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.970495,58800,1,3.0
1,0.970495,58800,2,4.5
2,0.970495,58800,12,3.0
3,0.970495,58800,16,4.0
4,0.970495,58800,19,5.0


### Multiplying similarity by user's ratings

In [None]:
# Calculate the weighted rating for each user
# The weighted rating is computed as the product of 'similarityIndex' and 'rating'
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'] * topUsersRating['rating']

# Display the first 5 rows of the DataFrame with the new 'weightedRating' column
topUsersRating.head()


Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.970495,58800,1,3.0,2.911485
1,0.970495,58800,2,4.5,4.367227
2,0.970495,58800,12,3.0,2.911485
3,0.970495,58800,16,4.0,3.88198
4,0.970495,58800,19,5.0,4.852475


### Apply sum to topUsers after grouping them by userId

In [None]:
# Group the top users' ratings by 'movieId' and sum the similarity index and weighted ratings for each movie
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex', 'weightedRating']]

# Rename the columns for better clarity
tempTopUsersRating.columns = ['sum_similarityIndex', 'sum_weightedRating']

# Display the first few rows of the resulting DataFrame
tempTopUsersRating.head()


Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,32.105223,97.409242
2,27.160521,86.439215
3,1.347255,4.041766
4,1.24352,2.9933
5,5.159182,13.131075


## Recommendations

### Create empty dataframe to take the weighted average

In [None]:
# Create an empty DataFrame to store the recommendation results
recommendation_df = pd.DataFrame()

# Calculate the weighted average recommendation score for each movie
# This is done by dividing the sum of weighted ratings by the sum of similarity indices
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']

# Add the 'movieId' as a column to the DataFrame
recommendation_df['movieId'] = tempTopUsersRating.index

# Display the first few rows of the recommendation DataFrame
recommendation_df.head()


Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.034062,1
2,3.182532,2
3,3.0,3
4,2.407118,4
5,2.545185,5


### Sort and Display top 10 Movies from Dataframes Based On Their Recommendation Scores.

In [None]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
132182,5.0,132182
158300,5.0,158300
176211,5.0,176211
196717,5.0,196717
78690,5.0,78690
200908,5.0,200908
82143,5.0,82143
99493,5.0,99493
142182,5.0,142182
26012,5.0,26012


### Get the top 10 movies with the highest recommendation score. These are the top 10 recommendations for input users based on what others are watching

In [None]:
movies.loc[movies['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
8523,26012,Samurai III: Duel on Ganryu Island (a.k.a. Bus...,1956
14849,78690,I Accuse,1919
15608,82143,Alone in the Wilderness,2004
19097,99493,Girl Walk: All Day,2011
28768,132182,Alléluia,2014
33242,142182,Wolfy,2009
40306,158300,Sky Of Love,2007
48761,176211,In a Heartbeat,2017
58447,196717,Bernard and the Genie,1991
60390,200908,The Adventures of Pinocchio,1975
