# Content based Recommendation Algorithm 

## Step 1: Import Python Libraries

In [1]:
import streamlit as st
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

## Step 2: Data collection

In [2]:
# Load your dataset
movies = pd.read_csv("movie.csv")  # Replace with your actual movie dataset
ratings = pd.read_csv("ratings_small.csv")  # Replace with your actual ratings dataset

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


Data summary 

In [7]:
# Number of users
print('The ratings dataset has', ratings['userId'].nunique(), 'unique users')

# Number of movies
print('The ratings dataset has', ratings['movieId'].nunique(), 'unique movies')

# Number of ratings
print('The ratings dataset has', ratings['rating'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(ratings['rating'].unique()))

The ratings dataset has 671 unique users
The ratings dataset has 9066 unique movies
The ratings dataset has 10 unique ratings
The unique ratings are [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]


In [8]:
# Merge movies and ratings
movie_ratings = pd.merge(ratings, movies, on='movieId')
movie_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama
...,...,...,...,...,...,...
99338,664,64997,2.5,1343761859,War of the Worlds (2005),Action|Sci-Fi
99339,664,72380,3.5,1344435977,"Box, The (2009)",Drama|Horror|Mystery|Sci-Fi|Thriller
99340,665,129,3.0,995232528,Pie in the Sky (1996),Comedy|Romance
99341,665,4736,1.0,1010197684,Summer Catch (2001),Comedy|Romance


## Step 3: Meging the datasets

In [9]:
# Combine title and genre for content-based filtering
movies['title_and_genre'] =  movies['genres']  + ' ' + movies['title']
movies['title_and_genre'] 

0        Adventure|Animation|Children|Comedy|Fantasy To...
1                Adventure|Children|Fantasy Jumanji (1995)
2                   Comedy|Romance Grumpier Old Men (1995)
3            Comedy|Drama|Romance Waiting to Exhale (1995)
4                Comedy Father of the Bride Part II (1995)
                               ...                        
27273                  Comedy Kein Bund für's Leben (2007)
27274                 Comedy Feuer, Eis & Dosenbier (2002)
27275                         Adventure The Pirates (2014)
27276               (no genres listed) Rentun Ruusu (2001)
27277            Adventure|Fantasy|Horror Innocence (2014)
Name: title_and_genre, Length: 27278, dtype: object

## Step 4: Creating the TF-ID matrix

In [12]:
# Fit a TfidfVectorizer for content-based filtering based on movie title and genres
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_combined = tfidf_vectorizer.fit_transform(movies['title_and_genre'].fillna(''))

## Step 5: Fitting a Nearest Neighbors model

In [13]:
# Fit a Nearest Neighbors model for content-based filtering
knn_model_content = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
knn_model_content.fit(tfidf_matrix_combined)

## Step 6: Movie Recommendations for a given movie

In [15]:
movie_title = 'Toy Story (1995)'

# Get the movieId for the entered movie title
movie_id_user = movies.loc[movies['title'] == movie_title, 'movieId'].values

# Get recommendations

# Content based recommendations
movie_id_user = movie_id_user[0]

# Get the top 10 recommended movies based on content similarity
movie_index_content = movies[movies['movieId'] == movie_id_user].index[0]
distances_content, indices_content = knn_model_content.kneighbors(tfidf_matrix_combined[movie_index_content], n_neighbors=11)
recommended_movies_content = [(movies.iloc[idx]['title'], 1 - distances_content.flatten()[i]) for i, idx in enumerate(indices_content.flatten()[1:])]
recommended_movies_content

[('Toy Story 2 (1999)', 1.0),
 ('Toy Story 3 (2010)', 0.8742364441555872),
 ('Toy Story of Terror (2013)', 0.8144714285535751),
 ('Toy Story That Time Forgot (2014)', 0.6965463300625329),
 ('Toy Story Toons: Small Fry (2011)', 0.5841276637074647),
 ('Toy Story Toons: Hawaiian Vacation (2011)', 0.5794264152382212),
 ('Tin Toy (1988)', 0.5759513873743092),
 ('Toy, The (1982)', 0.5255096708577932),
 ('Toy Story Toons: Partysaurus Rex (2012)', 0.5224032824782244),
 ('Christmas Toy, The (1986)', 0.48430685020474684)]