In [15]:
import pandas as pd
import math
import re
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE  

In [16]:
movies_df=pd.read_csv('movies.csv')
ratings_df=pd.read_csv('ratings.csv')

# Collabrative Filtering

In [17]:
def split_title(title):
    match=re.match(r"^(.*)\s\((\d{4})\)$",title)
    if match:
        return match.groups()
    return title, None


In [18]:
movies_df[['name', 'year']]=movies_df['title'].apply(split_title).apply(pd.Series)

# User Matrix

In [19]:
def user_item_matrix(ratings):
    user_item={}
    for _, row in ratings.iterrows():
        user=row['userId']
        movie=row['movieId']
        rating=row['rating']
        if user not in user_item:
            user_item[user]={}
        user_item[user][movie]=rating
    return user_item

In [20]:
user_matrix=user_item_matrix(ratings_df)

In [21]:
print(user_matrix)

{1.0: {1.0: 4.0, 3.0: 4.0, 6.0: 4.0, 47.0: 5.0, 50.0: 5.0, 70.0: 3.0, 101.0: 5.0, 110.0: 4.0, 151.0: 5.0, 157.0: 5.0, 163.0: 5.0, 216.0: 5.0, 223.0: 3.0, 231.0: 5.0, 235.0: 4.0, 260.0: 5.0, 296.0: 3.0, 316.0: 3.0, 333.0: 5.0, 349.0: 4.0, 356.0: 4.0, 362.0: 5.0, 367.0: 4.0, 423.0: 3.0, 441.0: 4.0, 457.0: 5.0, 480.0: 4.0, 500.0: 3.0, 527.0: 5.0, 543.0: 4.0, 552.0: 4.0, 553.0: 5.0, 590.0: 4.0, 592.0: 4.0, 593.0: 4.0, 596.0: 5.0, 608.0: 5.0, 648.0: 3.0, 661.0: 5.0, 673.0: 3.0, 733.0: 4.0, 736.0: 3.0, 780.0: 3.0, 804.0: 4.0, 919.0: 5.0, 923.0: 5.0, 940.0: 5.0, 943.0: 4.0, 954.0: 5.0, 1009.0: 3.0, 1023.0: 5.0, 1024.0: 5.0, 1025.0: 5.0, 1029.0: 5.0, 1030.0: 3.0, 1031.0: 5.0, 1032.0: 5.0, 1042.0: 4.0, 1049.0: 5.0, 1060.0: 4.0, 1073.0: 5.0, 1080.0: 5.0, 1089.0: 5.0, 1090.0: 4.0, 1092.0: 5.0, 1097.0: 5.0, 1127.0: 4.0, 1136.0: 5.0, 1196.0: 5.0, 1197.0: 5.0, 1198.0: 5.0, 1206.0: 5.0, 1208.0: 4.0, 1210.0: 5.0, 1213.0: 5.0, 1214.0: 4.0, 1219.0: 2.0, 1220.0: 5.0, 1222.0: 5.0, 1224.0: 5.0, 1226.0: 5.0

# Cosine Similarity

In [22]:
def cosine_similarity(vec1,vec2):
    dot_product=sum(vec1[movie]*vec2.get(movie,0) for movie in vec1)
    magnitude_vec1=math.sqrt(sum(val**2 for val in vec1.values()))
    magnitude_vec2=math.sqrt(sum(val**2 for val in vec2.values()))
    if magnitude_vec1==0 or magnitude_vec2==0:
        return 0
    return dot_product/(magnitude_vec1*magnitude_vec2)

# Movie similarity using similar users ratings

In [23]:
def user_similarities(user_item):
    movie_vectors={}
    for user,ratings in user_item.items():
        for movie,rating in ratings.items():
            if movie not in movie_vectors:
                movie_vectors[movie]={}
            movie_vectors[movie][user]=rating
    movie_similarity={}
    for movie1 in movie_vectors:
        movie_similarity[movie1]={}
        for movie2 in movie_vectors:
            if movie1==movie2:
                continue
            similarity=cosine_similarity(movie_vectors[movie1],movie_vectors[movie2])
            movie_similarity[movie1][movie2]=similarity
    return movie_similarity

In [None]:
collabrative_similarity=user_similarities(user_matrix)

# KNN 

In [None]:
def knn_collabrative(movie_id, k=5):
    if movie_id not in collabrative_similarity:
        return []
    similar_movies=sorted(collabrative_similarity[movie_id].items(),key=lambda x:x[1],reverse=True)
    return [movie[0] for movie in similar_movies[:k]]

In [None]:
def collabrative_movies(input_movie, k=5):
    input_movie=input_movie.lower()
    matching_movies=movies_df[movies_df['name'].str.lower()==input_movie.strip()]
    if matching_movies.empty:
        return f"Movie '{input_movie}' not found."
    movie_id=matching_movies.iloc[0]['movieId']  
    recommended_ids=knn_collabrative(movie_id, k)
    recommendations=movies_df[movies_df['movieId'].isin(recommended_ids)][['name','year']]
    return recommendations.to_string(index=False)

In [None]:
input_movie=input("Enter a movie name (with year, e.g., 'Toy Story (1995)'): ")
k=int(input("Enter the number of recommendations you want (e.g., 5): "))

In [None]:
print(f"\nRecommendations for '{input_movie}':")
print(collabrative_movies(input_movie, k))

# Plot of movies

In [None]:
def plot_collaborative_filtering(movie_similarity, movies_df, max_movies=500):
    movie_ids = list(movie_similarity.keys())[:max_movies]  
    num_movies = len(movie_ids)
    similarity_matrix = np.zeros((num_movies, num_movies))

    for i, movie1 in enumerate(movie_ids):
        for j, movie2 in enumerate(movie_ids):
            similarity_matrix[i][j] = movie_similarity[movie1].get(movie2, 0)
    
    distance_matrix = 1 - similarity_matrix
    distance_matrix = np.clip(distance_matrix, 0, None)
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", init="random")
    reduced_data = tsne.fit_transform(distance_matrix)
    
    plt.figure(figsize=(14, 10))
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.7, s=50, cmap='viridis')
    plt.title("Collaborative Filtering: Movie Similarity Clustering")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.show()


In [None]:
plot_collaborative_filtering(collabrative_similarity, movies_df, max_movies=1000)


# Content Based Filtering

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
mlb = MultiLabelBinarizer()
movies_df['genres']=movies_df['genres'].apply(lambda x:x.split('|'))
genre_matrix=mlb.fit_transform(movies_df['genres'])
genre_df=pd.DataFrame(genre_matrix, columns=mlb.classes_)

In [None]:
print(genre_df)

# Movie Similarity using Genre

In [None]:
def compute_movie_similarities(genre_df):
    movie_similarity={}
    for idx, row in genre_df.iterrows():
        movie_similarity[idx]={}
        for idx2, row2 in genre_df.iterrows():
            if idx==idx2:
                continue
            similarity=cosine_similarity(row.to_dict(),row2.to_dict())
            movie_similarity[idx][idx2]=similarity
    return movie_similarity

In [None]:
content_similarity=compute_movie_similarities(genre_df)

In [None]:
def knn_content(movie_idx,k=5):
    if movie_idx not in content_similarity:
        return []
    similar_movies=sorted(content_similarity[movie_idx].items(),key=lambda x:x[1],reverse=True)
    return [movie[0] for movie in similar_movies[:k]]

# KNN

In [None]:
def content_movies(input_movie, k=5):
    input_movie=input_movie.lower()
    matching_movies=movies_df[movies_df['name'].str.lower()==input_movie.split('(')[0].strip()]
    if '(' in input_movie:  
        year=re.search(r"\((\d{4})\)",input_movie)
        if year:
            matching_movies=matching_movies[matching_movies['year']==year.group(1)]
    if matching_movies.empty:
        return f"Movie '{input_movie}' not found."
    movie_idx=matching_movies.index[0]
    recommended_idxs=knn_content(movie_idx, k)
    recommendations=movies_df.iloc[recommended_idxs][['name','year']]
    return recommendations.to_string(index=False)

In [None]:
input_movie=input("Enter a movie name (without year, e.g., 'Toy Story'): ")
k=int(input("Enter the number of recommendations you want (e.g., 5): "))
print(f"\nRecommendations for '{input_movie}':")
print(content_movies(input_movie, k))

# Plot of movies 

In [None]:
def plot_content_based_filtering(content_similarity, movies_df, max_movies=500):
    movie_indices = list(content_similarity.keys())[:max_movies]  # Limit to top `max_movies`
    num_movies = len(movie_indices)
    similarity_matrix = np.zeros((num_movies, num_movies))
    
    for i, movie1 in enumerate(movie_indices):
        for j, movie2 in enumerate(movie_indices):
            similarity_matrix[i][j] = content_similarity[movie1].get(movie2, 0)
    
    distance_matrix = 1 - similarity_matrix
    distance_matrix = np.clip(distance_matrix, 0, None)
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", init="random")
    reduced_data = tsne.fit_transform(distance_matrix)
    
    plt.figure(figsize=(14, 10))
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.7, s=50, cmap='viridis')
    plt.title("Content-Based Filtering: Movie Similarity Clustering")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.show()


In [None]:
plot_content_based_filtering(content_similarity, movies_df, max_movies=1000)

In [None]:
def plot_combined_filtering(collab_similarity, content_similarity, movies_df, max_movies=1000):
    collab_movie_ids = list(collab_similarity.keys())[:max_movies]
    content_movie_ids = list(content_similarity.keys())[:max_movies]
    num_collab_movies = len(collab_movie_ids)
    collab_matrix = np.zeros((num_collab_movies, num_collab_movies))
    for i, movie1 in enumerate(collab_movie_ids):
        for j, movie2 in enumerate(collab_movie_ids):
            collab_matrix[i][j] = collab_similarity[movie1].get(movie2, 0)
    collab_distance_matrix = 1 - collab_matrix
    collab_distance_matrix = np.clip(collab_distance_matrix, 0, None)
    num_content_movies = len(content_movie_ids)
    content_matrix = np.zeros((num_content_movies, num_content_movies))
    for i, movie1 in enumerate(content_movie_ids):
        for j, movie2 in enumerate(content_movie_ids):
            content_matrix[i][j] = content_similarity[movie1].get(movie2, 0)
    content_distance_matrix = 1 - content_matrix
    content_distance_matrix = np.clip(content_distance_matrix, 0, None)
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", init="random")
    collab_reduced = tsne.fit_transform(collab_distance_matrix)
    content_reduced = tsne.fit_transform(content_distance_matrix)
    plt.figure(figsize=(14, 10))
    plt.scatter(collab_reduced[:, 0], collab_reduced[:, 1], alpha=0.7, s=50, c='blue', label='Collaborative Filtering')
    plt.scatter(content_reduced[:, 0], content_reduced[:, 1], alpha=0.7, s=50, c='green', label='Content-Based Filtering')
    plt.title("Overlap of Collaborative and Content-Based Filtering Clusters")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.legend()
    plt.show()

In [None]:
plot_combined_filtering(collabrative_similarity, content_similarity, movies_df, max_movies=1000)