In [2]:
import numpy as np
import pandas as pd

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movies = movies.merge(credits,on='title')


In [5]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]


In [6]:
import ast
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name'])
    return L
movies.dropna(inplace=True)
movies['genres'] = movies['genres'].apply(convert)


In [7]:
movies['keywords'] = movies['keywords'].apply(convert)


In [8]:
def convert3(text):
    L = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            L.append(i['name'])
        counter+=1
    return L
movies['cast'] = movies['cast'].apply(convert)


In [9]:
movies['cast'] = movies['cast'].apply(lambda x:x[0:3])
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L
movies['crew'] = movies['crew'].apply(fetch_director)

In [10]:
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

In [11]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

vector = cv.fit_transform(new['tags']).toarray()

from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [13]:
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new.iloc[i[0]].title)

In [19]:
recommend('Avatar')

Titan A.E.
Small Soldiers
Ender's Game
Aliens vs Predator: Requiem
Independence Day


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(new['tags'])

# Compute cosine similarity between all pairs of movies
cosine_sim = cosine_similarity(tfidf_matrix)

def recommend1(movie):
    index = new[new['title'] == movie].index[0]
    distances = sorted(list(enumerate(cosine_sim[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new.iloc[i[0]].title)

In [18]:
recommend1('Avatar')

Aliens
Battle: Los Angeles
Falcon Rising
Star Trek Into Darkness
Apollo 18


In [15]:
#Kmeans Clustering on vetorised data
from sklearn.cluster import KMeans

# Let's assume we have the vectorized movie features in the variable 'X'
# Tf is your movie feature matrix (e.g., TF-IDF or Count Vectorized matrix)
new1 = new.copy()
Tf = tfidf_matrix
# Number of clusters = 5

# Fit KMeans on the feature matrix
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(Tf.toarray())  # Convert sparse matrix to dense if needed

# Get the cluster labels
new1['cluster'] = kmeans.labels_

# Function to get similar movies from the same cluster
def recommend_movies_kmeans(movie_title, num_recommendations=5):
    # Find the cluster of the input movie
    movie_cluster = new1[new1['title'] == movie_title]['cluster'].values[0]

    # Get the movies in the same cluster
    similar_movies = new1[new1['cluster'] == movie_cluster]

    # Remove the input movie from the similar movies list
    similar_movies = similar_movies[similar_movies['title'] != movie_title]

    # Return top N similar movies (you can sort by features like vote average, popularity, etc. if you want)
    print(similar_movies[ 'title'].head(num_recommendations))

# Example: Get top 5 movie recommendations for movie with ID 9889
recommend_movies_kmeans('Avatar')


2                               Spectre
3                 The Dark Knight Rises
4                           John Carter
7               Avengers: Age of Ultron
9    Batman v Superman: Dawn of Justice
Name: title, dtype: object


In [16]:
#KNN neighbours using cosine similarity
# Importing necessary libraries
from sklearn.neighbors import NearestNeighbors
new2 = new.copy()
Y= tfidf_matrix

# Fit the KNN model on the feature matrix
knn = NearestNeighbors(n_neighbors=6, metric='cosine')  # n_neighbors=6 to include the input movie itself
knn.fit(Y.toarray())  # Convert sparse matrix to dense if needed

# Function to get similar movies using KNN
def recommend_movies_knn(movie_title, num_recommendations=5):
    # Get the index of the movie based on its title
    movie_idx = new2[new2['title'] == movie_title].index[0]
    movie_vector = Y[movie_idx].toarray()

    # Find the nearest neighbors
    distances, indices = knn.kneighbors(movie_vector, n_neighbors=num_recommendations + 1)  # +1 to include the input movie itself

    # Get the recommended movie indices (excluding the input movie)
    recommended_movie_indices = indices[0][1:]  # Skip the first index (which is the movie itself)
    recommended_movies =new2.iloc[recommended_movie_indices]['title']

    print(recommended_movies)

# Example: Get top 5 movie recommendations for a movie with title "Shallow Hal"
recommend_movies_knn('Avatar')

#Using the Manhattan & Minkowski Distances produced bad results when recommending movies simialr to Avatar


2405                     Aliens
582         Battle: Los Angeles
3729              Falcon Rising
47      Star Trek Into Darkness
3607                  Apollo 18
Name: title, dtype: object


In [19]:
#Using Autoencoders
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Example: Assume movie_data contains the movie features (e.g., TF-IDF or CountVectorizer features)
# movie_data is a DataFrame with movie_id and vectorized features like genres, keywords, etc.
Z = tfidf_matrix
new3 = new.copy()
# Normalize the data
scaler = MinMaxScaler()
Z_scaled = scaler.fit_transform(Z.toarray())  # Z is your movie feature matrix (e.g., TF-IDF)

# Define Autoencoder Architecture
input_dim = Z_scaled.shape[1]  # Number of features (e.g., TF-IDF vector size)
encoding_dim = 50  # Dimension of the latent space (bottleneck)

# Define the model
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)

# Encoder model for movie feature extraction
encoder = Model(input_layer, encoded)

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='mse')

# Train the autoencoder model
autoencoder.fit(Z_scaled, Z_scaled, epochs=10, batch_size=256, shuffle=True)

# Use the encoder to get the compressed representation of the movies
encoded_movies = encoder.predict(Z_scaled)

# Function to recommend movies based on cosine similarity of the encoded representations
def recommend_movies_autoencoder(movie_title, num_recommendations=5):
    movie_idx = new3[new3['title'] == movie_title].index[0]

    # Get the encoded representation for the input movie
    movie_vector = encoded_movies[movie_idx].reshape(1, -1)

    # Calculate cosine similarity between the input movie and all other movies
    similarities = cosine_similarity(movie_vector, encoded_movies)

    # Get indices of the top recommended movies (excluding the input movie itself)
    similar_movies_idx = similarities[0].argsort()[-(num_recommendations+1):-1]

    recommended_movies = new3.iloc[similar_movies_idx][ 'title']

    print(recommended_movies)

# Example: Get top 5 movie recommendations for movie with ID 9889
recommend_movies_autoencoder('Avatar')


Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 351ms/step - loss: 0.2488
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 287ms/step - loss: 0.2357
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 359ms/step - loss: 0.1475
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 326ms/step - loss: 0.0322
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 290ms/step - loss: 0.0069
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 290ms/step - loss: 0.0035
Epoch 7/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 482ms/step - loss: 0.0024
Epoch 8/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 532ms/step - loss: 0.0019
Epoch 9/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 286ms/step - loss: 0.0017
Epoch 10/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 343ms/step 

In [None]:
#As per thre results I feel the best results are coming from KNN Neighbours with cosine similarity measure as the metric of evaluation