# In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, concatenate
from sklearn.metrics.pairwise import cosine_similarity


# Read the TMDB export and, in the same expression, replace every missing
# 'overview' with an empty string so each row holds text.
df = pd.read_csv('TMDB_movie_dataset_v11.csv').assign(
    overview=lambda frame: frame['overview'].fillna(''),
)

# Assume genres are stored as strings and handle missing or malformed entries
def process_genres(genre_list):
    """Normalise one raw 'genres' cell into a list of genre names.

    Args:
        genre_list: A comma-separated string such as ``"Action, Drama"``,
            an already-parsed list, or any other value (for example the
            NaN pandas stores for a missing cell).

    Returns:
        A list of genre names. A string is split on commas, each name is
        stripped of surrounding whitespace, and empty names are dropped.
        A list is returned unchanged. Any other value yields ``[]``.
    """
    if isinstance(genre_list, str):
        # Split on the bare comma and strip each piece instead of splitting
        # on ', ': that way "" does not become [''] (a bogus empty genre)
        # and "Action,Drama" or padded names are still separated cleanly.
        names = (part.strip() for part in genre_list.split(','))
        return [name for name in names if name]
    if isinstance(genre_list, list):
        return genre_list
    return []  # Missing or non-string, non-list value

# Turn every raw 'genres' cell into a list of genre names.
df['genres'] = df['genres'].map(process_genres)

# TF-IDF encoding of the 'overview' text, with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['overview'])

# Binary indicator encoding of the genre lists.
mlb = MultiLabelBinarizer()
genres_matrix = mlb.fit_transform(df['genres'])

# Put the text columns and the genre columns side by side, one row per movie.
# NOTE(review): toarray() materialises the TF-IDF matrix densely, so memory
# grows with rows x vocabulary size -- confirm this fits for the full dataset.
combined_features = np.concatenate(
    [tfidf_matrix.toarray(), genres_matrix],
    axis=1,
)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test = train_test_split(
    combined_features,
    random_state=42,
    test_size=0.2,
)

# Network definition: the output layer is exactly as wide as the input.
# Number of columns in one training row.
input_shape = X_train.shape[1]

# Entry point of the network
input_layer = Input(shape=(input_shape,))

# Two ReLU layers (512 units, then 256), each followed by 50% dropout
hidden_layer_1 = Dense(units=512, activation='relu')(input_layer)
dropout_1 = Dropout(rate=0.5)(hidden_layer_1)
hidden_layer_2 = Dense(units=256, activation='relu')(dropout_1)
dropout_2 = Dropout(rate=0.5)(hidden_layer_2)

# Sigmoid output with one unit per input column
output_layer = Dense(units=input_shape, activation='sigmoid')(dropout_2)

# Assemble the model and configure Adam with a binary cross-entropy loss
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam')

# Fit for 5 epochs in batches of 256. The training rows are passed as both
# input and target, so the network is trained to reproduce its input.
model.fit(x=X_train, y=X_train, batch_size=256, epochs=5)

# Run every row of the full feature matrix through the trained network; each
# output row is used as that movie's representation.
# NOTE(review): these are the output-layer (sigmoid) values, as wide as the
# input, not the 256-unit hidden activations -- confirm that is intended.
movie_representations = model.predict(x=combined_features)

# Define a function to get the top 5 similar movies using movie_id
def recommend_movies(movie_id, movie_representations, df, top_n=5):
    """Return the titles of the movies most similar to ``movie_id``.

    Similarity is the cosine similarity between the query movie's row of
    ``movie_representations`` and every other row.

    Args:
        movie_id: Value to look up in ``df['id']``. If several rows match,
            the first one is used.
        movie_representations: 2-D array whose i-th row belongs to the i-th
            row (by position) of ``df``.
        df: DataFrame with at least the columns ``'id'`` and ``'title'``.
        top_n: Maximum number of titles to return (default 5).

    Returns:
        A list of up to ``top_n`` titles, most similar first. The query
        movie itself is never included.

    Raises:
        IndexError: If ``movie_id`` does not occur in ``df['id']``.
    """
    # Look the movie up by row *position*, not by index label: the position
    # is what lines up with movie_representations and with .iloc below, and
    # the two differ whenever df does not carry a default RangeIndex.
    matches = np.flatnonzero((df['id'] == movie_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie_id {movie_id!r} not found in df['id']")
    movie_index = int(matches[0])

    # Similarity of the query row against every row (including itself)
    sim_scores = cosine_similarity(
        movie_representations[movie_index:movie_index + 1],
        movie_representations,
    )[0]

    # Rank best-first and drop the query row explicitly. Slicing off "the
    # top entry" is not safe: with duplicate or all-zero rows the query is
    # not guaranteed to be the single highest score.
    ranked = np.argsort(sim_scores)[::-1]
    top_indices = ranked[ranked != movie_index][:max(top_n, 0)]

    return df['title'].iloc[top_indices].tolist()

# Demo: the five titles closest to the movie whose 'id' is 27205
recommended_movies = recommend_movies(
    movie_id=27205,
    movie_representations=movie_representations,
    df=df,
)
print(recommended_movies)