In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle

In [2]:
# Read the movie table from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

In [3]:
# Step 1: Data Preprocessing
# Replace missing 'overview' and 'genre' entries with empty strings so the
# text vectorizer downstream never receives NaN.
dataset[['overview', 'genre']] = dataset[['overview', 'genre']].fillna('')

In [4]:
# Step 2: TF-IDF vectorization of the 'overview' column (movie descriptions).
# English stop words are dropped and the vocabulary is capped at 3000 terms
# to keep the resulting matrix memory efficient.
tfidf = TfidfVectorizer(
    max_features=3000,
    stop_words='english',
)
tfidf_matrix = tfidf.fit_transform(dataset['overview'])

In [5]:
# Step 3: Nearest-neighbour index
# Neighbours are looked up per query with a brute-force search under the
# cosine metric, so no full movie-by-movie similarity matrix is ever stored.
n_neighbors = 10  # default number of neighbours returned per query
knn = NearestNeighbors(
    n_neighbors=n_neighbors,
    algorithm='brute',
    metric='cosine',
)


In [6]:
# Index the TF-IDF rows in the neighbour model (one row per movie).
knn.fit(X=tfidf_matrix)

In [7]:
# Step 4: Bundle everything the recommender needs at query time.
model_data = {}
# Fitted neighbour model used to look up similar movies.
model_data['knn_model'] = knn
# Title column, used to turn neighbour row numbers back into titles.
model_data['movie_titles'] = dataset['title']
# TF-IDF rows, used to build the query vector for a known movie.
model_data['tfidf_matrix'] = tfidf_matrix
# Title -> dataset index lookup (for repeated titles the last row wins).
model_data['indices'] = pd.Series(dataset.index, index=dataset['title']).to_dict()

In [8]:
# Step 5: Persist the bundle to disk using the newest pickle protocol
# available in this interpreter.
with open('movie_recommendation_knn_model.pkl', mode='wb') as f:
    pickle.dump(model_data, f, pickle.HIGHEST_PROTOCOL)

In [9]:
# Confirmation message naming the pickle file written by the previous cell.
print("Model has been saved as 'movie_recommendation_knn_model.pkl'.")

Model has been saved as 'movie_recommendation_knn_model.pkl'.


In [10]:
def get_recommendations(title, model_data, n_recommendations=5):
    """
    Get movie recommendations based on a given movie title, case-insensitive.

    The query movie is removed from the neighbour list by its row index rather
    than by assuming it is always the first neighbour returned. When several
    rows tie on distance (for example identical overviews), the query row is
    not guaranteed to come back first, and dropping position 0 blindly could
    discard a real recommendation while keeping the query movie itself.

    Parameters:
    - title (str): The title of the movie to base the recommendations on.
    - model_data (dict): Dictionary with the keys 'knn_model' (fitted neighbour
      model exposing ``kneighbors``), 'movie_titles' (pandas Series of titles),
      'tfidf_matrix' (one row per movie) and 'indices' (title -> row index).
    - n_recommendations (int): Maximum number of movie recommendations to return.

    Returns:
    - list: Recommended movie titles, nearest first. Fewer than
      ``n_recommendations`` titles are returned when the dataset does not
      contain enough other movies.
    - str: The message "Movie not found in the dataset." when no title matches.
    """
    # Unpack the saved model bundle.
    knn = model_data['knn_model']
    movie_titles = model_data['movie_titles']
    tfidf_matrix = model_data['tfidf_matrix']
    title_to_index = model_data['indices']

    # Case-insensitive lookup: lowercase both the query and every known title.
    title = title.lower()
    indices_lower = {k.lower(): v for k, v in title_to_index.items()}

    if title not in indices_lower:
        return "Movie not found in the dataset."

    # Row of the query movie in the TF-IDF matrix.
    idx = indices_lower[title]

    # Ask for one extra neighbour to leave room for the query movie itself,
    # but never for more neighbours than there are rows, which the neighbour
    # model would reject.
    n_query = min(n_recommendations + 1, tfidf_matrix.shape[0])
    _, neighbor_rows = knn.kneighbors(tfidf_matrix[idx], n_neighbors=n_query)

    # Drop the query movie wherever it appears, then keep the closest matches.
    candidates = [row for row in neighbor_rows.flatten() if row != idx]
    return [movie_titles.iloc[row] for row in candidates[:n_recommendations]]

In [11]:
# Read the saved bundle back from disk to exercise the recommender.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source (here, the file written earlier in this notebook).
with open('movie_recommendation_knn_model.pkl', mode='rb') as f:
    loaded_model = pickle.load(f)

In [15]:
# Example: ask for the five movies closest to 'extraction' and show them.
recommended_movies = get_recommendations(
    title='extraction',
    model_data=loaded_model,
    n_recommendations=5,
)
print("Recommended Movies:", recommended_movies)

Recommended Movies: ['Shaft', 'Berserk: The Golden Age Arc III - The Advent', '6 Bullets', '4 Months, 3 Weeks and 2 Days', 'Escape Plan: The Extractors']
