In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the movie dataset from a CSV file; the path is relative, so the file
# must be present in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="master_dataset_final.csv")

In [2]:
# Word-level bag-of-words vectorizer: counts unigrams and bigrams, and drops
# any term that appears in fewer than 3 documents.
vectorizer = CountVectorizer(
    min_df=3,
    ngram_range=(1, 2),
    analyzer="word",
)

In [3]:
# Generate count matrix from 'soup' column.
# Missing values are replaced with an empty string first: CountVectorizer
# rejects NaN documents with a ValueError, whereas an empty document simply
# yields an all-zero row and keeps row positions aligned with `df`.
count_matrix = vectorizer.fit_transform(df['soup'].fillna(''))

In [4]:
# Pairwise cosine similarity between all rows of the count matrix
# (row i / column j compares document i with document j).
cosine_sim = cosine_similarity(count_matrix)

# Wrap the similarity matrix in a DataFrame and write it to disk as CSV,
# omitting the row-index column.
cosine_df = pd.DataFrame(data=cosine_sim)
cosine_df.to_csv(path_or_buf="cosine_similarity_matrix.csv", index=False)

In [5]:
# Reverse mapping of movie titles to DataFrame indices.
# drop=True keeps this cell idempotent: a plain reset_index() inserts the old
# index as a new 'index' column on every run ('level_0' on the second) and
# raises ValueError on the third. The positional 0..n-1 index is all that the
# lookup below needs.
df = df.reset_index(drop=True)
indices = pd.Series(df.index, index=df['title'])

In [12]:
# Recommendation function
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` mapping.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` is read as the similarity of
        movie ``i`` to every other movie. Defaults to the matrix that was
        bound to ``cosine_sim`` when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame or str
        A DataFrame with columns ``'Movie Title'``, ``'Director'`` and
        ``'Release Date'``, ordered from most to least similar, or an
        explanatory message string when ``title`` is not in the dataset.
    """
    if title not in indices.index:
        return f"Movie '{title}' not found in the dataset."

    # Handle potential duplicate titles by taking the first index
    match = indices[title]
    idx = match.iloc[0] if isinstance(match, pd.Series) else match

    # Exclude the queried movie by its position instead of assuming it sorts
    # first: with tied scores (e.g. another movie with an identical soup) or
    # an all-zero row, slicing off element 0 would drop a genuine neighbour
    # and could leave the queried movie in its own recommendations.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [i for i, _ in sim_scores[:max(int(top_n), 0)]]

    output_df = df.loc[movie_indices, ['title', 'main_director', 'release_date']]
    output_df.columns = ['Movie Title', 'Director', 'Release Date']
    return output_df.reset_index(drop=True)

In [13]:
# Smoke test: request recommendations for one title and print the result
# (a table of similar movies, or the not-found message).
recommendations = get_recommendations(title="Titanic")
print(recommendations)

                           Movie Title        Director Release Date
0                            True Lies   James Cameron   14-07-1994
1                               Aliens   James Cameron   18-07-1986
2                       The Terminator   James Cameron   26-10-1984
3           Terminator 2: Judgment Day   James Cameron   01-07-1991
4                               Avatar   James Cameron   10-12-2009
5                             Godzilla    Ishirô Honda   03-11-1954
6                               Storks  Doug Sweetland   22-09-2016
7                            Quo Vadis    Mervyn LeRoy   08-11-1951
8       Thunder and the House of Magic     Ben Stassen   24-12-2013
9  A Turtle's Tale: Sammy's Adventures     Ben Stassen   03-08-2010
