In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the master movie dataset; the CSV must sit in the current working directory
df = pd.read_csv(filepath_or_buffer="master_dataset_final.csv")

In [5]:
# Word-level bag-of-words vectorizer over unigrams and bigrams; terms that
# occur in fewer than 3 documents are left out of the vocabulary (min_df=3)
vectorizer = CountVectorizer(
    min_df=3,
    ngram_range=(1, 2),
    analyzer='word',
)

In [6]:
# Generate count matrix from 'soup' column.
# CountVectorizer rejects NaN documents with a ValueError, so rows with a
# missing 'soup' are treated as empty text; this keeps one matrix row per
# DataFrame row, so row positions still line up with df.
count_matrix = vectorizer.fit_transform(df['soup'].fillna(''))

In [7]:
# Pairwise cosine similarity between every pair of rows of the count matrix
cosine_sim = cosine_similarity(count_matrix)

# Persist the similarity matrix as CSV (row index omitted) for later reuse
cosine_df = pd.DataFrame(data=cosine_sim)
cosine_df.to_csv(path_or_buf="cosine_similarity_matrix.csv", index=False)

In [8]:
# Reverse mapping of movie titles to DataFrame indices.
# drop=True makes this cell idempotent: a plain reset_index() inserts the old
# index as a new column on every run ('index', then 'level_0') and raises
# ValueError on the third run of the cell.
df = df.reset_index(drop=True)
# Series keyed by title whose values are the row labels of df; a title that
# appears more than once maps to several entries.
indices = pd.Series(df.index, index=df['title'])

In [9]:
# Recommendation function
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the matrix that existed when this
        function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame or str
        A DataFrame with columns 'Movie Title', 'Director' and 'Release Date',
        ordered from most to least similar, or an explanatory message string
        when ``title`` is not present in ``indices``.
    """
    if title not in indices:
        return f"Movie '{title}' not found in the dataset."

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # row labels; fall back to the first occurrence so the row lookup below
    # stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the queried movie explicitly rather than assuming it sorts first:
    # an earlier row with an identical score (or an all-zero vector) would
    # otherwise let the movie show up in its own recommendations.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # Stable descending sort keeps the original row order among ties.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    output_df = df.loc[movie_indices, ['title', 'main_director', 'release_date']]
    output_df.columns = ['Movie Title', 'Director', 'Release Date']
    return output_df.reset_index(drop=True)

In [10]:
# Smoke test: fetch and show the closest matches for a known title
recommendations = get_recommendations(title="Avatar")
print(recommendations)

                                   Movie Title        Director Release Date
0                                       Aliens   James Cameron   18-07-1986
1                                      Titanic   James Cameron   18-11-1997
2                                    True Lies   James Cameron   14-07-1994
3                               The Terminator   James Cameron   26-10-1984
4                   Terminator 2: Judgment Day   James Cameron   01-07-1991
5                      Star Trek Into Darkness     J.J. Abrams   05-05-2013
6                                         Home     Tim Johnson   18-03-2015
7                                Battle Royale  Kinji Fukasaku   16-12-2000
8          Behind Enemy Lines II: Axis of Evil    James Dodson   17-10-2006
9  Pirates of the Caribbean: On Stranger Tides    Rob Marshall   14-05-2011
