### Installing Libraries

In [19]:
pip install pandas scikit-learn




### Load the dataset

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def load_dataset(csv_path: str = "movies_dataset.csv") -> pd.DataFrame:
    """
    Loads the dataset into a pandas DataFrame and adds a "combined_text" column
    that joins the text fields of each row with single spaces (used for TF-IDF).

    input:
    csv_path - path of the CSV file to read; it must contain the columns
               original_title, overview, keywords, genres, cast and director
               NOTE(review): the default file name differs from the
               "movie_dataset.csv" passed by the notebook's own call - confirm
               which one is intended.

    output:
    df - the dataset, with the extra "combined_text" column

    raises:
    KeyError - if one of the required columns is missing from the file
    """
    df = pd.read_csv(csv_path)

    #columns that feed the combined text, in the order they are joined
    text_columns = ["original_title", "overview", "keywords", "genres", "cast", "director"]

    #missing values become empty strings; astype(str) keeps the concatenation
    #from raising TypeError when a column was parsed as a non-string type
    parts = [df[column].fillna("").astype(str) for column in text_columns]

    #create a combined text column for TF-IDF
    combined_text = parts[0]
    for part in parts[1:]:
        combined_text = combined_text + " " + part
    df["combined_text"] = combined_text

    return df

#read the movie CSV and attach the combined text column
df = load_dataset(csv_path="movie_dataset.csv")


### Build the TF-IDF Matrix

In [21]:
def build_tfidf_matrix(text_series):
    """
    Creates a TfidfVectorizer configured with English stop words, fits it on the
    given texts and transforms those same texts into TF-IDF vectors.

    input:
    text_series - the text documents to vectorize (the combined text column)

    output:
    vectorizer - the fitted vectorizer
    tfidf_matrix - the TF-IDF matrix computed for text_series
    """
    tfidf = TfidfVectorizer(stop_words="english")
    #fit_transform fits the vectorizer in place, so the returned object is already fitted
    return tfidf, tfidf.fit_transform(text_series)

#fit the vectorizer on the combined text and keep both it and the resulting matrix
vectorizer, tfidf_matrix = build_tfidf_matrix(text_series=df["combined_text"])


### Define the Recommendation Function

In [22]:
def get_recommendations(user_query, df, vectorizer, tfidf_matrix, top_n=5):
    """
    Given a user query, transform it into TF-IDF and score it against every item
    in the dataset. Return up to top_n items, best match first.

    The score is the linear kernel (dot product) between the query vector and
    each item vector; this equals the cosine similarity when the vectors are
    L2-normalised, which is TfidfVectorizer's default.

    Items whose score is 0 share no vocabulary term with the query and are left
    out, so fewer than top_n results (or none) may be returned.

    input:
    user_query - the user query
    df - the dataset; its rows must be in the same order as the rows of tfidf_matrix
    vectorizer - the fitted vectorizer
    tfidf_matrix - the TF-IDF matrix
    top_n - the maximum number of recommendations to return
            (negative values are treated as 0, None means no limit)

    output:
    results - the list of recommendations, one dict per item with the keys
              movie_id, original_title, keywords, overview, genres, similarity_score
    """

    #a negative bound would be read by the slice below as "drop the last k items"
    #and return almost the whole dataset
    if top_n is not None and top_n < 0:
        top_n = 0

    #transform user query to a TF-IDF vector
    query_vec = vectorizer.transform([user_query])

    #compute the similarity between query_vec and all movie vectors
    cosine_similarities = linear_kernel(query_vec, tfidf_matrix).flatten()

    #get the indices of the top_n similar items (descending order of similarity)
    similar_indices = cosine_similarities.argsort()[::-1][:top_n]

    #build the final list of recommendations
    results = []
    for idx in similar_indices:
        score = float(cosine_similarities[idx])
        if score <= 0.0:
            #indices are sorted by descending score, so nothing after this one matches either
            break

        #fetch the row once instead of once per field
        row = df.iloc[idx]

        results.append({
            "movie_id": df.index[idx],
            "original_title": row["original_title"],
            "keywords": row["keywords"],
            "overview": row["overview"],
            "genres": row["genres"],
            "similarity_score": score
        })
    return results


### Run the Recommendation System

In [23]:
#ask the user what kind of movie they are looking for
user_query = input("Please enter your movie preference: \n")

#rank the dataset against the query
recommendations = get_recommendations(
    user_query=user_query,
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    top_n=5,
)

#echo the query, then list every match with its details
print(
    f"\nUser Query: {user_query}\n",
    f"Top {len(recommendations)} Recommendations:",
    sep="\n",
)
for i, rec in enumerate(recommendations, start=1):
    print(
        f"{i}. {rec['original_title']} (ID: {rec['movie_id']}, score: {rec['similarity_score']:.4f})",
        f"   Keywords: {rec['keywords']}",
        f"   Overview: {rec['overview']}",
        f"   Genres: {rec['genres']}\n",
        sep="\n",
    )


Please enter your movie preference: 
I like horror movies with the beach 

User Query: I like horror movies with the beach 

Top 5 Recommendations:
1. Évolution (ID: 474, score: 0.1704)
   Keywords: nurse sea beach boy pregnant
   Overview: 11-year-old Nicolas lives with his mother in a seaside housing estate. The only place that ever sees any activity is the hospital. It is there that all the boys from the village are forced to undergo strange medical trials that attempt to disrupt the phases of evolution.
   Genres: Mystery Drama Horror

2. The Haunting (ID: 451, score: 0.1240)
   Keywords: based on novel trauma castle haunted house insomnia
   Overview: Dr. David Marrow invites Nell Vance, and Theo and Luke Sanderson to the eerie and isolated Hill House to be subjects for a sleep disorder study. The unfortunate guests discover that Marrow is far more interested in the sinister mansion itself – and they soon see the true nature of its horror.
   Genres: Horror Thriller Fantasy Myster