In [13]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
nlp = spacy.load("en_core_web_sm")

In [15]:
import pandas as pd

df = pd.read_csv("../data/cleaned_movies.csv")


In [16]:
def preprocess_spacy(text):
    if not isinstance(text, str):
        return ""
    doc = nlp(text.lower())
    tokens = [
        tok.lemma_
        for tok in doc
        if not tok.is_stop and not tok.is_punct and tok.is_alpha
    ]
    return " ".join(tokens)


Prepare the dataset

In [17]:
df["Title_clean"] = df["Title"].apply(preprocess_spacy)
df["Overview_clean"] = df["Overview"].apply(preprocess_spacy)
df["Genres_clean"] = df["Genres"].apply(preprocess_spacy)
df["Director_clean"] = df["Director"].apply(preprocess_spacy)


In [18]:
df["combined"] = (
    df["Title_clean"] + " " +
    df["Overview_clean"] + " " +
    df["Genres_clean"] + " " +
    df["Director_clean"]
)


In [19]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["combined"])


In [20]:
def search_movie(query, top_n=10):
    query_clean = preprocess_spacy(query)
    query_vec = vectorizer.transform([query_clean])

    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_idx = scores.argsort()[::-1][:top_n]

    results = df.iloc[top_idx].copy()
    results["score"] = scores[top_idx]
    return results[["Title", "Overview", "Genres", "Director", "score"]]


In [21]:
search_movie("action movie with a policeman", 10)

Unnamed: 0,Title,Overview,Genres,Director,score
2222,Disaster Movie,"The filmmaking team behind the hits ""Scary Mov...","Comedy, Science Fiction",Jason Friedberg,0.309964
1711,Grindhouse,Two full-length feature horror movies written ...,"Thriller, Action, Horror",Robert Rodriguez,0.246036
2233,Osmosis Jones,"A white blood cell policeman, with the help of...","Adventure, Animation, Action, Comedy, Family",Bobby Farrelly,0.23389
4574,My Big Fat Independent Movie,"This film is a spoof along the lines of ""Scary...",Comedy,Philip Zlotorynski,0.218793
2936,Narc,When the trail goes cold on a murder investiga...,"Crime, Drama, Thriller, Mystery",Joe Carnahan,0.216102
1724,The Wailing,A stranger arrives in a little village and soo...,"Horror, Mystery",Na Hong-jin,0.187767
3675,Extreme Movie,A sketch comedy movie about the joys and embar...,Comedy,Adam Jay Epstein,0.174492
3382,Silent Movie,"Aspiring filmmakers Mel Funn, Marty Eggs and D...",Comedy,Mel Brooks,0.174097
2044,Wasabi,Hubert is a French policeman with very sharp m...,"Drama, Action, Comedy",Gérard Krawczyk,0.172561
3907,Tim and Eric's Billion Dollar Movie,Two guys get a billion dollars to make a movie...,Comedy,Tim Heidecker,0.168253


In [22]:
search("tarantino action", top_n=10)

Unnamed: 0,Title,Overview,Genres,Director,score
4406,Excessive Force,Chicago policeman Terry McCain is determined t...,Action,Jon Hess,0.25
3029,Showdown in Little Tokyo,"An American with a Japanese upbringing, Chris ...",Action,Mark L. Lester,0.25
4706,Diamond Ruff,"Action - Orphan, con artist, crime boss and mi...",Action,Unknown,0.25
3546,Pound of Flesh,In China to donate his kidney to his dying nie...,Action,Ernie Barbarash,0.25
4027,Men of War,"Nick Gunar is a burnt-out, jaded and hard-up f...",Action,Perry Lang,0.25
3416,Invasion U.S.A.,A one-man army comes to the rescue of the Unit...,Action,Joseph Zito,0.25
3430,The Corruptor,"Danny is a young cop partnered with Nick, a se...",Action,James Foley,0.25
2131,The Man with the Iron Fists,"In feudal China, a blacksmith who makes weapon...",Action,RZA,0.25
3533,Alatriste,"In 17th century Spain Diego Alatriste, a brave...",Action,Agustín Díaz Yanes,0.25
2603,The Marine,A group of diamond thieves on the run kidnap t...,Action,John Bonito,0.25
