In [28]:
import pandas as pd
import numpy as np

# Charger le dataset
df = pd.read_csv("../data/cleaned_movies.csv")

# Sélectionner les colonnes pertinentes
text_cols = ['Title', 'Overview', 'Tagline', 'Genres', 'Keywords', 'Director', 'Cast']
df = df[text_cols].fillna('')

# Fusionner tout en un seul champ texte
df['text'] = df.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# On garde l'ID du document
df = df.reset_index().rename(columns={'index': 'doc_id'})
df.head(2)


Unnamed: 0,doc_id,Title,Overview,Tagline,Genres,Keywords,Director,Cast,text
0,0,Inception,"Cobb, a skilled thief who commits corporate es...",Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","[{""id"": 1014, ""name"": ""loss of lover""}, {""id"":...",Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...","Inception Cobb, a skilled thief who commits co..."
1,1,Interstellar,The adventures of a group of explorers who mak...,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","[{""id"": 83, ""name"": ""saving the world""}, {""id""...",Christopher Nolan,"Matthew McConaughey, Jessica Chastain, Anne Ha...",Interstellar The adventures of a group of expl...


In [29]:
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stopwords = nlp.Defaults.stop_words

def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc
        if token.is_alpha and token.lemma_ not in stopwords and len(token) > 2
    ]
    return ' '.join(tokens)

# Appliquer le prétraitement
tqdm.pandas()
df['clean_text'] = df['text'].progress_apply(preprocess_text)

df[['Title', 'clean_text']].head(2)


100%|██████████| 4771/4771 [01:00<00:00, 78.93it/s] 


Unnamed: 0,Title,clean_text
0,Inception,inception cobb skilled thief commit corporate ...
1,Interstellar,interstellar adventure group explorer use newl...


In [30]:
from collections import defaultdict

inverted_index = defaultdict(set)

for idx, text in enumerate(df['clean_text']):
    for word in set(text.split()):
        inverted_index[word].add(idx)

# Exemple d’un terme
list(inverted_index.items())[:5]


[('idea',
  {0,
   249,
   252,
   751,
   798,
   906,
   913,
   947,
   1025,
   1079,
   1307,
   1593,
   1679,
   1839,
   2186,
   2281,
   2642,
   2687,
   2792,
   2994,
   3004,
   3217,
   3265,
   3382,
   3493,
   3496,
   3576,
   3680,
   3747,
   3879,
   3888,
   3987,
   3994,
   4079,
   4086,
   4130,
   4291,
   4396,
   4635}),
 ('christopher',
  {0,
   1,
   2,
   22,
   34,
   41,
   45,
   83,
   92,
   101,
   142,
   168,
   205,
   220,
   248,
   281,
   380,
   390,
   393,
   395,
   462,
   477,
   584,
   664,
   693,
   769,
   854,
   878,
   926,
   1013,
   1088,
   1162,
   1216,
   1232,
   1234,
   1247,
   1300,
   1350,
   1353,
   1408,
   1506,
   1526,
   1542,
   1597,
   1618,
   1693,
   1761,
   1868,
   1917,
   2063,
   2133,
   2227,
   2232,
   2270,
   2378,
   2388,
   2438,
   2583,
   2628,
   2694,
   2708,
   2709,
   2805,
   2825,
   2834,
   2867,
   2954,
   2971,
   2981,
   2988,
   3054,
   3075,
   3093,
   3123,
   31

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])

# Sauvegarde des noms de features (termes)
terms = vectorizer.get_feature_names_out()

print(tfidf_matrix.shape)  # (nb_docs, nb_termes)


(4771, 10000)


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, top_n=5):
    query_clean = preprocess_text(query)
    query_vec = vectorizer.transform([query_clean])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = cosine_sim.argsort()[-top_n:][::-1]
    results = df.iloc[top_indices][['Title', 'Overview', 'Genres', 'Director']]
    results['score'] = cosine_sim[top_indices]
    return results

# Exemple
search("action nolan christopher", top_n=10)

Unnamed: 0,Title,Overview,Genres,Director,score
83,The Prestige,A mysterious story of two magicians whose inte...,"Drama, Mystery, Science Fiction",Christopher Nolan,0.220981
664,Insomnia,Two Los Angeles homicide detectives are dispat...,"Thriller, Crime",Christopher Nolan,0.216075
0,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure",Christopher Nolan,0.212081
3531,A Mighty Wind,Director Christopher Guest reunites the team f...,"Comedy, Music",Christopher Guest,0.196105
4004,House Party 2,Kid'N'Play leave their neighborhood and enter ...,Comedy,Doug McHenry,0.186838
101,Memento,Leonard Shelby is tracking down the man who ra...,"Mystery, Thriller",Christopher Nolan,0.178274
34,Batman Begins,"Driven by tragedy, billionaire Bruce Wayne ded...","Action, Crime, Drama",Christopher Nolan,0.168458
4666,Hav Plenty,Lee Plenty is an almost broke would-be novelis...,"Comedy, Romance",Christopher Scott Cherot,0.161761
2867,Somewhere in Time,Young writer Richard Collier is met on the ope...,"Drama, Fantasy, Romance",Jeannot Szwarc,0.146069
2,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller",Christopher Nolan,0.145774


In [47]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(query, top_n=10):
    query_clean = preprocess_text(query)
    query_terms = query_clean.split()
    query_vec = vectorizer.transform([query_clean])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Step 1: compute initial ranking
    top_indices = cosine_sim.argsort()[::-1]
    results = df.iloc[top_indices][['Title', 'Overview', 'Genres', 'Director']].copy()
    results['score'] = cosine_sim[top_indices]

    # Step 2: hard filter — require all query words to appear somewhere in the text
    def contains_all_terms(text):
        text = text.lower()
        return all(term in text for term in query_terms)

    results['matches_all_terms'] = df.loc[top_indices, 'clean_text'].apply(contains_all_terms)

    # Step 3: boost results that satisfy all terms
    results['final_score'] = results.apply(
        lambda x: x['score'] * (1.5 if x['matches_all_terms'] else 1.0),
        axis=1
    )

    # Step 4: re-sort by final score
    results = results.sort_values(by='final_score', ascending=False)

    # Return top N results
    return results.head(top_n)[['Title', 'Genres', 'Director', 'final_score']]

search("tarantino action", top_n=10)




Unnamed: 0,Title,Genres,Director,final_score
1711,Grindhouse,"Thriller, Action, Horror",Robert Rodriguez,0.292913
116,Kill Bill: Vol. 2,"Action, Crime, Thriller",Quentin Tarantino,0.27517
65,Kill Bill: Vol. 1,"Action, Crime",Quentin Tarantino,0.271424
519,From Dusk Till Dawn,"Horror, Action, Thriller, Crime",Robert Rodriguez,0.188689
110,The Hateful Eight,"Drama, Mystery, Western",Quentin Tarantino,0.178753
473,Jackie Brown,Crime,Quentin Tarantino,0.159087
12,Django Unchained,"Drama, Western",Quentin Tarantino,0.152173
8,Pulp Fiction,"Thriller, Crime",Quentin Tarantino,0.140895
109,Reservoir Dogs,"Crime, Thriller",Quentin Tarantino,0.132534
25,Inglourious Basterds,"Drama, Thriller, War",Quentin Tarantino,0.103098
