In [1]:
import pandas as pd
import numpy as np

# Charger le dataset
df = pd.read_csv("../data/cleaned_movies.csv")

# Sélectionner les colonnes pertinentes
text_cols = ['Title', 'Overview', 'Tagline', 'Genres', 'Keywords', 'Director', 'Cast']
df = df[text_cols].fillna('')

# Fusionner tout en un seul champ texte
df['text'] = df.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# On garde l'ID du document
df = df.reset_index().rename(columns={'index': 'doc_id'})
df.head(2)


Unnamed: 0,doc_id,Title,Overview,Tagline,Genres,Keywords,Director,Cast,text
0,0,Inception,"Cobb, a skilled thief who commits corporate es...",Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","[{""id"": 1014, ""name"": ""loss of lover""}, {""id"":...",Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...","Inception Cobb, a skilled thief who commits co..."
1,1,Interstellar,The adventures of a group of explorers who mak...,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","[{""id"": 83, ""name"": ""saving the world""}, {""id""...",Christopher Nolan,"Matthew McConaughey, Jessica Chastain, Anne Ha...",Interstellar The adventures of a group of expl...


In [2]:
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stopwords = nlp.Defaults.stop_words

def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc
        if token.is_alpha and token.lemma_ not in stopwords and len(token) > 2
    ]
    return ' '.join(tokens)

# Appliquer le prétraitement
tqdm.pandas()
df['clean_text'] = df['text'].progress_apply(preprocess_text)

df[['Title', 'clean_text']].head(2)


100%|██████████| 4771/4771 [00:59<00:00, 80.18it/s] 


Unnamed: 0,Title,clean_text
0,Inception,inception cobb skilled thief commit corporate ...
1,Interstellar,interstellar adventure group explorer use newl...


In [3]:
from collections import defaultdict

inverted_index = defaultdict(set)

for idx, text in enumerate(df['clean_text']):
    for word in set(text.split()):
        inverted_index[word].add(idx)

# Exemple d’un terme
list(inverted_index.items())[:5]


[('espionage',
  {0,
   284,
   344,
   346,
   352,
   512,
   561,
   757,
   1219,
   1371,
   1587,
   2291,
   2335,
   2546,
   2858,
   3303,
   3462}),
 ('life',
  {0,
   4096,
   4100,
   5,
   2053,
   4102,
   2056,
   9,
   10,
   4105,
   12,
   13,
   4107,
   15,
   4108,
   4112,
   2066,
   2067,
   4117,
   24,
   26,
   4122,
   2076,
   4123,
   4124,
   4127,
   33,
   34,
   35,
   2084,
   2085,
   4133,
   2087,
   2088,
   41,
   2089,
   43,
   2091,
   45,
   2092,
   4134,
   4136,
   2097,
   2098,
   51,
   4141,
   53,
   4142,
   55,
   4143,
   4144,
   58,
   4148,
   60,
   2109,
   4154,
   63,
   2111,
   4161,
   67,
   2116,
   4167,
   4171,
   77,
   2125,
   2126,
   4174,
   2130,
   83,
   84,
   86,
   87,
   2135,
   2136,
   4184,
   91,
   4187,
   94,
   95,
   2143,
   2144,
   2146,
   4191,
   100,
   101,
   2149,
   2150,
   4192,
   4195,
   2154,
   107,
   4196,
   4197,
   2158,
   4201,
   2160,
   2161,
   4206,
   115,
   421

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])

# Sauvegarde des noms de features (termes)
terms = vectorizer.get_feature_names_out()

print(tfidf_matrix.shape)  # (nb_docs, nb_termes)


(4771, 10000)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, top_n=5):
    query_clean = preprocess_text(query)
    query_vec = vectorizer.transform([query_clean])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = cosine_sim.argsort()[-top_n:][::-1]
    results = df.iloc[top_indices][['Title', 'Overview', 'Genres', 'Director']]
    results['score'] = cosine_sim[top_indices]
    return results

# Exemple
search("tarantino action", top_n=10)

Unnamed: 0,Title,Overview,Genres,Director,score
1711,Grindhouse,Two full-length feature horror movies written ...,"Thriller, Action, Horror",Robert Rodriguez,0.195276
116,Kill Bill: Vol. 2,The Bride unwaveringly continues on her roarin...,"Action, Crime, Thriller",Quentin Tarantino,0.183447
65,Kill Bill: Vol. 1,"An assassin is shot by her ruthless employer, ...","Action, Crime",Quentin Tarantino,0.18095
110,The Hateful Eight,Bounty hunters seek shelter from a raging bliz...,"Drama, Mystery, Western",Quentin Tarantino,0.178753
473,Jackie Brown,Jackie Brown is a flight attendant who gets ca...,Crime,Quentin Tarantino,0.159087
12,Django Unchained,"With the help of a German bounty hunter, a fre...","Drama, Western",Quentin Tarantino,0.152173
8,Pulp Fiction,"A burger-loving hit man, his philosophical par...","Thriller, Crime",Quentin Tarantino,0.140895
109,Reservoir Dogs,A botched robbery indicates a police informant...,"Crime, Thriller",Quentin Tarantino,0.132534
519,From Dusk Till Dawn,Seth Gecko and his younger brother Richard are...,"Horror, Action, Thriller, Crime",Robert Rodriguez,0.125793
25,Inglourious Basterds,"In Nazi-occupied France during World War II, a...","Drama, Thriller, War",Quentin Tarantino,0.103098
