In [6]:
import os
import json
import math
import re
import numpy as np
import pandas as pd
from collections import defaultdict
import spacy
import glob

In [7]:
# Fonction pour charger tous les JSON d’un dossier
def load_movies_from_json_folder(folder_path):
    json_pattern = os.path.join(folder_path, "*.json")
    json_files = glob.glob(json_pattern)
    
    if not json_files:
        raise ValueError(f"Aucun fichier .json trouvé dans {folder_path}")
    
    print(f"{len(json_files)} fichiers JSON trouvés. Chargement en cours...")
    
    records = []
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            records.append(data)
        except Exception as e:
            print(f"Erreur avec {file_path} : {e}")
    
    df = pd.DataFrame(records)
    
    # Colonnes obligatoires (on crée les manquantes vides)
    required = ['Title', 'Director', 'Genres', 'Overview', 'Release_Date']
    for col in required:
        if col not in df.columns:
            df[col] = ""
    
    # Créer un ID stable (très utile quand on recharge)
    if 'id' not in df.columns:
        df = df.reset_index(drop=True)
        df['id'] = df.index
    
    df = df.set_index('id')
    
    print(f"Chargement terminé : {len(df)} films chargés")
    return df

In [8]:
class SmartSearchEngine:
    def __init__(self, df, load_from_file=None):
        self.df = df.copy()
        self.N = len(df)
        
        print("Chargement du modèle spaCy...")
        self.nlp = spacy.load("en_core_web_sm")
        
        self.fields = ['Title', 'Director', 'Genres', 'Overview', 'Release_Date']
        
        if load_from_file:
            self.load_index(load_from_file)
        else:
            self.inverted_index = defaultdict(lambda: defaultdict(list))
            self.doc_lengths = {}
            self.avg_doc_length = {}
            self.directors_set = set()
            self.genres_set = set()
            self.title_words = set()
            self.years_set = set()
            
            self.build_index()
            self.build_recognition_dicts()
    
    # -------------------------------------------------
    def preprocess_text(self, text, is_date=False):
        if pd.isna(text):
            return []
        text = str(text).lower()
        if is_date:
            return re.findall(r'\b(?:19|20)\d{2}\b', text)
        doc = self.nlp(text)
        return [token.lemma_ for token in doc
                if not token.is_stop and not token.is_punct and not token.is_space
                and len(token.lemma_) > 2 and token.is_alpha]
    
    # -------------------------------------------------
    def build_index(self):
        print("Construction de l'index inversé...")
        for field in self.fields:
            self.doc_lengths[field] = {}
            self.avg_doc_length[field] = 0
        
        for idx, row in self.df.iterrows():
            for field in self.fields:
                tokens = self.preprocess_text(row[field], is_date=(field == 'Release_Date'))
                self.doc_lengths[field][idx] = len(tokens)
                freqs = defaultdict(int)
                for t in tokens:
                    freqs[t] += 1
                for term, freq in freqs.items():
                    self.inverted_index[field][term].append((idx, freq))
        
        for field in self.fields:
            lengths = self.doc_lengths[field].values()
            if lengths:
                self.avg_doc_length[field] = np.mean(list(lengths))
        
        print(f"Index construit : {self.N} documents")
    
    # -------------------------------------------------
    def build_recognition_dicts(self):
        print("Construction des dictionnaires de reconnaissance...")
        for director in self.df['Director'].dropna():
            self.directors_set.update(self.preprocess_text(director))
        for genres in self.df['Genres'].dropna():
            for g in str(genres).split(','):
                self.genres_set.update(self.preprocess_text(g.strip()))
        for title in self.df['Title'].dropna():
            self.title_words.update(self.preprocess_text(title))
        for date in self.df['Release_Date'].dropna():
            self.years_set.update(self.preprocess_text(date, is_date=True))
    
    # -------------------------------------------------
    def classify_query_terms(self, query_tokens):
        classified = {'director':[], 'genre':[], 'title':[], 'year':[], 'general':[]}
        for term in query_tokens:
            if re.match(r'^(?:19|20)\d{2}$', term):
                classified['year'].append(term)
            elif term in self.directors_set:
                classified['director'].append(term)
            elif term in self.genres_set:
                classified['genre'].append(term)
            elif term in self.title_words:
                classified['title'].append(term)
            else:
                classified['general'].append(term)
        return classified
    
    # -------------------------------------------------
    def bm25_score(self, term, doc_id, field, k1=1.5, b=0.75):
        if term not in self.inverted_index[field]:
            return 0.0
        postings = self.inverted_index[field][term]
        tf = next((f for d, f in postings if d == doc_id), 0)
        if tf == 0:
            return 0.0
        dfreq = len(postings)
        idf = math.log((self.N - dfreq + 0.5) / (dfreq + 0.5) + 1.0)
        doc_len = self.doc_lengths[field].get(doc_id, 0)
        avg_len = self.avg_doc_length.get(field, 1) or 1
        norm = 1 - b + b * (doc_len / avg_len)
        return idf * (tf * (k1 + 1)) / (tf + k1 * norm)
    
    # -------------------------------------------------
    def search(self, query, top_n=10):
        years = re.findall(r'\b(?:19|20)\d{2}\b', query)
        tokens = self.preprocess_text(query) + years
        if not tokens:
            return pd.DataFrame()
        
        classified = self.classify_query_terms(tokens)
        print("\n=== Classification ===")
        for cat, terms in classified.items():
            if terms: print(f"{cat.capitalize():8}: {terms}")
        
        candidates = set()
        mapping = {
            'director': ['Director'],
            'genre':    ['Genres'],
            'title':    ['Title'],
            'year':     ['Release_Date'],
            'general':  self.fields
        }
        for cat, terms in classified.items():
            for term in terms:
                for field in mapping[cat]:
                    if term in self.inverted_index[field]:
                        candidates.update(d for d, _ in self.inverted_index[field][term])
        
        scores = {}
        for doc_id in candidates:
            s = 0.0
            for t in classified['director']: s += self.bm25_score(t, doc_id, 'Director')
            for t in classified['genre']:    s += self.bm25_score(t, doc_id, 'Genres')
            for t in classified['title']:    s += self.bm25_score(t, doc_id, 'Title')
            for t in classified['year']:     s += self.bm25_score(t, doc_id, 'Release_Date')
            for t in classified['general']:
                for f in self.fields:
                    s += self.bm25_score(t, doc_id, f)
            if s > 0:
                scores[doc_id] = s
        
        top = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
        if not top:
            return pd.DataFrame()
        
        ids, sc = zip(*top)
        res = self.df.loc[list(ids), ['Title','Overview','Genres','Director','Release_Date']].copy()
        res['score'] = sc
        return res.reset_index(drop=True)
    
    # -------------------------------------------------
    def save_index(self, folder_path="../data/index_data"):
        os.makedirs(folder_path, exist_ok=True)
        print(f"Sauvegarde de l'index dans {folder_path}...")
        
        # Correction ici : on utilise bien "terms" et pas "terms_dict"
        serializable_index = {}
        for field, terms in self.inverted_index.items():
            serializable_index[field] = {}
            for term, postings in terms.items():
                serializable_index[field][term] = [[int(d), int(f)] for d, f in postings]
        
        with open(os.path.join(folder_path, "inverted_index.json"), 'w', encoding='utf-8') as f:
            json.dump(serializable_index, f, ensure_ascii=False)
        
        metadata = {
            'N': self.N,
            'doc_lengths': {f: {int(k): v for k, v in d.items()} for f, d in self.doc_lengths.items()},
            'avg_doc_length': self.avg_doc_length,
            'directors_set': list(self.directors_set),
            'genres_set': list(self.genres_set),
            'title_words': list(self.title_words),
            'years_set': list(self.years_set),
            'fields': self.fields
        }
        with open(os.path.join(folder_path, "metadata.json"), 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False)
        
        print("Index sauvegardé avec succès !")
    
    # -------------------------------------------------
    def load_index(self, folder_path="../data/index_data"):
        print(f"Chargement de l'index depuis {folder_path}...")
        idx_path = os.path.join(folder_path, "inverted_index.json")
        meta_path = os.path.join(folder_path, "metadata.json")
        
        with open(idx_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.inverted_index = defaultdict(lambda: defaultdict(list))
        for field, terms in data.items():
            for term, postings in terms.items():
                self.inverted_index[field][term] = [(int(d), int(f)) for d, f in postings]
        
        with open(meta_path, 'r', encoding='utf-8') as f:
            meta = json.load(f)
        
        self.N = meta['N']
        self.doc_lengths = {f: {int(k): v for k, v in d.items()} for f, d in meta['doc_lengths'].items()}
        self.avg_doc_length = meta['avg_doc_length']
        self.directors_set = set(meta['directors_set'])
        self.genres_set = set(meta['genres_set'])
        self.title_words = set(meta['title_words'])
        self.years_set = set(meta['years_set'])
        self.fields = meta['fields']
        print("Index chargé !")

In [12]:
# ============ UTILISATION AVEC VRAIES DONNÉES ============
# Chemin vers ton dossier contenant les .json (un film par fichier)
JSON_FOLDER = "../data/docs/"   # Change ici si besoin

df = load_movies_from_json_folder(JSON_FOLDER)

print(f"\nDataset chargé : {len(df)} films")
print("Colonnes :", df.columns.tolist())

50 fichiers JSON trouvés. Chargement en cours...
Chargement terminé : 50 films chargés

Dataset chargé : 50 films
Colonnes : ['Title', 'Overview', 'Tagline', 'Homepage', 'Release_Date', 'Vote_Average', 'Runtime', 'Poster_Path', 'Genres', 'Keywords', 'Director', 'budget', 'revenue', 'production_companies', 'Cast', 'clean_text']


In [10]:
# Créer le moteur de recherche
engine = SmartSearchEngine(df)
engine.save_index("../data/index")

Chargement du modèle spaCy...
Construction de l'index inversé...
Index construit : 50 documents
Construction des dictionnaires de reconnaissance...
Sauvegarde de l'index dans ../data/index...
Index sauvegardé avec succès !


In [11]:
# ============ TESTS ============
results = engine.search("action movies of christopher nolan 2010", top_n=10)
print(results[['Title' , 'Director','Genres', 'Release_Date', 'score']])


=== Classification ===
Director: ['christopher', 'nolan']
Genre   : ['action']
Year    : ['2010']
General : ['movie']
                                           Title           Director  \
0                                      Inception  Christopher Nolan   
1                                  Batman Begins  Christopher Nolan   
2                                The Dark Knight  Christopher Nolan   
3                          The Dark Knight Rises  Christopher Nolan   
4                                   Interstellar  Christopher Nolan   
5                                     Iron Man 2        Jon Favreau   
6                           Inglourious Basterds  Quentin Tarantino   
7                                 Shutter Island    Martin Scorsese   
8  The Lord of the Rings: The Return of the King      Peter Jackson   
9                                     The Matrix    Lilly Wachowski   

                               Genres Release_Date     score  
0  Action, Science Fiction, Adventur

In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

class SearchEngineEvaluator:
    """
    Évaluateur robuste pour le SmartSearchEngine
    Gère correctement les index (int ou object), seuils, et cas vides
    """
    def __init__(self, search_engine, df):
        self.engine = search_engine
        self.df = df.reset_index()  
    
    def create_evaluation_dataset(self, test_queries):
        all_results = []
        
        for query_info in test_queries:
            query = query_info['query']
            relevant_ids = set(query_info['relevant_docs'])
            
            # Recherche avec top_n assez grand pour capter tous les possibles pertinents
            results = self.engine.search(query, top_n=10)
            
            if results.empty:
                continue
                
            # Récupérer les vrais IDs des documents dans les résultats
            result_ids = results.index.tolist()  # ces IDs correspondent à df.index
            
            for rank, doc_id in enumerate(result_ids):
                is_relevant = 1 if doc_id in relevant_ids else 0
                score = results.loc[results.index == doc_id, 'score'].iloc[0] if 'score' in results.columns else 0
                
                all_results.append({
                    'query': query,
                    'doc_id': doc_id,
                    'rank': rank + 1,
                    'score': score,
                    'y_true': is_relevant
                })
        
        return pd.DataFrame(all_results)
    
    def find_best_threshold(self, eval_df):
        if eval_df.empty or len(eval_df['score'].unique()) < 2:
            return 0.0
            
        scores = sorted(eval_df['score'].unique(), reverse=True)
        thresholds = [0] + scores[:-1]  # tester juste en dessous de chaque score
        
        best_f1 = 0
        best_thresh = 0
        
        for thresh in thresholds:
            y_pred = (eval_df['score'] >= thresh).astype(int)
            f1 = f1_score(eval_df['y_true'], y_pred, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh
                
        return best_thresh
    
    def evaluate(self, test_queries, top_k=10):
        eval_df = self.create_evaluation_dataset(test_queries)
        
        if eval_df.empty:
            print("Aucun résultat retourné par le moteur → évaluation impossible")
            return {'precision@10': 0.0, 'recall@10': 0.0, 'f1@10': 0.0, 'mrr': 0.0, 'map': 0.0}
        
        # === Métriques classiques à top@10 (plus pertinent pour la recherche) ===
        top_results = eval_df[eval_df['rank'] <= top_k].copy()
        
        precision_at_k = top_results['y_true'].mean() if len(top_results) > 0 else 0.0
        recall_at_k = top_results['y_true'].sum() / max(1, eval_df['y_true'].sum())
        
        # MRR (Mean Reciprocal Rank)
        mrr = 0.0
        map_score = 0.0
        num_queries_with_relevant = 0
        
        for query in eval_df['query'].unique():
            q_df = eval_df[eval_df['query'] == query]
            relevant_ranks = q_df[q_df['y_true'] == 1]['rank']
            if len(relevant_ranks) > 0:
                num_queries_with_relevant += 1
                first_rank = relevant_ranks.min()
                mrr += 1.0 / first_rank
                
                # AP pour cette requête
                ap = 0.0
                relevant_found = 0
                for k in range(1, min(top_k, len(q_df)) + 1):
                    if q_df.iloc[k-1]['y_true'] == 1:
                        relevant_found += 1
                        ap += relevant_found / k
                if relevant_found > 0:
                    ap /= relevant_found
                map_score += ap
        
        mrr = mrr / max(1, num_queries_with_relevant)
        map_score = map_score / max(1, num_queries_with_relevant)
        
        f1_at_k = 2 * precision_at_k * recall_at_k / max(1e-9, precision_at_k + recall_at_k)
        
        return {
            f'precision@{top_k}': round(precision_at_k, 4),
            f'recall@{top_k}'   : round(recall_at_k, 4),
            f'f1@{top_k}'       : round(f1_at_k, 4),
            'mrr'               : round(mrr, 4),
            'map'               : round(map_score, 4),
            'total_queries'     : len(test_queries),
            'queries_with_results': eval_df['query'].nunique()
        }


# ============ FONCTION D'AIDE POUR TROUVER LES DOCS PERTINENTS ============
def find_relevant_docs(df, **criteria):
    df_reset = df.reset_index()
    mask = pd.Series([True] * len(df_reset))
    
    if 'title' in criteria:
        mask &= df_reset['Title'].str.contains(criteria['title'], case=False, na=False)
    if 'director' in criteria:
        mask &= df_reset['Director'].str.contains(criteria['director'], case=False, na=False)
    if 'genre' in criteria:
        mask &= df_reset['Genres'].str.contains(criteria['genre'], case=False, na=False)
    if 'year' in criteria:
        mask &= df_reset['Release_Date'].str.contains(str(criteria['year']), na=False)
    if 'exact_title' in criteria:
        mask &= df_reset['Title'].str.lower() == criteria['exact_title'].lower()
    
    return df_reset[mask]['id'].tolist()  # retourne les vrais IDs

In [14]:
# Après avoir chargé ton df et ton engine
# df = load_movies_from_json_folder(...) ou pd.read_csv(...)
# engine = SmartSearchEngine(df, load_from_file=...) ou construit

test_queries = [
    {'query': 'christopher nolan interstellar',           'relevant_docs': find_relevant_docs(df, director='nolan', title='interstellar')},
    {'query': 'tarantino pulp fiction',                   'relevant_docs': find_relevant_docs(df, director='tarantino', title='pulp fiction')},
    {'query': 'action movies christopher nolan 2010',     'relevant_docs': find_relevant_docs(df, director='nolan', year=2010)},
    {'query': 'spielberg adventure movie',                'relevant_docs': find_relevant_docs(df, director='spielberg', genre='adventure')[:20]},
    {'query': 'horror movies 2019',                       'relevant_docs': find_relevant_docs(df, genre='horror', year=2019)},
    {'query': 'pixar animation',                          'relevant_docs': find_relevant_docs(df, genre='animation', title='pixar')[:15]},
    {'query': 'star wars episode',                        'relevant_docs': find_relevant_docs(df, title='star wars')},
    {'query': 'batman dark knight',                       'relevant_docs': find_relevant_docs(df, title='dark knight')},
    {'query': 'scorsese dicaprio crime',                  'relevant_docs': find_relevant_docs(df, director='scorsese', genre='crime')},
    {'query': 'romantic comedy 2008',                     'relevant_docs': find_relevant_docs(df, genre='comedy', year=2008)[:10]},
]

evaluator = SearchEngineEvaluator(engine, df)
metrics = evaluator.evaluate(test_queries, top_k=10)

print("\n" + "="*70)
print("            RÉSULTATS D'ÉVALUATION DU MOTEUR DE RECHERCHE")
print("="*70)
for k, v in metrics.items():
    print(f"{k:20}: {v}")
print("="*70)


=== Classification ===
Director: ['christopher', 'nolan']
Title   : ['interstellar']

=== Classification ===
Director: ['tarantino']
Genre   : ['fiction']
Title   : ['pulp']

=== Classification ===
Director: ['christopher', 'nolan']
Genre   : ['action']
Year    : ['2010']
General : ['movie']

=== Classification ===
Genre   : ['adventure']
General : ['spielberg', 'movie']

=== Classification ===
Year    : ['2019']
General : ['horror', 'movie']

=== Classification ===
Genre   : ['animation']
General : ['pixar']

=== Classification ===
Genre   : ['war']
Title   : ['star']
General : ['episode']

=== Classification ===
Title   : ['batman', 'dark', 'knight']

=== Classification ===
Director: ['scorsese']
Genre   : ['crime']
General : ['dicaprio']

=== Classification ===
Genre   : ['comedy']
Year    : ['2008']
General : ['romantic']

            RÉSULTATS D'ÉVALUATION DU MOTEUR DE RECHERCHE
precision@10        : 0.0164
recall@10           : 1.0
f1@10               : 0.0323
mrr               