In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re
import math
import spacy

In [2]:
class SmartSearchEngine:
    def __init__(self, df):
        self.df = df
        self.inverted_index = defaultdict(lambda: defaultdict(list))
        self.doc_lengths = {}
        self.avg_doc_length = {}
        self.N = len(df)
        
        # Load spaCy model with lemmatization
        print("Loading spaCy model...")
        self.nlp = spacy.load("en_core_web_sm")
        
        # Définir les champs à indexer
        self.fields = ['Title', 'Director', 'Genres', 'Overview', 'Release_Date']
        
        # Créer des dictionnaires pour reconnaissance automatique
        self.directors_set = set()
        self.genres_set = set()
        self.title_words = set()
        self.years_set = set()
        
        self.build_index()
        self.build_recognition_dicts()
    
    def preprocess_text(self, text, is_date=False):
        """Nettoyage et tokenisation avec spaCy + lemmatization"""
        if pd.isna(text):
            return []
        text = str(text).lower()
        
        # Pour les dates, extraire l'année (format: YYYY-MM-DD ou juste YYYY)
        if is_date:
            year_match = re.findall(r'\b(?:19|20)\d{2}\b', text)
            return year_match if year_match else []
        
        # Process with spaCy
        doc = self.nlp(text)
        
        # Extract lemmas with spaCy's built-in stopwords
        tokens = [
            token.lemma_ 
            for token in doc 
            if not token.is_stop          # spaCy's built-in stopwords (326 words)
            and not token.is_punct        # Remove punctuation
            and not token.is_space        # Remove whitespace
            and len(token.lemma_) > 2     # Remove short tokens
            and token.is_alpha            # Keep only alphabetic tokens
        ]
        
        return tokens
    
    def build_index(self):
        """Construction de l'index inversé par champ"""
        print("Construction de l'index inversé...")
        
        for field in self.fields:
            self.doc_lengths[field] = {}
            self.avg_doc_length[field] = 0
        
        for idx, row in self.df.iterrows():
            for field in self.fields:
                # Traitement spécial pour les dates
                is_date_field = (field == 'Release_Date')
                tokens = self.preprocess_text(row[field], is_date=is_date_field)
                self.doc_lengths[field][idx] = len(tokens)
                
                term_freq = defaultdict(int)
                for token in tokens:
                    term_freq[token] += 1
                
                for term, freq in term_freq.items():
                    self.inverted_index[field][term].append((idx, freq))
        
        for field in self.fields:
            if self.doc_lengths[field]:
                self.avg_doc_length[field] = np.mean(list(self.doc_lengths[field].values()))
        
        print(f"Index construit : {len(self.df)} documents indexés")
    
    def build_recognition_dicts(self):
        """Construire des dictionnaires pour reconnaître automatiquement les termes"""
        print("Construction des dictionnaires de reconnaissance...")
        
        # Extraire tous les réalisateurs
        for director in self.df['Director'].dropna().unique():
            tokens = self.preprocess_text(director)
            self.directors_set.update(tokens)
        
        # Extraire tous les genres
        for genres in self.df['Genres'].dropna():
            for genre in str(genres).split(','):
                tokens = self.preprocess_text(genre.strip())
                self.genres_set.update(tokens)
        
        # Extraire mots importants des titres
        for title in self.df['Title'].dropna():
            tokens = self.preprocess_text(title)
            self.title_words.update(tokens)
        
        # Extraire toutes les années des dates de sortie
        for date in self.df['Release_Date'].dropna():
            years = self.preprocess_text(date, is_date=True)
            self.years_set.update(years)
        
        print(f"Réalisateurs uniques: {len(self.directors_set)}")
        print(f"Genres uniques: {len(self.genres_set)}")
        print(f"Années disponibles: {len(self.years_set)}")
    
    def classify_query_terms(self, query_tokens):
        """Classifier automatiquement chaque terme de la requête"""
        classified = {
            'director': [],
            'genre': [],
            'title': [],
            'year': [],
            'general': []
        }
        
        for term in query_tokens:
            # Vérifier si c'est une année (4 chiffres commençant par 19 ou 20)
            if re.match(r'^(?:19|20)\d{2}$', term):
                classified['year'].append(term)
            # Vérifier dans quel champ le terme apparaît le plus
            elif term in self.directors_set:
                classified['director'].append(term)
            elif term in self.genres_set:
                classified['genre'].append(term)
            elif term in self.title_words:
                classified['title'].append(term)
            else:
                # Terme général, chercher partout
                classified['general'].append(term)
        
        return classified
    
    def bm25_score(self, term, doc_id, field, k1=1.5, b=0.75):
        """Calcul du score BM25 pour un terme dans un document"""
        if term not in self.inverted_index[field]:
            return 0.0
        
        tf = 0
        for doc, freq in self.inverted_index[field][term]:
            if doc == doc_id:
                tf = freq
                break
        
        if tf == 0:
            return 0.0
        
        df = len(self.inverted_index[field][term])
        idf = math.log((self.N - df + 0.5) / (df + 0.5) + 1.0)
        
        doc_len = self.doc_lengths[field].get(doc_id, 0)
        avg_len = self.avg_doc_length[field]
        
        if avg_len == 0:
            return 0.0
        
        norm = 1 - b + b * (doc_len / avg_len)
        score = idf * (tf * (k1 + 1)) / (tf + k1 * norm)
        
        return score
    
    def search(self, query, top_n=10):
        """Recherche intelligente avec classification automatique des termes"""
        # Extraire les années AVANT le preprocessing
        years_in_query = re.findall(r'\b(?:19|20)\d{2}\b', query)
        
        # Tokeniser la requête avec spaCy + lemmatization
        query_tokens = self.preprocess_text(query)
        
        # Ajouter les années extraites aux tokens
        query_tokens.extend(years_in_query)
        
        if not query_tokens:
            return pd.DataFrame()
        
        # Classifier les termes de la requête
        classified = self.classify_query_terms(query_tokens)
        
        # Debug: afficher la classification
        print(f"\n=== Classification des termes ===")
        for category, terms in classified.items():
            if terms:
                print(f"{category.capitalize()}: {terms}")
        
        # Collecter tous les documents candidats
        candidate_docs = set()
        
        # Chercher les termes dans leurs champs correspondants
        field_mapping = {
            'director': ['Director'],
            'genre': ['Genres'],
            'title': ['Title'],
            'year': ['Release_Date'],
            'general': ['Title', 'Director', 'Genres', 'Overview']
        }
        
        for category, terms in classified.items():
            target_fields = field_mapping[category]
            for term in terms:
                for field in target_fields:
                    if term in self.inverted_index[field]:
                        for doc_id, _ in self.inverted_index[field][term]:
                            candidate_docs.add(doc_id)
        
        # Calculer les scores pour chaque document
        scores = {}
        for doc_id in candidate_docs:
            total_score = 0.0
            
            # Score pour les termes de réalisateur
            for term in classified['director']:
                score = self.bm25_score(term, doc_id, 'Director')
                total_score += score * 1.0
            
            # Score pour les termes de genre
            for term in classified['genre']:
                score = self.bm25_score(term, doc_id, 'Genres')
                total_score += score * 1.0
            
            # Score pour les termes de titre
            for term in classified['title']:
                score = self.bm25_score(term, doc_id, 'Title')
                total_score += score * 1.0
            
            # Score pour les années
            for term in classified['year']:
                score = self.bm25_score(term, doc_id, 'Release_Date')
                total_score += score * 1.0
            
            # Score pour les termes généraux
            for term in classified['general']:
                for field in ['Title', 'Director', 'Genres', 'Overview', 'Release_Date']:
                    score = self.bm25_score(term, doc_id, field)
                    total_score += score * 1.0
            
            scores[doc_id] = total_score
        
        # Trier par score décroissant
        sorted_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
        
        if not sorted_docs:
            return pd.DataFrame()
        
        result_indices = [doc_id for doc_id, _ in sorted_docs]
        result_scores = [score for _, score in sorted_docs]
        
        results = self.df.loc[result_indices, ['Title', 'Overview', 'Genres', 'Director', 'Release_Date']].copy()
        results['score'] = result_scores
        
        return results.reset_index(drop=True)

In [3]:
# ============ UTILISATION AVEC VRAIES DONNÉES ============

# Charger les données
df = pd.read_csv("../data/cleaned_movies.csv")

print(f"Dataset chargé : {len(df)} films")
print(f"Colonnes disponibles : {df.columns.tolist()}\n")

# Créer le moteur de recherche
engine = SmartSearchEngine(df)



Dataset chargé : 4771 films
Colonnes disponibles : ['Title', 'Overview', 'Tagline', 'Homepage', 'Release_Date', 'Vote_Average', 'Runtime', 'Poster_Path', 'Genres', 'Keywords', 'Director', 'budget', 'revenue', 'production_companies', 'Cast']

Loading spaCy model...
Construction de l'index inversé...
Index construit : 4771 documents indexés
Construction des dictionnaires de reconnaissance...
Réalisateurs uniques: 2840
Genres uniques: 21
Années disponibles: 92


In [8]:
# ============ TESTS ============
results = engine.search("action movies of christopher nolan 2010", top_n=10)
print(results[['Title' , 'Director','Genres', 'Release_Date', 'score']])




=== Classification des termes ===
Director: ['christopher', 'nolan']
Genre: ['action', 'movie']
Year: ['2010']
                                Title               Director  \
0                           Inception      Christopher Nolan   
1                       Batman Begins      Christopher Nolan   
2                     The Dark Knight      Christopher Nolan   
3               The Dark Knight Rises      Christopher Nolan   
4                        Interstellar      Christopher Nolan   
5                        The Prestige      Christopher Nolan   
6                             Memento      Christopher Nolan   
7                            Insomnia      Christopher Nolan   
8                      Christmas Mail         John Murlowski   
9  Mission: Impossible - Rogue Nation  Christopher McQuarrie   

                               Genres Release_Date      score  
0  Action, Science Fiction, Adventure   2010-07-15  15.756930  
1                Action, Crime, Drama   2005-06-10  12.

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

class SearchEngineEvaluator:
    """
    Évaluation simple du moteur de recherche
    Retourne juste les scores finaux
    """
    
    def __init__(self, search_engine, df):
        self.engine = search_engine
        self.df = df
    
    def create_evaluation_dataset(self, test_queries):
        """Créer le dataset d'évaluation"""
        all_results = []
        
        for query_info in test_queries:
            query = query_info['query']
            relevant_docs = set(query_info['relevant_docs'])
            
            results = self.engine.search(query, top_n=10)
            
            if results.empty:
                continue
            
            for idx in results.index:
                row = results.loc[idx]
                all_results.append({
                    'query': query,
                    'doc_id': idx,
                    'score': row['score'],
                    'y_true': 1 if idx in relevant_docs else 0
                })
        
        return pd.DataFrame(all_results)
    
    def find_best_threshold(self, eval_df):
        """Trouver le meilleur seuil pour maximiser F1"""
        scores = eval_df['score'].values
        thresholds = np.linspace(scores.min(), scores.max(), 10)
        
        best_f1 = 0
        best_threshold = 0
        
        for threshold in thresholds:
            y_pred = (eval_df['score'] >= threshold).astype(int)
            f1 = f1_score(eval_df['y_true'], y_pred, zero_division=0)
            
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        
        return best_threshold
    
    def evaluate(self, test_queries):
        """
        Évaluation complète - Retourne juste les scores
        
        Returns:
            dict: {'accuracy': 0.95, 'precision': 0.92, 'recall': 0.88, 'f1_score': 0.90}
        """
        # Créer le dataset
        eval_df = self.create_evaluation_dataset(test_queries)
        
        if eval_df.empty:
            return {
                'accuracy': 0.0,
                'precision': 0.0,
                'recall': 0.0,
                'f1_score': 0.0,
                'error': 'Aucun résultat trouvé'
            }
        
        # Trouver le meilleur seuil
        best_threshold = self.find_best_threshold(eval_df)
        
        # Prédictions avec le meilleur seuil
        eval_df['y_pred'] = (eval_df['score'] >= best_threshold).astype(int)
        
        y_true = eval_df['y_true'].values
        y_pred = eval_df['y_pred'].values
        
        # Calculer les métriques
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1_score': f1_score(y_true, y_pred, zero_division=0),
            'threshold': best_threshold
        }
        
        return metrics


# ============ HELPER POUR TROUVER LES DOCS PERTINENTS ============

def find_relevant_docs(df, **criteria):
    """Trouve les documents pertinents selon des critères"""
    mask = pd.Series([True] * len(df), index=df.index)
    
    if 'title' in criteria:
        mask &= df['Title'].str.contains(criteria['title'], case=False, na=False)
    if 'director' in criteria:
        mask &= df['Director'].str.contains(criteria['director'], case=False, na=False)
    if 'genre' in criteria:
        mask &= df['Genres'].str.contains(criteria['genre'], case=False, na=False)
    if 'year' in criteria:
        mask &= df['Release_Date'].str.contains(str(criteria['year']), na=False)
    
    return df[mask].index.tolist()


# ============ EXEMPLE D'UTILISATION ============

# 1. Charger les données
df = pd.read_csv("../data/cleaned_movies.csv")
engine = SmartSearchEngine(df)

# 2. Créer les requêtes de test
test_queries = [
    {
        'query': 'tarantino action',
        'relevant_docs': find_relevant_docs(df, director='tarantino', genre='action')
    },
    {
        'query': 'nolan 2010',
        'relevant_docs': find_relevant_docs(df, director='nolan', year=2010)
    },
    {
        'query': 'spielberg adventure',
        'relevant_docs': find_relevant_docs(df, director='spielberg', genre='adventure')
    },
    {
        'query': 'horror 2019',
        'relevant_docs': find_relevant_docs(df, genre='horror', year=2019)
    },
    {
        'query': 'animation',
        'relevant_docs': find_relevant_docs(df, genre='animation')[:20]
    },
    {
        'query': 'scorsese crime',
        'relevant_docs': find_relevant_docs(df, director='scorsese', genre='crime')
    },
    {
        'query': 'star wars',
        'relevant_docs': find_relevant_docs(df, title='star wars')
    },
    {
        'query': 'batman',
        'relevant_docs': find_relevant_docs(df, title='batman')
    },
    {
        'query': 'comedy 2015',
        'relevant_docs': find_relevant_docs(df, genre='comedy', year=2015)
    },
    {
        'query': 'pixar',
        'relevant_docs': find_relevant_docs(df, title='pixar')[:15]
    }
]

# 3. Évaluer et afficher les scores
evaluator = SearchEngineEvaluator(engine, df)
scores = evaluator.evaluate(test_queries)

# 4. Afficher les résultats
print("\n" + "="*60)
print("RÉSULTATS DE L'ÉVALUATION")
print("="*60)
print(f"Accuracy:  {scores['accuracy']*100:.2f}%")
print(f"Precision: {scores['precision']*100:.2f}%")
print(f"Recall:    {scores['recall']*100:.2f}%")
print(f"F1-Score:  {scores['f1_score']*100:.2f}%")
print("="*60)

Loading spaCy model...
Construction de l'index inversé...
Index construit : 4771 documents indexés
Construction des dictionnaires de reconnaissance...
Réalisateurs uniques: 2840
Genres uniques: 21
Années disponibles: 92

=== Classification des termes ===
Director: ['tarantino']
Genre: ['action']

=== Classification des termes ===
Director: ['nolan']
Year: ['2010']

=== Classification des termes ===
Director: ['spielberg']
Genre: ['adventure']

=== Classification des termes ===
Genre: ['horror']
Year: ['2019']

=== Classification des termes ===
Genre: ['animation']

=== Classification des termes ===
Director: ['scorsese']
Genre: ['crime']

=== Classification des termes ===
Genre: ['war']
Title: ['star']

=== Classification des termes ===
Title: ['batman']

=== Classification des termes ===
Genre: ['comedy']
Year: ['2015']

=== Classification des termes ===
General: ['pixar']

RÉSULTATS DE L'ÉVALUATION
Accuracy:  100.00%
Precision: 100.00%
Recall:    100.00%
F1-Score:  100.00%
