In [1]:
pip install Levenshtein

Collecting Levenshtein
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.26.1 rapidfuzz-3.11.0


In [2]:
import re
import numpy as np
from typing import List, Dict, Optional, Tuple
from Levenshtein import distance as levenshtein_distance
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer

class TweetFilter:
    """Keep tweets that mention a brand or are relevant to a topic.

    A tweet is kept when it is not flagged as spam and at least one of the
    following holds: a word is lexically close to a brand keyword, a word is
    lexically close to a topic keyword, or the sentence embedding of the text
    is close to the embedding of a topic keyword.
    """

    def __init__(self, brand_keywords: List[str], topic_keywords: List[str],
                 translate_func=None, distance_threshold: float = 0.8,
                 semantic_threshold: float = 0.6,
                 model_name: str = 'all-MiniLM-L6-v2'):
        """
        Set up the filter and pre-compute the topic embeddings.

        Args:
            brand_keywords: Brand-related keywords (stored lowercased).
            topic_keywords: Topic-related keywords (stored lowercased).
            translate_func: Optional callable applied to the tweet text before
                matching (intended for Darija -> English translation).
            distance_threshold: Minimum lexical similarity (0-1) for a match.
            semantic_threshold: Minimum cosine similarity (0-1) for a match.
            model_name: Name of the SentenceTransformer model to load.
        """
        self.brand_keywords = [k.lower() for k in brand_keywords]
        self.topic_keywords = [k.lower() for k in topic_keywords]
        self.translate_func = translate_func
        self.distance_threshold = distance_threshold
        self.semantic_threshold = semantic_threshold

        # Load the embedding model once; it is reused for every tweet.
        self.model = SentenceTransformer(model_name)

        # Topic embeddings never change, so compute them up front.
        self.topic_embeddings = self.model.encode(self.topic_keywords)

    def cosine_similarity(self, v1: np.ndarray, v2: np.ndarray) -> float:
        """Return the cosine similarity of two vectors.

        Returns 0.0 when either vector has zero norm instead of dividing by
        zero (which would yield NaN and a RuntimeWarning).
        """
        denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denominator == 0:
            return 0.0
        return float(np.dot(v1, v2) / denominator)

    def calculate_lexical_similarity(self, word1: str, word2: str) -> float:
        """Return the lexical similarity (0-1) of two words.

        The score is the mean of a normalised Levenshtein similarity and the
        difflib SequenceMatcher ratio, both computed case-insensitively.
        Two empty strings score 0.0.
        """
        word1, word2 = word1.lower(), word2.lower()

        max_len = max(len(word1), len(word2))
        if max_len == 0:
            return 0.0

        lev_similarity = 1 - (levenshtein_distance(word1, word2) / max_len)
        seq_similarity = SequenceMatcher(None, word1, word2).ratio()

        return (lev_similarity + seq_similarity) / 2

    def find_similar_words(self, text: str, keywords: List[str]) -> List[tuple]:
        """Return (word, keyword, similarity) for every close word/keyword pair.

        The text is lowercased and split on whitespace; a pair is reported
        when its lexical similarity reaches ``self.distance_threshold``.
        """
        similar_words = []
        for word in text.lower().split():
            for keyword in keywords:
                similarity = self.calculate_lexical_similarity(word, keyword)
                if similarity >= self.distance_threshold:
                    similar_words.append((word, keyword, similarity))
        return similar_words

    def calculate_semantic_similarity(self, text: str) -> List[Tuple[str, float]]:
        """
        Compare the text embedding with every pre-computed topic embedding.

        Returns:
            (topic, similarity) tuples for topics whose cosine similarity
            reaches ``self.semantic_threshold``, sorted by descending score.
        """
        text_embedding = self.model.encode(text)

        similarities = []
        for i, topic_emb in enumerate(self.topic_embeddings):
            similarity = self.cosine_similarity(text_embedding, topic_emb)
            if similarity >= self.semantic_threshold:
                similarities.append((self.topic_keywords[i], float(similarity)))

        return sorted(similarities, key=lambda x: x[1], reverse=True)

    def is_spam(self, text: str) -> bool:
        """Return True when the text matches any spam heuristic.

        Heuristics: contains an http(s) URL, three or more consecutive
        @mentions, three or more consecutive #hashtags, or one character
        repeated at least five times in a row.
        """
        spam_indicators = [
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            r'(@\w+\s*){3,}',
            r'(#\w+\s*){3,}',
            r'(.)\1{4,}',
        ]
        return any(re.search(pattern, text) for pattern in spam_indicators)

    def contains_brand_mention(self, text: str) -> tuple:
        """Return (found, matches) for brand keywords lexically close to the text.

        The text is passed through ``translate_func`` first when one is set.
        """
        if self.translate_func:
            text = self.translate_func(text)

        similar_words = self.find_similar_words(text, self.brand_keywords)
        return bool(similar_words), similar_words

    def analyze_topic_relevance(self, text: str) -> tuple:
        """
        Evaluate topic relevance using lexical and semantic similarity.

        The text is passed through ``translate_func`` first when one is set.

        Returns:
            (bool, dict): whether any match was found, and a dict with the
            keys 'lexical_matches' and 'semantic_matches'.
        """
        if self.translate_func:
            text = self.translate_func(text)

        lexical_matches = self.find_similar_words(text, self.topic_keywords)
        semantic_matches = self.calculate_semantic_similarity(text)

        is_relevant = bool(lexical_matches or semantic_matches)

        return is_relevant, {
            'lexical_matches': lexical_matches,
            'semantic_matches': semantic_matches
        }

    def filter_tweet(self, tweet: Dict) -> Optional[Dict]:
        """Return an enriched copy of the tweet, or None when it is rejected.

        The tweet is rejected when it is spam or when it has neither a brand
        match nor a topic match. Otherwise a shallow copy is returned with an
        added 'analysis' key. Raises KeyError if the tweet has no 'text' key.
        """
        text = tweet['text']

        if self.is_spam(text):
            return None

        has_brand, brand_matches = self.contains_brand_mention(text)
        is_topic_relevant, topic_analysis = self.analyze_topic_relevance(text)

        if not (has_brand or is_topic_relevant):
            return None

        enriched_tweet = tweet.copy()
        enriched_tweet['analysis'] = {
            'brand_matches': [
                {
                    'found_word': w,
                    'matched_keyword': k,
                    'similarity_score': round(s, 3)
                }
                for w, k, s in brand_matches
            ],
            'topic_analysis': {
                'lexical_matches': [
                    {
                        'found_word': w,
                        'matched_keyword': k,
                        'similarity_score': round(s, 3)
                    }
                    for w, k, s in topic_analysis['lexical_matches']
                ],
                'semantic_matches': [
                    {
                        'topic': topic,
                        'semantic_similarity': round(score, 3)
                    }
                    for topic, score in topic_analysis['semantic_matches']
                ]
            }
        }
        return enriched_tweet

    def filter_tweets(self, tweets: List[Dict]) -> List[Dict]:
        """Filter a list of tweets and return the enriched survivors.

        Each kept tweet is the enriched copy produced by ``filter_tweet``
        (original fields plus 'analysis'), so the computed analysis is not
        thrown away.
        """
        enriched = (self.filter_tweet(t) for t in tweets)
        return [t for t in enriched if t is not None]

## Traduction


## Arabizi Arabic

In [3]:
def arabizi_to_arabic(text):
    """Transliterate Arabizi (Latin letters and digits) into Arabic script.

    Two-letter digraphs are substituted first, then each remaining character
    is mapped on its own. Characters without a mapping (spaces, punctuation,
    uppercase letters, ...) are copied through unchanged.
    """
    # Digraphs are replaced in this order before any single character.
    digraphs = (
        ("ch", "ش"),
        ("kh", "خ"),
        ("gh", "غ"),
        ("sh", "ش"),
        ("th", "ث"),
        ("dh", "ذ"),
    )

    # Single-character table: digits used as letters, then the Latin alphabet.
    single_chars = {
        "9": "ق", "3": "ع", "7": "ح", "5": "خ", "6": "ط",
        "2": "ء", "8": "غ", "1": "ا", "0": "و",
        "a": "ا", "b": "ب", "c": "ك", "d": "د", "e": "ي", "f": "ف",
        "g": "ج", "h": "ه", "i": "ي", "j": "ج", "k": "ك", "l": "ل",
        "m": "م", "n": "ن", "o": "و", "p": "ب", "q": "ق", "r": "ر",
        "s": "س", "t": "ت", "u": "و", "v": "ف", "w": "و", "x": "كس",
        "y": "ي", "z": "ز",
    }

    for latin, arabic in digraphs:
        text = text.replace(latin, arabic)

    # Unknown characters fall back to themselves.
    return "".join(single_chars.get(symbol, symbol) for symbol in text)

# Usage example: transliterate one short Arabizi sentence and print the result.
text_arabizi = "3jbny had sak mn zara"
text_arabic = arabizi_to_arabic(text_arabizi)
print("Texte en arabe :", text_arabic)


Texte en arabe : عجبني هاد ساك من زارا


In [4]:
def arabic_to_arabizi(text):
    """Transliterate Arabic script into Arabizi (Latin letters and digits).

    Each character is mapped independently; characters without a mapping are
    copied through unchanged.
    """
    # Arabic -> Arabizi table with one entry per letter. The original literal
    # listed "ط" twice ("6", then "t") and "ج" twice; in a dict literal the
    # last entry wins, so the values below are the ones that were in effect.
    # NOTE(review): arabizi_to_arabic maps "6" to "ط", so "ط" -> "6" may have
    # been the intended value - confirm before changing.
    transliteration_map = {
        "ق": "9", "ع": "3", "ح": "7", "خ": "kh", "ء": "2", "غ": "gh", "و": "ou",
        "ا": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j", "د": "d", "ذ": "dh", "ر": "r",
        "ز": "z", "س": "s", "ش": "ch", "ص": "s", "ض": "d", "ط": "t", "ظ": "z", "ك": "k",
        "ل": "l", "م": "m", "ن": "n", "ه": "h", "ي": "y", "ف": "f", "ة": "a",
    }

    # join avoids rebuilding the result string on every character.
    return "".join(transliteration_map.get(char, char) for char in text)

# Usage example: transliterate one short Arabic sentence and print the result.
text_arabic = "عجبني هاد صاك من زارا"
text_arabizi = arabic_to_arabizi(text_arabic)
print("Texte en Arabizi :", text_arabizi)


Texte en Arabizi : 3jbny had sak mn zara


In [5]:
arabic_to_arabizi("عجبني هاد المنتوج ولكن الكلتي   عيانة")

'3jbny had almntouj oulkn alklty   3yana'

## Correction


In [6]:
! pip install DarijaDistance

Collecting DarijaDistance
  Downloading DarijaDistance-1.0.8-py3-none-any.whl.metadata (7.1 kB)
Downloading DarijaDistance-1.0.8-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: DarijaDistance
Successfully installed DarijaDistance-1.0.8


In [7]:
from DarijaDistance.word_distance import WordDistance

In [8]:
# Create the shared WordDistance instance (`wd` is reused by later cells)
# and measure the distance between two spellings of the same brand.
wd = WordDistance()
distance = wd.distance_between("marjan", "mrjane")
print(f"Distance: {distance}")
# Output recorded when this cell was run: "Distance: 3.2" (not 0).

Distance: 3.2


In [9]:
closest_words, min_distance = wd.get_closests("kulb")
print(f"Closest words to 'kulb': {closest_words} - min distance = {min_distance}")

Closest words to 'kulb': ['klb', 'kelb', 'kalb'] - min distance = 2.1


In [10]:
pip install Levenshtein



In [11]:
from typing import List, Dict, Tuple
from Levenshtein import distance as levenshtein_distance
from difflib import SequenceMatcher

def levenshtein_similarity(word1: str, word2: str) -> float:
    """
    Similarity derived from the Levenshtein edit distance.

    Both words are lowercased, then the edit distance is divided by the
    length of the longer word and subtracted from 1. Returns 0 when both
    words are empty.
    """
    first, second = word1.lower(), word2.lower()
    longest = max(len(first), len(second))
    if not longest:
        return 0
    edits = levenshtein_distance(first, second)
    return 1 - (edits / longest)

def sequence_similarity(word1: str, word2: str) -> float:
    """
    Case-insensitive difflib similarity ratio between two words.

    Returns the SequenceMatcher ratio of the lowercased inputs.
    """
    left = word1.lower()
    right = word2.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

def phonetic_similarity(word1: str, word2: str) -> float:
    """
    Simple phonetic similarity between two words.

    Each word is lowercased and every character is replaced by the
    representative of its sound class (for example e/é/è/ê -> a, y -> i,
    u -> o, k/c -> q). The two simplified words are then compared with
    sequence_similarity.
    """
    # Representative character -> the characters that collapse onto it.
    sound_classes = {
        'a': 'aeéèê',
        'i': 'iy',
        'o': 'ou',
        'q': 'kc',
        's': 'z',
        'v': 'f',
        'p': 'b',
        'd': 't',
        'm': 'n',
    }
    table = str.maketrans({
        member: representative
        for representative, members in sound_classes.items()
        for member in members
    })

    reduced1 = word1.lower().translate(table)
    reduced2 = word2.lower().translate(table)
    return sequence_similarity(reduced1, reduced2)

def calculate_brand_similarity(word: str, brand: str) -> Dict:
    """
    Score a word against a brand name with three similarity measures.

    Returns a dict holding the word, the brand, the composite score (plain
    mean of the Levenshtein, sequence and phonetic scores, rounded to three
    decimals) and the three rounded scores under 'details'.
    """
    raw_scores = {
        'levenshtein_score': levenshtein_similarity(word, brand),
        'sequence_score': sequence_similarity(word, brand),
        'phonetic_score': phonetic_similarity(word, brand),
    }

    # Unweighted average of the three measures.
    mean_score = sum(raw_scores.values()) / 3

    return {
        'found_word': word,
        'matched_brand': brand,
        'composite_score': round(mean_score, 3),
        'details': {label: round(value, 3) for label, value in raw_scores.items()},
    }

def find_similar_brands(text: str, brands: List[str], threshold: float = 0.7) -> List[Dict]:
    """
    Find every word of a text that resembles one of the given brands.

    The text is lowercased and split on whitespace, and each word is scored
    against each brand with calculate_brand_similarity.

    Args:
        text: Text to analyse.
        brands: Correctly spelled brand names.
        threshold: Minimum composite score (0-1) for a pair to be kept.

    Returns:
        The score dicts that reach the threshold, highest composite first.
    """
    tokens = text.lower().split()
    candidates = (
        calculate_brand_similarity(token, brand)
        for token in tokens
        for brand in brands
    )
    kept = [result for result in candidates if result['composite_score'] >= threshold]
    kept.sort(key=lambda result: result['composite_score'], reverse=True)
    return kept

def correct_brand_names(text: str, brands: List[str], threshold: float = 0.7) -> Tuple[str, List[Dict]]:
    """
    Replace misspelled brand names in a text with the closest known brand.

    The text is split on whitespace; each word is replaced by its best match
    from find_similar_brands when that match reaches the threshold. The
    words are joined back with single spaces.

    Returns:
        (corrected_text, corrections), where each correction records the
        original word, the replacement, the word position and the score dict.
    """
    tokens = text.split()
    log = []

    for position, token in enumerate(tokens):
        ranked = find_similar_brands(token, brands, threshold)
        if not ranked:
            continue
        top = ranked[0]
        if top['composite_score'] < threshold:
            continue
        tokens[position] = top['matched_brand']
        log.append({
            'original': token,
            'corrected': top['matched_brand'],
            'position': position,
            'scores': top
        })

    return ' '.join(tokens), log

In [12]:
# Reference list of correctly spelled brands
brands = ["marjane", "Carrefour", "Atacadao", "Quality", "Marwa", "Keito", "Iphone"]

# Example 1: find brand-like words in a text
text = "J'ai acheter ce telephone de alklty"
matches = find_similar_brands(text, brands, threshold=0.7)

for match in matches:
    print(f"Trouvé '{match['found_word']}' -> '{match['matched_brand']}'")
    print(f"Score: {match['composite_score']}")
    print("Détails:", match['details'])

# Example 2: correct the text automatically
# (in the recorded run no word reached the threshold, so the text came back unchanged)
corrected_text, corrections = correct_brand_names(text, brands, threshold=0.7)
print(f"\nTexte original : {text}")
print(f"Texte corrigé : {corrected_text}")


Texte original : J'ai acheter ce telephone de alklty
Texte corrigé : J'ai acheter ce telephone de alklty


In [13]:
# Look for brand mentions in a Darija sentence, reusing the `brands` list
# defined in the previous cell.
text = "ana mchit t9edit 7wayji mn kto"
matches = find_similar_brands(text, brands, threshold=0.7)

for match in matches:
    print(f"Trouvé '{match['found_word']}' -> '{match['matched_brand']}'")
    print(f"Score: {match['composite_score']}")
    print("Détails:", match['details'])

Trouvé 'kto' -> 'Keito'
Score: 0.7
Détails: {'levenshtein_score': 0.6, 'sequence_score': 0.75, 'phonetic_score': 0.75}


In [14]:
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Tuple

def load_embedding_model(model_name: str = 'all-MiniLM-L6-v2') -> SentenceTransformer:
    """
    Instantiate and return the SentenceTransformer named by `model_name`.
    """
    embedding_model = SentenceTransformer(model_name)
    return embedding_model

def get_embeddings(texts: List[str], model: SentenceTransformer) -> np.ndarray:
    """
    Encode a list of texts with the given model and return the result of
    `model.encode`.
    """
    vectors = model.encode(texts)
    return vectors

def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """
    Return the cosine similarity between two vectors.

    Returns 0.0 when either vector has zero norm, instead of dividing by
    zero (which would produce NaN and a RuntimeWarning).
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return float(np.dot(v1, v2) / denominator)

def calculate_theme_similarity(phrase: str, themes: List[str],
                             model: SentenceTransformer) -> List[Dict[str, float]]:
    """
    Score one phrase against several themes using embedding similarity.

    Args:
        phrase: The phrase to analyse.
        themes: The themes to compare against.
        model: A loaded embedding model.

    Returns:
        One dict per theme with the keys 'theme' and 'similarity_score',
        sorted by descending score.
    """
    # Embed the phrase (as a one-element batch), then all themes at once.
    phrase_vector = get_embeddings([phrase], model)[0]
    theme_vectors = get_embeddings(themes, model)

    scored = [
        {
            'theme': theme,
            'similarity_score': float(cosine_similarity(phrase_vector, theme_vector))
        }
        for theme, theme_vector in zip(themes, theme_vectors)
    ]

    # Highest similarity first.
    scored.sort(key=lambda entry: entry['similarity_score'], reverse=True)
    return scored

# Usage example. The names defined here (`model`, `themes`, `test_phrases`)
# are module-level, and `model` is reused by the next cell.
if __name__ == "__main__":
    # Load the embedding model
    model = load_embedding_model()

    # A few themes, written in English
    themes = [
        "sports and athletics",
        "technology and computers",
        "food and cooking",
        "music and entertainment"
    ]

    # Sample phrases to score
    test_phrases = [
        "I love playing football with my friends every weekend",
        "The basketball match was really exciting",
        "I need to buy a new laptop for work",
        "This recipe for chocolate cake is amazing"
    ]

    # Score every phrase against every theme and print the ranking
    for phrase in test_phrases:
        print(f"\nPhrase: {phrase}")
        similarities = calculate_theme_similarity(phrase, themes, model)

        print("Similarités thématiques:")
        for sim in similarities:
            print(f"- {sim['theme']}: {sim['similarity_score']:.3f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Phrase: I love playing football with my friends every weekend
Similarités thématiques:
- sports and athletics: 0.348
- food and cooking: 0.175
- music and entertainment: 0.158
- technology and computers: -0.047

Phrase: The basketball match was really exciting
Similarités thématiques:
- sports and athletics: 0.346
- music and entertainment: 0.205
- food and cooking: 0.150
- technology and computers: 0.112

Phrase: I need to buy a new laptop for work
Similarités thématiques:
- technology and computers: 0.221
- food and cooking: 0.046
- music and entertainment: 0.019
- sports and athletics: -0.061

Phrase: This recipe for chocolate cake is amazing
Similarités thématiques:
- food and cooking: 0.266
- music and entertainment: 0.047
- technology and computers: 0.013
- sports and athletics: -0.023


In [15]:
# Same experiment with "clothes" replacing the sports theme.
# Relies on `model` having been loaded by the previous cell.
themes = [
        "clothes",
        "technology and computers",
        "food and cooking",
        "music and entertainment"
    ]

# Sample phrases to score
test_phrases = [
        "I love playing football with my friends every weekend",
        "I bought a new pair of jeans",
        "I love this hoodie",
        "I need to buy a new laptop for work",
        "This recipe for chocolate cake is amazing"
    ]

for phrase in test_phrases:
    print(f"\nPhrase: {phrase}")
    similarities = calculate_theme_similarity(phrase, themes, model)

    print("Similarités thématiques:")
    for sim in similarities:
        print(f"- {sim['theme']}: {sim['similarity_score']:.3f}")


Phrase: I love playing football with my friends every weekend
Similarités thématiques:
- food and cooking: 0.175
- music and entertainment: 0.158
- clothes: 0.038
- technology and computers: -0.047

Phrase: I bought a new pair of jeans
Similarités thématiques:
- clothes: 0.438
- technology and computers: 0.099
- food and cooking: 0.058
- music and entertainment: 0.044

Phrase: I love this hoodie
Similarités thématiques:
- clothes: 0.319
- food and cooking: 0.114
- music and entertainment: 0.078
- technology and computers: 0.017

Phrase: I need to buy a new laptop for work
Similarités thématiques:
- technology and computers: 0.221
- clothes: 0.111
- food and cooking: 0.046
- music and entertainment: 0.019

Phrase: This recipe for chocolate cake is amazing
Similarités thématiques:
- food and cooking: 0.266
- music and entertainment: 0.047
- technology and computers: 0.013
- clothes: 0.011


In [16]:
txtdarija="dak l pc li khdit mn azus sd9 zwin wlkn lbatri chouia 3iyana"

In [17]:
# Find brand-like words in `txtdarija` and split the result into two
# parallel lists: l1 holds the words as found, l2 the matching brand names.
# `corriger` reads both lists as globals.
brands = ["Marjane", "Carrefour", "Atacadao", "asus", "Adidas", "Keito"]
matches = find_similar_brands(txtdarija, brands, threshold=0.7)
l1=[]
l2=[]
for match in matches:
    l1.append(match['found_word'])
    l2.append(match['matched_brand'])

In [18]:
matches

[{'found_word': 'azus',
  'matched_brand': 'asus',
  'composite_score': 0.833,
  'details': {'levenshtein_score': 0.75,
   'sequence_score': 0.75,
   'phonetic_score': 1.0}}]

In [19]:
l1

['azus']

In [20]:
l2

['asus']

In [21]:
def corriger(t):
    """Correct a Darija sentence word by word and return the new sentence.

    The sentence is split on single spaces. A word listed in the global `l1`
    is replaced by the entry at the same index in the global `l2`. Any other
    word is looked up with the global `wd.get_closests`; it is replaced by
    the first closest word when the reported minimum distance is at most 2,
    and kept as is otherwise.
    """
    corrected = []
    for token in t.split(" "):
        if token in l1:
            # Known brand misspelling: use the paired brand name.
            corrected.append(l2[l1.index(token)])
            continue
        closest_words, min_distance = wd.get_closests(token)
        corrected.append(closest_words[0] if min_distance <= 2 else token)
    return " ".join(corrected)

In [22]:
corr1=corriger(txtdarija)

In [23]:
corr1

'dak l pc li khdit mn asus sd9 zwin wlkn lbatri chouia 3iyana'

In [24]:
arabizi_to_arabic("dak l telefon li khdit mn l7anout sd9 zwin wlkn lbatri chouia 3iyana")

'داك ل تيليفون لي خديت من لحانووت سدق زوين ولكن لباتري شوويا عييانا'

In [25]:
text_arabic1 = arabizi_to_arabic(corr1.lower())

In [26]:
text_arabic1

'داك ل بك لي خديت من اسوس سدق زوين ولكن لباتري شوويا عييانا'

In [28]:
from typing import List, Dict, Tuple
import re
from Levenshtein import distance as levenshtein_distance

def normalize_arabic_french(word: str) -> str:
    """
    Rewrite a lowercased word using a fixed list of Darija/French
    substitutions.

    The substitutions are applied one after another, in the order listed, so
    a later rule sees the output of the earlier ones (for example
    "alklty" -> "alqlty" -> "alqty").
    """
    # (old, new) pairs; the order is significant.
    substitutions = (
        ('k', 'q'),
        ('9', 'k'),
        ('7', 'h'),
        ('3', 'a'),
        ('2', 'a'),
        ('5', 'kh'),
        ('8', 'h'),
        ('aa', 'a'),
        ('ii', 'i'),
        ('oo', 'o'),
        ('ee', 'e'),
        ('lt', 't'),   # added specifically for "qualité"
        ('ql', 'q'),   # added specifically for "qualité"
    )

    normalized = word.lower()
    for old, new in substitutions:
        normalized = normalized.replace(old, new)
    return normalized

def get_phonetic_key(word: str) -> str:
    """
    Build a simplified phonetic key for a word.

    The word is first passed through normalize_arabic_french, then five
    regex substitutions are applied in a fixed order.
    """
    key = normalize_arabic_french(word)

    # Every run of vowels becomes a single 'a'.
    key = re.sub(r'[aeiou]+', 'a', key)
    # Any lowercase letter repeated consecutively is reduced to one.
    key = re.sub(r'([a-z])\1+', r'\1', key)
    key = re.sub('ph', 'f', key)
    # No 'u' is left after the vowel step above, so this rule cannot match.
    key = re.sub('qu', 'k', key)
    key = re.sub('x', 'ks', key)

    return key

def find_best_match(darija_word: str, french_words: List[str], threshold: float = 0.6) -> Tuple[str, float]:
    """
    Return the French word closest to a Darija word, with its score.

    For each candidate, two normalised Levenshtein similarities are blended:
    70% on the phonetic keys and 30% on the normalised Darija word against
    the lowercased candidate. The first candidate with the strictly highest
    score wins. Returns (None, 0) when the best score is below `threshold`.
    """
    source_plain = normalize_arabic_french(darija_word)
    source_key = get_phonetic_key(darija_word)

    def _similarity(left: str, right: str) -> float:
        # 1 - edit distance scaled by the longer string.
        return 1 - (levenshtein_distance(left, right) / max(len(left), len(right)))

    winner = None
    top_score = 0

    for candidate in french_words:
        candidate_plain = candidate.lower()
        candidate_key = get_phonetic_key(candidate)

        key_score = _similarity(source_key, candidate_key)
        plain_score = _similarity(source_plain, candidate_plain)

        blended = (key_score * 0.7) + (plain_score * 0.3)
        if blended > top_score:
            top_score, winner = blended, candidate

    if top_score >= threshold:
        return (winner, top_score)
    return (None, 0)

# Known Darija -> French correspondences, keyed by the lowercase Darija
# spelling. convert_darija_to_french checks this table before it tries
# phonetic matching.
KNOWN_MAPPINGS = {
    "alklty": "qualité",
    "kwaliti": "qualité",
    "kaliti": "qualité",
    "kvaliti": "qualité",
    # Add further known correspondences here
}

def convert_darija_to_french(word: str, french_words: List[str] = None) -> Dict:
    """
    Convert a Darija word to French.

    The lowercased word is looked up in KNOWN_MAPPINGS first (confidence
    1.0). Otherwise find_best_match is run against `french_words`, or
    against a small built-in list when `french_words` is None.

    Returns:
        A dict with the keys 'original', 'converted', 'confidence' and
        'method' ('known_mapping', 'phonetic_matching' or 'no_match').
    """
    lookup_key = word.lower()
    if lookup_key in KNOWN_MAPPINGS:
        return {
            'original': word,
            'converted': KNOWN_MAPPINGS[lookup_key],
            'confidence': 1.0,
            'method': 'known_mapping'
        }

    # Fall back to the default vocabulary when the caller gave none.
    candidates = french_words
    if candidates is None:
        candidates = ["qualité", "quantité", "calcul", "calibre", "qualifié"]

    best_match, score = find_best_match(word, candidates)

    if not best_match:
        return {
            'original': word,
            'converted': None,
            'confidence': 0,
            'method': 'no_match'
        }

    return {
        'original': word,
        'converted': best_match,
        'confidence': round(score, 3),
        'method': 'phonetic_matching'
    }

# Usage example
if __name__ == "__main__":
    # Try several spellings; the first four are in KNOWN_MAPPINGS, the last
    # one ("qaliti") is not and goes through phonetic matching.
    test_words = ["alklty", "kwaliti", "kaliti", "kvaliti","qaliti"]

    for word in test_words:
        result = convert_darija_to_french(word)
        print(f"\nMot original: {result['original']}")
        print(f"Conversion: {result['converted']}")
        print(f"Confiance: {result['confidence']}")
        print(f"Méthode: {result['method']}")


Mot original: alklty
Conversion: qualité
Confiance: 1.0
Méthode: known_mapping

Mot original: kwaliti
Conversion: qualité
Confiance: 1.0
Méthode: known_mapping

Mot original: kaliti
Conversion: qualité
Confiance: 1.0
Méthode: known_mapping

Mot original: kvaliti
Conversion: qualité
Confiance: 1.0
Méthode: known_mapping

Mot original: qaliti
Conversion: qualité
Confiance: 0.798
Méthode: phonetic_matching


In [29]:
# NOTE(review): this cell repeats the earlier brand-matching cell verbatim
# (same brands, same input, same l1/l2 construction); it only matters if the
# globals were changed in between.
brands = ["Marjane", "Carrefour", "Atacadao", "asus", "Adidas", "Keito"]
matches = find_similar_brands(txtdarija, brands, threshold=0.7)
l1=[]
l2=[]
for match in matches:
    l1.append(match['found_word'])
    l2.append(match['matched_brand'])

In [30]:
def filterbrand(l, brands):
    """Split rows into those that mention a brand and those that do not.

    Args:
        l: Rows whose first element is the tweet text.
        brands: A single brand name (one string, despite the plural name).

    Returns:
        (v, nv): rows where find_similar_brands matched the brand at
        threshold 0.7, and the remaining rows.
    """
    v, nv = [], []
    for row in l:
        hits = find_similar_brands(row[0], [brands], threshold=0.7)
        matched_names = [hit['matched_brand'] for hit in hits]
        target = v if brands in matched_names else nv
        target.append(row)
    return v, nv




In [31]:
def filterproduit(l, brands, feature):
    """Split rows into those that mention the brand or a product, and the rest.

    Args:
        l: Rows whose first element is the tweet text.
        brands: A single brand name (one string, despite the plural name).
        feature: List of product/model names to look for when the brand
            itself is not matched.

    Returns:
        (v, nv): rows matching the brand or at least one feature at
        threshold 0.7, and the rows matching neither. Every input row ends
        up in exactly one of the two lists.
    """
    v = []
    nv = []
    for row in l:
        brand_hits = find_similar_brands(row[0], [brands], threshold=0.7)
        if any(hit['matched_brand'] == brands for hit in brand_hits):
            v.append(row)
            continue

        # The decision is made once per row, after all feature matches are
        # known. Previously it ran inside the match loop, which dropped rows
        # with no match and duplicated rows with several matches.
        feature_hits = find_similar_brands(row[0], feature, threshold=0.7)
        if feature_hits:
            v.append(row)
        else:
            nv.append(row)
    return v, nv


In [32]:
# prompt: je veux une fonction tel que il prend un fichier csv et il le transforme en une liste une une chaque ligne et un element de la liste

import csv

def csv_to_list(file_path: str) -> list:
    """
    Read a CSV file into a list of rows, each row being a list of strings.

    Args:
        file_path: The path to the CSV file (read as UTF-8).

    Returns:
        A list of lists representing the CSV data (the header row, if any,
        is included as the first element), or None if an error occurs. The
        error is printed rather than raised.
    """
    try:
        # newline='' hands line-ending handling to the csv module, so line
        # breaks inside quoted fields are preserved as written.
        with open(file_path, 'r', encoding='utf-8', newline='') as file:
            return list(csv.reader(file))
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [33]:
liste_des_tweet=csv_to_list("Dataset_Final_avec_Fautes__Ferrari_et_Mod_les_.csv")

In [34]:
liste_des_tweet

[['Tweet'],
 ['chi vidéo ghriba 3la Ferary t3lmt bzf haja jdida'],
 ['Ferary rah jdid b spécifications 3alya'],
 ['l design dyal 812 Superfest rah ghzal'],
 ['l design dyal Ferary rah wa3r bzf'],
 ["klash bzat mn Ferary m3 l'autre marque"],
 ['3jbtni Calefornia'],
 ['Fari wa7d mn a7sn voitures f l3alam'],
 ['Ferary rah jdid b spécifications 3alya'],
 ['lkhdma dyal Ferar f tech wa3ra'],
 ['chi vidéo ghriba 3la Ferary t3lmt bzf haja jdida'],
 ['chaf chi wa7d la voiture dyal Ferr lyouma?'],
 ['3jbtni 812 Superfest'],
 ['l design dyal Ferary rah wa3r bzf'],
 ['3jbtni Calefornia'],
 ['bghit nji nsem3 chi music calm lyouma'],
 ['l design dyal Ferary rah wa3r bzf'],
 ['lahla ysma7 lina mn lbirocracy f lbaladiya'],
 ["klash bzat mn Ferary m3 l'autre marque"],
 ['kan7lm nji njrb chi nhar 812 Superfest'],
 ['l design dyal Ferary rah wa3r bzf'],
 ['makan7lm ghir b Fery fchi nhar'],
 ['Fari kay3jbni bzaf mn lmodèles dyalhom'],
 ['3jbtni Triboto'],
 ['kan7lm nji njrb chi nhar Portofena'],
 ['chi vi

In [35]:
v,vn=filterbrand(liste_des_tweet,"Ferrari")

In [36]:
vn

[['Tweet'],
 ['l design dyal 812 Superfest rah ghzal'],
 ['3jbtni Calefornia'],
 ['Fari wa7d mn a7sn voitures f l3alam'],
 ['chaf chi wa7d la voiture dyal Ferr lyouma?'],
 ['3jbtni 812 Superfest'],
 ['3jbtni Calefornia'],
 ['bghit nji nsem3 chi music calm lyouma'],
 ['lahla ysma7 lina mn lbirocracy f lbaladiya'],
 ['kan7lm nji njrb chi nhar 812 Superfest'],
 ['makan7lm ghir b Fery fchi nhar'],
 ['Fari kay3jbni bzaf mn lmodèles dyalhom'],
 ['3jbtni Triboto'],
 ['kan7lm nji njrb chi nhar Portofena'],
 ['chaf chi wa7d la voiture dyal Ferr lyouma?'],
 ['kan7lm nji njrb chi nhar 488'],
 ['l design dyal Triboto rah ghzal'],
 ['3jbtni GTC4Luso'],
 ['Fari kay3jbni bzaf mn lmodèles dyalhom'],
 ['Fari kay3jbni bzaf mn lmodèles dyalhom'],
 ['n9adti chi hwayj lyoum, b9a ghada t3aya'],
 ['3jbtni SF99'],
 ['njarb l new model dyal Farri wahadi!'],
 ['l design dyal 812 Superfest rah ghzal'],
 ['Fari wa7d mn a7sn voitures f l3alam'],
 ['l design dyal Triboto rah ghzal'],
 ['3jbtni 488'],
 ['3jbtni LaFe

In [37]:
len(vn)

71

In [38]:
models = ["488", "812 Superfast", "Roma", "SF90", "Portofino", "LaFerrari", "F8 Tributo", "Monza", "California", "GTC4Lusso"]


In [39]:
v,nv=filterproduit(liste_des_tweet,"Ferrari",models)

In [40]:
len(liste_des_tweet)

101

In [41]:
len(v)

60

In [42]:
v

[['chi vidéo ghriba 3la Ferary t3lmt bzf haja jdida'],
 ['Ferary rah jdid b spécifications 3alya'],
 ['l design dyal 812 Superfest rah ghzal'],
 ['l design dyal Ferary rah wa3r bzf'],
 ["klash bzat mn Ferary m3 l'autre marque"],
 ['3jbtni Calefornia'],
 ['Ferary rah jdid b spécifications 3alya'],
 ['lkhdma dyal Ferar f tech wa3ra'],
 ['chi vidéo ghriba 3la Ferary t3lmt bzf haja jdida'],
 ['3jbtni 812 Superfest'],
 ['l design dyal Ferary rah wa3r bzf'],
 ['3jbtni Calefornia'],
 ['l design dyal Ferary rah wa3r bzf'],
 ["klash bzat mn Ferary m3 l'autre marque"],
 ['kan7lm nji njrb chi nhar 812 Superfest'],
 ['l design dyal Ferary rah wa3r bzf'],
 ['3jbtni Triboto'],
 ['kan7lm nji njrb chi nhar Portofena'],
 ['chi vidéo ghriba 3la Ferary t3lmt bzf haja jdida'],
 ['kan7lm nji njrb chi nhar 488'],
 ['chi vidéo ghriba 3la Ferary t3lmt bzf haja jdida'],
 ['lkhdma dyal Ferar f tech wa3ra'],
 ['l design dyal Triboto rah ghzal'],
 ['3jbtni GTC4Luso'],
 ['Ferary rah jdid b spécifications 3alya'],


In [43]:
def filterproduit(l,qualité,prix):
  """Split rows into quality-related, price-related and unrelated groups.

  Args:
    l: Iterable of rows; the text to inspect is the first element of each row.
    qualité: Keywords passed to ``find_similar_brands`` with threshold 0.7.
    prix: Keywords passed to ``find_similar_brands`` with threshold 0.8.

  Returns:
    A tuple ``(quality_rows, price_rows, other_rows)``. A row that matches
    both keyword sets appears in both of the first two lists; a row that
    matches neither goes to the third.
  """
  quality_rows, price_rows, other_rows = [], [], []
  for row in l:
    quality_hits = find_similar_brands(row[0], qualité, threshold=0.7)
    price_hits = find_similar_brands(row[0], prix, threshold=0.8)

    # Keep only the keyword each hit was matched against.
    quality_terms = [hit['matched_brand'] for hit in quality_hits]
    price_terms = [hit['matched_brand'] for hit in price_hits]

    if quality_terms:
      quality_rows.append(row)
    if price_terms:
      price_rows.append(row)
    if not quality_terms and not price_terms:
      other_rows.append(row)
  return quality_rows, price_rows, other_rows


In [44]:
# Quality-related keywords (French, English and Darija spellings).
# Every spelling is its own list element so each one can be matched on its own.
qualité=["qualité","quality","kalité","jawda","ljawda","lqualité","lqualiti"]

In [45]:
prix=['taman','price',"pri","lprix","prix",'flous','derham','dh','dhs','dirham','ghali','rkhis']

In [46]:
listetweet=csv_to_list("Dataset_Final_en_Darija_sur_une_Marque_de_T_l_phone.csv")

In [47]:
lq,lp,lr=filterproduit(listetweet,qualité,prix)

In [48]:
lp

[['Samsung ghaly bzaaaf compared to other brands'],
 ['wach Samsongue worth lprix lli tay3tdo?'],
 ['Samsong 3ndo models ghlyin w price kayzid'],
 ['ma kay7ellich f prix dyal Samsoon'],
 ['Samsung ghaly bzaaaf compared to other brands'],
 ['l prix dyal Samsoon ghali shwiya'],
 ['price dyal Samson kayn chi had kaygol normal?'],
 ['l prix dyal Samsoon ghali shwiya'],
 ['wach Samsongue worth lprix lli tay3tdo?'],
 ['ma kay7ellich f prix dyal Samsoon'],
 ['wach Samsongue worth lprix lli tay3tdo?'],
 ['ma kay7ellich f prix dyal Samsoon'],
 ['price dyal Samson kayn chi had kaygol normal?'],
 ['Samsung ghaly bzaaaf compared to other brands'],
 ['Samsong 3ndo models ghlyin w price kayzid'],
 ['l prix dyal Samsoon ghali shwiya'],
 ['Samsong 3ndo models ghlyin w price kayzid'],
 ['Samson rah mzyan bs7 flus kay7erqo bzf'],
 ['price dyal Samson kayn chi had kaygol normal?'],
 ['Samson rah mzyan bs7 flus kay7erqo bzf'],
 ['Samson rah mzyan bs7 flus kay7erqo bzf']]

In [49]:
sentiment_positif=["3jbni", "7bit", "7ebit","nadi","zwin","wa3r","naadia","mfrge3a","ghzal"]

In [50]:
sentiment_negatif=["ma3jbnich","ma7meltouch","ma7ebitch","khayb","na9ess","7amed","fachel","probleme","mochkil","khasr","habta"]

In [51]:
def sentiment_analysis(l,sentimentpositif,sentimentnegatif):
  """Sort texts into positive, negative and unmatched groups by keyword hits.

  Args:
    l: Iterable of texts.
    sentimentpositif: Positive keywords, matched via ``find_similar_brands``
      with threshold 0.7.
    sentimentnegatif: Negative keywords, matched via ``find_similar_brands``
      with threshold 0.8.

  Returns:
    A tuple ``(positive_texts, negative_texts, unmatched_texts)``. A text
    hitting both keyword sets is listed in both of the first two lists.
  """
  positive_texts, negative_texts, unmatched_texts = [], [], []
  for text in l:
    positive_hits = find_similar_brands(text, sentimentpositif, threshold=0.7)
    negative_hits = find_similar_brands(text, sentimentnegatif, threshold=0.8)

    # Reduce each hit to the keyword it was matched against.
    positive_terms = [hit['matched_brand'] for hit in positive_hits]
    negative_terms = [hit['matched_brand'] for hit in negative_hits]

    if positive_terms:
      positive_texts.append(text)
    if negative_terms:
      negative_texts.append(text)
    if not positive_terms and not negative_terms:
      unmatched_texts.append(text)
  return positive_texts, negative_texts, unmatched_texts


In [52]:
# prompt: je veux une fonction qui lie le csv et rend une liste de la premiere colonne

import csv

def get_first_column(file_path):
  """Read a CSV file and return the values of its first column.

  Blank lines, which ``csv.reader`` yields as empty rows, are skipped so that
  a single stray empty line does not discard the whole file.

  Args:
    file_path: The path to the CSV file (decoded as UTF-8).

  Returns:
    A list with the first field of every non-empty row, in file order
    (a header row, if the file has one, is included), or None if the file
    cannot be found or read.
  """
  try:
    # newline='' lets the csv module do its own line-ending handling, which
    # keeps newlines embedded in quoted fields intact.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
      return [row[0] for row in csv.reader(file) if row]
  except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    return None
  except Exception as e:
    # Best-effort helper: report any other read/decode problem and signal
    # failure with None instead of raising.
    print(f"An error occurred: {e}")
    return None

In [53]:
sent=get_first_column("Dataset_d_Analyse_de_Sentiments_sur_Ferrari.csv")

In [54]:
l=["3jbni had lmntouj","had lmntouj khaybe bzf","had telephone ma7mltoch","lpc zwin mais lqualité khayba"]

In [55]:
p,n,r=sentiment_analysis(l,sentiment_positif,sentiment_negatif)

In [229]:
# Positive sentiment lexicon: Darija word (Latin-script spelling) -> weight that
# the sentiment_analysis* functions add to a text's score when the word matches.
# NOTE(review): "wa3r" and "mfrge3a" are weighted 8.0 while every other entry is
# at most 1.0 — possibly a typo for 0.8; confirm before comparing score magnitudes.
sentiment_positif = {
    "3jbni": 0.7, "7bit": 0.9, "7ebit": 0.7, "nadi": 0.8, "zwin": 0.7,"zouin":0.7,"zine":0.7,
    "wa3r": 8.0, "naadia": 0.7, "mfrge3a": 8.0, "ghzal": 1.0 , "makhaybch": 0.4 ,"nadiya":0.8, "nadia":0.8 , "mana9ssach":0.4,"matay7ach":0.4
}

# Negative sentiment lexicon: same structure, with negative weights.
# NOTE(review): "khayb" is weighted -7.0 while every other entry lies within
# [-1.0, -0.4] — possibly a typo for -0.7; confirm before comparing score magnitudes.
sentiment_negatif = {
    "ma3jbnich": -0.7, "ma7ebitch": -0.9, "khayb": -7.0, "na9ess": -0.7,"ma7ebitouch":-0.6,
    "7amed": -0.6, "fachel": -1.0, "probleme": -0.8, "mochkil": -0.9,
    "khasr": -0.7, "habta": -0.8, "ma7meltoch": -0.9 ,"tay7a" : -0.7 , "mazwinch" :-0.6,"mazouinch":-0.6 , "manadiyach":-0.6,"manadiach":-0.6,"mawa3rach":-0.4, "3iyan":-0.7,"frya":-0.7
}

In [57]:
def sentiment_analysis2(l, sentimentpositif, sentimentnegatif):
    """Score each text with weighted sentiment lexicons and classify it by sign.

    Args:
        l: Iterable of texts.
        sentimentpositif: Mapping word -> weight; matched with threshold 0.7.
        sentimentnegatif: Mapping word -> weight; matched with threshold 0.8.

    Returns:
        A tuple ``(positive, negative, neutral)`` of lists of ``(text, score)``
        pairs, where the score is the sum of the weights of all matched words
        and the list is chosen by the score being > 0, < 0 or neither.
    """
    positive, negative, neutral = [], [], []

    for tweet in l:
        # Both lookups are done up front, before any weight is accumulated.
        positive_hits = find_similar_brands(tweet, list(sentimentpositif.keys()), threshold=0.7)
        negative_hits = find_similar_brands(tweet, list(sentimentnegatif.keys()), threshold=0.8)

        total = 0
        for hits, lexicon in ((positive_hits, sentimentpositif), (negative_hits, sentimentnegatif)):
            for hit in hits:
                # Negative lexicon weights are already negative, so adding is enough.
                total += lexicon.get(hit['matched_brand'], 0)

        if total > 0:
            bucket = positive
        elif total < 0:
            bucket = negative
        else:
            bucket = neutral
        bucket.append((tweet, total))

    return positive, negative, neutral


In [58]:
p,n,r=sentiment_analysis2(l, sentiment_positif, sentiment_negatif)

In [59]:
p

[('3jbni had lmntouj', 0.7)]

In [60]:
def sentiment_analysis3(l, sentimentpositif, sentimentnegatif):
    """Weighted sentiment scoring with sentence-level negation.

    If a text contains any negation word as a whole token, the weight of every
    matched sentiment word in that text is subtracted instead of added.

    Args:
        l: Iterable of texts.
        sentimentpositif: Mapping word -> weight; matched with threshold 0.7.
        sentimentnegatif: Mapping word -> weight; matched with threshold 0.8.

    Returns:
        A tuple ``(positive, negative, neutral)`` of lists of ``(text, score)``
        pairs, split by the sign of the score.
    """
    positive, negative, neutral = [], [], []

    # Tokens that flip the sentiment of the whole text.
    negate_words = ["machi", "ma","mchi","maxi","machy"]

    def _contains_negation(text):
        tokens = text.split()
        return any(negator in tokens for negator in negate_words)

    for tweet in l:
        positive_hits = find_similar_brands(tweet, list(sentimentpositif.keys()), threshold=0.7)
        negative_hits = find_similar_brands(tweet, list(sentimentnegatif.keys()), threshold=0.8)

        total = 0
        for hits, lexicon in ((positive_hits, sentimentpositif), (negative_hits, sentimentnegatif)):
            for hit in hits:
                weight = lexicon.get(hit['matched_brand'], 0)
                if _contains_negation(tweet):
                    total -= weight  # negation present: invert the contribution
                else:
                    total += weight

        if total > 0:
            positive.append((tweet, total))
        elif total < 0:
            negative.append((tweet, total))
        else:
            neutral.append((tweet, total))

    return positive, negative, neutral


In [61]:
l=["3jbni had lmntouj","had lmntouj khaybe bzf","had telephone ma7mltoch","lpc machi khayb","lferari machi khayba", "houa sara7a zwin wlkn lqualité khayba"]

In [62]:
positif,negatif,rien=sentiment_analysis3(l, sentiment_positif, sentiment_negatif)

In [63]:
positif

[('3jbni had lmntouj', 0.7),
 ('lpc machi khayb', 7.0),
 ('lferari machi khayba', 7.0)]

In [64]:
def sentiment_analysis4(l, sentimentpositif, sentimentnegatif):
    """Weighted sentiment scoring that handles contrast words such as 'wlkn' (but).

    When a text contains a contrast word, only the part after the first such
    word is scored; otherwise the whole text is scored. Within the scored part,
    the presence of any negation token inverts every matched weight.
    Relies on the existing ``find_similar_brands`` helper.

    Args:
        l: Iterable of texts.
        sentimentpositif: Mapping word -> weight; matched with threshold 0.7.
        sentimentnegatif: Mapping word -> weight; matched with threshold 0.8.

    Returns:
        A tuple ``(positive, negative, neutral)`` of lists of
        ``(original_text, score)`` pairs, split by the sign of the score.
    """
    positive, negative, neutral = [], [], []

    # Tokens that invert the sentiment.
    negate_words = ["machi", "ma", "mchi", "maxi", "machy"]

    # Spellings of "wlkn" (but).
    contrast_words = ["wlkn", "walakine", "walkin", "walakin", "welakine", "mais", "lakin"]

    def _scored_part(text):
        """Return the text after the first contrast word, or the text itself."""
        tokens = text.split()
        for position, token in enumerate(tokens):
            if token.lower() in contrast_words:
                return " ".join(tokens[position + 1:])
        return text

    for text in l:
        scope = _scored_part(text)

        positive_hits = find_similar_brands(scope, list(sentimentpositif.keys()), threshold=0.7)
        negative_hits = find_similar_brands(scope, list(sentimentnegatif.keys()), threshold=0.8)

        score = 0
        for hits, lexicon in ((positive_hits, sentimentpositif), (negative_hits, sentimentnegatif)):
            for hit in hits:
                weight = lexicon.get(hit['matched_brand'], 0)
                if any(negator in scope.split() for negator in negate_words):
                    score -= weight
                else:
                    score += weight

        # Classify by the sign of the final score.
        if score > 0:
            positive.append((text, score))
        elif score < 0:
            negative.append((text, score))
        else:
            neutral.append((text, score))

    return positive, negative, neutral

In [87]:
def sentiment_analysis6(l, sentimentpositif, sentimentnegatif):
    """
    Score Darija texts with negation, contrast and intensifier handling.

    Each text is lower-cased and stripped, then split at its first contrast
    word (e.g. "wlkn", "mais") into a "before" and an "after" segment. Every
    segment is scored on its own: matched lexicon weights are sign-flipped when
    a negation word occurs within the three preceding words, and multiplied by
    the product of all intensifiers found in the segment. The segment after the
    contrast word counts 1.2 times.

    Args:
        l: List of texts to analyse.
        sentimentpositif: Mapping of positive words to their weights.
        sentimentnegatif: Mapping of negative words to their weights.

    Returns:
        lq: ``(normalized_text, score)`` pairs with score > 0, highest first.
        lp: ``(normalized_text, score)`` pairs with score < 0, lowest first.
        lr: ``(normalized_text, score)`` pairs for all remaining texts.
    """
    lq = []  # positive texts
    lp = []  # negative texts
    lr = []  # neutral texts

    # Negation words with their spelling variations.
    negate_words = {
        "machi": ["machi", "machy", "mchi", "mashi"],
        "ma": ["ma", "maxi", "maxay", "makay"],
        "la": ["la", "lay", "laa"]
    }

    # Contrast words with their spelling variations.
    contrast_words = {
        "wlkn": ["wlkn", "walakine", "walkin", "walakin", "welakine"],
        "mais": ["mais", "maya", "mayo"],
        "lakin": ["lakin", "lakine", "laken"]
    }

    # Intensifiers and the factor each applies to the segment score.
    intensifiers = {
        "bzf": 1.5,        # very
        "bzaf": 1.5,
        "bzaaf": 1.5,
        "bzef": 1.5,
        "bezaf": 1.5,
        "bazaf": 1.5,
        "bzff": 1.5,
        "ktir": 1.3,       # a lot
        "chwiya": 0.7,     # a little
        "chouia": 0.7,
        "chwia": 0.7,
        "choia": 0.7,
        "kamel": 1.4,      # completely
        "bla9iyass": 1.6,  # extremely
        "mout": 1.8        # "to death"
    }

    # Flatten the variation tables once; only membership is ever needed.
    negation_forms = {form for forms in negate_words.values() for form in forms}
    contrast_forms = {form for forms in contrast_words.values() for form in forms}

    def normalize_text(text):
        """Lower-case the text and trim surrounding whitespace."""
        return text.lower().strip()

    def is_negated(candidates, text_words):
        """Tell whether a matched word is preceded by a negation word.

        ``candidates`` are the spellings to look for, in priority order: the
        word actually found in the text first (a fuzzy match such as "zwina"
        is not present under its lexicon spelling "zwin"), then the lexicon
        key as a fallback. Only the first occurrence of the word is examined,
        and only the three words before it are checked.
        """
        for word in candidates:
            if word in text_words:
                word_index = text_words.index(word)
                window = text_words[max(0, word_index - 3):word_index]
                return any(token in negation_forms for token in window)
        return False

    def get_intensifier_multiplier(text_words):
        """Return the product of the factors of all intensifiers present."""
        multiplier = 1.0
        for word in text_words:
            if word in intensifiers:
                multiplier *= intensifiers[word]
        return multiplier

    for text in l:
        text = normalize_text(text)
        words = text.split()

        # Position of the first contrast word, or -1 if there is none.
        contrast_index = next(
            (i for i, word in enumerate(words) if word in contrast_forms), -1
        )

        if contrast_index != -1:
            segments = [
                " ".join(words[:contrast_index]),
                " ".join(words[contrast_index + 1:]),
            ]
        else:
            segments = [text]

        total_score = 0
        for segment_index, segment in enumerate(segments):
            segment_words = segment.split()
            multiplier = get_intensifier_multiplier(segment_words)
            segment_score = 0

            # Both lexicons use the same 0.7 matching threshold here.
            for lexicon in (sentimentpositif, sentimentnegatif):
                matches = find_similar_brands(segment, list(lexicon.keys()), threshold=0.7)
                for match in matches:
                    matched_word = match['matched_brand']
                    score = lexicon.get(matched_word, 0)

                    if is_negated((match.get('found_word'), matched_word), segment_words):
                        score *= -1

                    segment_score += score * multiplier

            # The segment after the contrast word carries more weight. It is
            # identified by position, so an identical "before" segment is not
            # boosted by mistake.
            if segment_index == 1:
                segment_score *= 1.2

            total_score += segment_score

        result_tuple = (text, total_score)
        if total_score > 0:
            lq.append(result_tuple)
        elif total_score < 0:
            lp.append(result_tuple)
        else:
            lr.append(result_tuple)

    # Order results by sentiment intensity.
    lq.sort(key=lambda x: x[1], reverse=True)
    lp.sort(key=lambda x: x[1])
    lr.sort(key=lambda x: abs(x[1]))

    return lq, lp, lr

In [192]:
def sentiment_analysis7(l, sentimentpositif, sentimentnegatif):
    """
    Word-by-word sentiment scoring for Darija texts with negation, fuzzy
    matching of misspellings, and intensifiers placed before the sentiment word.

    Each word is matched against the positive lexicon (threshold 0.7) and the
    negative lexicon (threshold 0.8). A matched weight is sign-flipped when a
    negation word appears among the three preceding words, and multiplied by
    the factors of any intensifiers found among the two preceding words.

    Returns:
        A tuple ``(positive, negative, neutral)`` of lists of ``(text, score)``
        pairs, split by the sign of the score.
    """
    positive, negative, neutral = [], [], []

    negate_words = ["machi", "machy", "mchi", "mashi", "ma", "maxi", "makay", "la", "lay", "laa"]

    intensifiers = {
        "bzf": 1.5, "bzaf": 1.5, "bzaaf": 1.5, "bzef": 1.5, "bezaf": 1.5,
        "bazaf": 1.5, "bzff": 1.5, "ktir": 1.3, "keteer": 1.3,
        "chwiya": 0.7, "chwia": 0.7, "chouia": 0.7, "choia": 0.7,
        "kamel": 1.4, "bla9iyass": 1.6, "bela9ias": 1.6, "mout": 1.8, "moot": 1.8
    }

    def is_negated(word_index, words):
        """True if one of the (up to) three preceding words is a negation word."""
        preceding = words[max(0, word_index - 3):word_index]
        return any(token in negate_words for token in preceding)

    def get_intensifier_multiplier(word_index, words):
        """Product of the intensifier factors found in the two preceding words."""
        factor = 1.0
        for token in words[max(0, word_index - 2):word_index]:
            hits = find_similar_brands(token, list(intensifiers.keys()), threshold=0.7)
            if hits:
                # Only the first match for the token is taken into account.
                factor *= intensifiers.get(hits[0]['matched_brand'], 1.0)
        return factor

    # (lexicon, matching threshold) pairs, applied in this order to every word.
    lexicon_passes = ((sentimentpositif, 0.7), (sentimentnegatif, 0.8))

    for text in l:
        tokens = text.split()
        total = 0

        for position, token in enumerate(tokens):
            for lexicon, cutoff in lexicon_passes:
                hits = find_similar_brands(token, list(lexicon.keys()), threshold=cutoff)
                for hit in hits:
                    weight = lexicon.get(hit['matched_brand'], 0)
                    factor = get_intensifier_multiplier(position, tokens)
                    if is_negated(position, tokens):
                        weight *= -1
                    total += weight * factor

        if total > 0:
            positive.append((text, total))
        elif total < 0:
            negative.append((text, total))
        else:
            neutral.append((text, total))

    return positive, negative, neutral


In [268]:
def sentiment_analysis8(l, sentimentpositif, sentimentnegatif):
    """
    Word-by-word sentiment scoring for Darija texts with negation, fuzzy
    matching of misspellings, and intensifiers placed after the sentiment word.

    Texts that are empty or whitespace-only are skipped entirely. For the
    others, each word is matched against the positive lexicon (threshold 0.7)
    and the negative lexicon (threshold 0.8). A matched weight is multiplied by
    the factors of intensifiers found among the two following words, and its
    sign is flipped when a negation word appears among the three preceding
    words.

    Returns:
        A tuple ``(positive, negative, neutral)`` of lists of ``(text, score)``
        pairs, split by the sign of the score.
    """
    positive, negative, neutral = [], [], []

    negate_words = ["machi", "machy", "mchi", "mashi", "ma", "maxi", "makay", "la", "lay", "laa"]

    intensifiers = {
        "bzf": 1.5, "bzaf": 1.5, "bzaaf": 1.5, "bzef": 1.5, "bezaf": 1.5,
        "bazaf": 1.5, "bzff": 1.5, "ktir": 1.3, "keteer": 1.3,
        "chwiya": 0.7, "chwia": 0.7, "chouia": 0.7, "choia": 0.7,
        "kamel": 1.4, "bla9iyass": 1.6, "bela9ias": 1.6, "mout": 1.8, "moot": 1.8
    }

    def is_negated(word_index, words):
        """True if one of the (up to) three preceding words is a negation word."""
        preceding = words[max(0, word_index - 3):word_index]
        return any(token in negate_words for token in preceding)

    def get_intensifier_multiplier(words, sentiment_index):
        """Product of the intensifier factors found in the two following words."""
        factor = 1.0
        for token in words[sentiment_index + 1:sentiment_index + 3]:
            hits = find_similar_brands(token, list(intensifiers.keys()), threshold=0.7)
            if hits:
                # Only the first match for the token is taken into account.
                factor *= intensifiers[hits[0]['matched_brand']]
        return factor

    # (lexicon, matching threshold) pairs, applied in this order to every word.
    lexicon_passes = ((sentimentpositif, 0.7), (sentimentnegatif, 0.8))

    for text in l:
        if not text.strip():
            continue  # nothing to score

        tokens = text.split()
        total = 0

        for position, token in enumerate(tokens):
            negated = is_negated(position, tokens)

            for lexicon, cutoff in lexicon_passes:
                hits = find_similar_brands(token, list(lexicon.keys()), threshold=cutoff)
                for hit in hits:
                    factor = get_intensifier_multiplier(tokens, position)
                    contribution = lexicon[hit['matched_brand']] * factor
                    if negated:
                        total += -contribution
                    else:
                        total += contribution

        if total > 0:
            positive.append((text, total))
        elif total < 0:
            negative.append((text, total))
        else:
            neutral.append((text, total))

    return positive, negative, neutral

In [66]:
l=["3jbni had lmntouj","had lmntouj khaybe bzf","had telephone ma7mltoch","lpc machi khayb","lferari machi khayba", "houa sara7a zwin wlkn lqualité khayba","hoa ah l liphone zwin mais l ecran dialo khayba ","lp khayb bzf"]

In [258]:
l1=["mourad machi zine", "miaw, benkiran 7amed wlkn zina", "ana zwin","ana zwin bezf"]

In [260]:
positif,negatif,rien=sentiment_analysis8(l1, sentiment_positif, sentiment_negatif)

In [261]:
negatif

[('mourad machi zine', -0.7)]

In [157]:
find_similar_brands("", sentiment_positif, threshold=0.7)

[{'found_word': '7bitou',
  'matched_brand': '7bit',
  'composite_score': 0.756,
  'details': {'levenshtein_score': 0.667,
   'sequence_score': 0.8,
   'phonetic_score': 0.8}}]

In [116]:
sentiment_positif

{'3jbni': 0.7,
 '7bit': 0.9,
 '7ebit': 0.7,
 'nadi': 0.8,
 'zwin': 0.7,
 'wa3r': 8.0,
 'naadia': 0.7,
 'mfrge3a': 8.0,
 'ghzal': 1.0,
 'makhaybch': 0.4,
 'nadiya': 0.8,
 'nadia': 0.8,
 'mana9ssach': 0.4,
 'matay7ach': 0.4}

In [262]:
l1=["had lmntouj machy zwin wlkn zwin","lpc zwin bezaf","lpc zwin","lpc zwin bzf","tacos 3iyan","l iphone zwin wlkn lqualité khayba"]

In [266]:
positif,negatif,rien=sentiment_analysis8(l1, sentiment_positif, sentiment_negatif)

In [267]:
negatif

[('had lmntouj machy zwin wlkn zwin', -1.4),
 ('tacos 3iyan', -0.7),
 ('l iphone zwin wlkn lqualité khayba', -6.3)]

In [224]:
find_similar_brands("3iyan", sentiment_negatif, threshold=0.5)

[]