In [None]:
!pip install emoji



In [None]:
!pip install -q arabic-reshaper

In [None]:
!pip install -q python-bidi

In [None]:
!pip install -q langid

In [None]:
!pip install -q python-Levenshtein

In [None]:
!pip install -q googletrans==4.0.0rc1

In [None]:
!pip install -q google-transliteration-api

In [None]:
import re
import pandas as pd
import emoji
import arabic_reshaper
from bidi.algorithm import get_display
import unicodedata
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import langid
import Levenshtein as lev
from collections import Counter
from googletrans import Translator
from google.transliteration import transliterate_word

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the annotated corpus and keep only the comment text and its label.
# .copy() detaches the two-column frame from the full sheet so that the later
# column assignments on `df` do not trigger pandas' SettingWithCopyWarning.
df = pd.read_excel('AlgD_Toxicity_Speech_Dataset.xlsx')
df = df[['comment', 'Hate speech']].copy()
df

Unnamed: 0,comment,Hate speech
0,اعلام ضد الجزائر و ضد الشعب الجزائري.,no
1,تمثيل و سيناريو رديء جدا و عنوان خادع,yes
2,خصني قهوة بشيح \nاقعد صحيح لا طيح 😂😂😂😂,no
3,دفاع عن النفس فرض واجب على المستعضفين,no
4,فالاخير طفرت غير فالشعب الماضي انتهاء,no
...,...,...
14145,سمحيلي دقيقة معليش تقوليلي شعال راكي دايرة من ...,yes
14146,اسمه فركوس ، اتقوا الله في نفسكم ، كلنا مسلمون...,no
14147,شوف هذاك مستوى نتاع الصحافة الوزير يسجل هدفا م...,no
14148,اعلام الكذب\nالعالم برمته يضبطكم وخصوصا حين نق...,yes


#Data cleaning


In [None]:
# NLTK stop-word lists for Arabic, English and French, stored as sets so that
# membership checks are cheap.
arabic_stopwords, english_stopwords, french_stopwords = (
    set(stopwords.words(language))
    for language in ('arabic', 'english', 'french')
)

def normalize_arabic(text):
    """Collapse common Arabic letter variants onto one canonical letter.

    The rules run in order: every listed Alef form becomes a bare Alef, Alef
    Maqsura and Yeh-with-Hamza become Yeh, Waw-with-Hamza becomes Waw,
    Teh Marbuta becomes Heh and Gaf becomes Kaf.

    Args:
        text: The string to normalise.

    Returns:
        The string with the substitutions applied.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ئ", "ي"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for variant_pattern, canonical_letter in substitutions:
        text = re.sub(variant_pattern, canonical_letter, text)
    return text

# Characters stripped by remove_diacritics, written as explicit code points so
# the pattern does not depend on invisible combining marks in the source:
#   U+0640          Tatweel (elongation stroke)
#   U+064B..U+0652  Tanwin Fath, Tanwin Damm, Tanwin Kasr, Fatha, Damma,
#                   Kasra, Shadda (Tashdid) and Sukun
# Compiled once at definition time instead of on every call.
_ARABIC_DIACRITICS_RE = re.compile('[\u0640\u064B-\u0652]')


def remove_diacritics(text):
    """Strip Arabic short-vowel marks, Shadda, Sukun and Tatweel from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character in U+0640 and U+064B..U+0652 removed;
        all other characters are left untouched.
    """
    return _ARABIC_DIACRITICS_RE.sub('', text)

def remove_repeated_characters(text):
    """Squash every run of three or more identical characters down to one.

    Runs of exactly two characters are kept as they are. Newlines are never
    matched because ``.`` does not match them by default.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r'\1', text)

def clean_text(text):
    """Run the full cleaning pipeline on a single comment.

    Steps, in order: drop emoji, normalise Arabic letter variants, strip
    diacritics, remove digits and punctuation, lowercase, drop Arabic /
    English / French stop words, collapse long character runs, and finally
    collapse whitespace.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned comment as a single-spaced, stripped string.
    """
    # Replace emoji with empty string
    text = emoji.replace_emoji(text, replace='')

    # Normalise letter variants first, then strip diacritics
    text = normalize_arabic(text)
    text = remove_diacritics(text)

    # Remove digits. \d matches every Unicode decimal digit, so Arabic-Indic
    # digits are removed as well as ASCII 0-9.
    text = re.sub(r'\d', '', text)
    # Remove punctuation. "_" counts as a word character for \w, so it has to
    # be listed explicitly or it survives the cleaning.
    text = re.sub(r'[^\w\s]|_', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Remove stop words.
    # NOTE(review): the text has already been normalised at this point while
    # the stop-word sets are compared as-is; any stop word spelled with a
    # letter that normalize_arabic rewrites can no longer match - confirm
    # whether the Arabic list should be normalised the same way.
    kept_words = [
        word
        for word in text.split()
        if word not in arabic_stopwords
        and word not in english_stopwords
        and word not in french_stopwords
    ]
    text = ' '.join(kept_words)

    # Remove repeated characters
    text = remove_repeated_characters(text)

    # Remove extra spaces and newline characters
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Run the three stand-alone normalisation passes, then the full clean_text
# pipeline, each one rewriting the 'comment' column in turn.
for cleaning_step in (normalize_arabic, remove_diacritics, remove_repeated_characters, clean_text):
    df['comment'] = df['comment'].apply(cleaning_step)
df

Unnamed: 0,comment,Hate speech
0,اعلام ضد الجزاير ضد الشعب الجزايري,no
1,تمثيل سيناريو رديء جدا عنوان خادع,yes
2,خصني قهوه بشيح اقعد صحيح طيح,no
3,دفاع النفس فرض واجب علي المستعضفين,no
4,فالاخير طفرت فالشعب الماضي انتهاء,no
...,...,...
14145,سمحيلي دقيقه معليش تقوليلي شعال راكي دايره كيل...,yes
14146,اسمه فركوس اتقوا الله نفسكم كلنا مسلمون اكابر ...,no
14147,شوف هذاك مستوي نتاع الصحافه الوزير يسجل هدفا م...,no
14148,اعلام الكذب العالم برمته يضبطكم وخصوصا نقل اعل...,yes


#Language detection

In [None]:
def categorize_comments(comment):
    """Assign a comment to a script/language bucket.

    The language is taken from ``langid.classify``. Comments classified as
    Arabic are split again by script: text made only of Arabic-script letters
    and whitespace is 'arabic', anything else (for example Arabic mixed with
    Latin tokens) is 'arabizi'.

    Args:
        comment: The comment text to classify.

    Returns:
        One of 'arabic', 'arabizi', 'french_english' or 'other'.
    """
    lang, _ = langid.classify(comment)

    if lang == 'ar':
        # U+0621..U+064A are the basic Arabic letters. U+0671..U+06D3 adds the
        # extended letters used to write dialect sounds (e.g. U+06A8, the Qaf
        # with three dots), so pure Arabic-script comments that contain them
        # are no longer mislabelled as Arabizi.
        if re.match(r'^[\u0621-\u064A\u0671-\u06D3\s]+$', comment):
            return 'arabic'
        return 'arabizi'
    if lang in ['fr', 'en']:
        return 'french_english'
    return 'other'

df['category'] = df['comment'].apply(categorize_comments)

# Build one independent frame per category, each with a fresh 0..n-1 index.
# No frame is created for the 'other' category.
df_arabic, df_french_english, df_arabizi = (
    df[df['category'] == label].reset_index(drop=True)
    for label in ('arabic', 'french_english', 'arabizi')
)



#Abbreviation processing

In [None]:
# Chat abbreviations and their expanded forms.
dictionnaire_normalisation = {
    "slm": "salam",
    "mrc": "merci",
    "svp": "s'il vous plaît",
    "nn": "non",
    "blk": "balek",
    "dz": "Algérie",
    "pdp": "photo de profil",
    "rdv": "rendez-vous",
    "jtm": "je t'aime",
    "wsh": "wach",
    "b1": "bien",
    "bcp": "beaucoup",
    "stp": "s'il te plaît",
    "mdr": "mort de rire",
    "bn8" :"bonne nuit",
    "pk": "pourquoi",
    "bsr": "bonsoir",
    "dsl": "désolé",
    "vrm": "vraiment",
    "cmnt": "commentaire",
    "cmntr":"commentaire",
    "inchallah": "incha'Allah",
    "hmdlh": "hamdoulillah",
    "tjrs": "toujours",
    "tjr":"toujours",
    "msg": "message",
    "a+": "à plus",
    "cc": "coucou",
    "dacc": "d'accord",
    "tkt": "t'inquiète",
    "qd": "quand",
}

# One pattern that matches any abbreviation as a whole token only.
# - re.escape keeps keys such as "a+" literal instead of being read as a regex.
# - The (?<!\w) / (?!\w) guards stop a key from matching inside a longer word
#   (plain substring replacement turned "dacc" into "dacoucou" via "cc", and
#   "cmntr" into "commentairer" via "cmnt").
# - Longer keys are listed first so the most specific abbreviation wins.
# NOTE(review): clean_text strips digits and punctuation before this step, so
# keys containing them ("b1", "bn8", "a+") can only match on uncleaned text.
_abbreviation_pattern = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(
        re.escape(abbreviation)
        for abbreviation in sorted(dictionnaire_normalisation, key=len, reverse=True)
    )
    + r')(?!\w)'
)


def _expand_abbreviation(match):
    """Return the expansion for the abbreviation captured by ``match``."""
    return dictionnaire_normalisation[match.group(0)]


# Expand in a single pass so that the text produced by one expansion is never
# re-scanned and rewritten by another key.
df_arabizi['comment'] = df_arabizi['comment'].str.replace(
    _abbreviation_pattern, _expand_abbreviation, regex=True
)
df_french_english['comment'] = df_french_english['comment'].str.replace(
    _abbreviation_pattern, _expand_abbreviation, regex=True
)


#Phonetic encoding and word spelling

In [None]:
# Consonant -> digit table of the American Soundex algorithm.
_SOUNDEX_CODES = {
    letter: digit
    for letters, digit in (
        ("BFPV", "1"), ("CGJKQSXZ", "2"), ("DT", "3"),
        ("L", "4"), ("MN", "5"), ("R", "6"),
    )
    for letter in letters
}
# Letters that are not coded but DO separate two equal consonant codes.
_SOUNDEX_SEPARATORS = "AEIOUY"


def soundex(word):
    """Return the 4-character American Soundex code of ``word``.

    The code is the upper-cased first character followed by up to three
    digits, right-padded with zeros. Adjacent consonants sharing a digit are
    coded once, including when the first of them is the leading letter
    (``Pfister`` -> ``P236``). Vowels and Y separate duplicates; H and W do
    not (``Ashcraft`` -> ``A261``). Characters outside A-Z after the first
    position are ignored.

    Args:
        word: The word to encode.

    Returns:
        The Soundex code, or an empty string when ``word`` is empty.
    """
    word = word.upper()
    if not word:
        # Nothing to encode; the original indexing would raise IndexError here.
        return ""

    encoded = word[0]
    # Seed with the first letter's own digit so an immediately following
    # consonant of the same class is not emitted a second time.
    previous_digit = _SOUNDEX_CODES.get(word[0], "")

    for char in word[1:]:
        if char in _SOUNDEX_SEPARATORS:
            previous_digit = ""
        elif char in _SOUNDEX_CODES:
            digit = _SOUNDEX_CODES[char]
            if digit != previous_digit:
                encoded += digit
            previous_digit = digit
        # H, W and any other character neither emit a digit nor reset the run.

    return (encoded + "000")[:4]

# Function to extract words from sentences
def extract_words(sentences):
    """Return every ``\\w+`` token of every sentence as one flat list.

    Tokens keep the order in which they appear; sentences are processed in
    iteration order.
    """
    token_pattern = re.compile(r'\b\w+\b')
    return [
        token
        for sentence in sentences
        for token in token_pattern.findall(sentence)
    ]

# Function to filter words with only Latin characters
def is_latin(word):
    """Tell whether ``word`` consists solely of unaccented ASCII letters."""
    return re.match(r'^[A-Za-z]+$', word) is not None

# Tokenise the Arabizi comments and keep only the purely Latin tokens
all_words = extract_words(df_arabizi['comment'])
latin_words = list(filter(is_latin, all_words))

# One row per Latin token occurrence, together with its Soundex code
words_df = pd.DataFrame({'words': latin_words}).assign(
    soundex=lambda frame: frame['words'].apply(soundex)
)

# Function to group words using Levenshtein distance within a threshold
def group_words(df, threshold=4):
    """Cluster spelling variants that share a Soundex code.

    Within each Soundex bucket, the first not-yet-assigned word seeds a group
    and every later unassigned word whose Levenshtein distance to the seed is
    at most ``threshold`` joins it.

    Each returned group lists every member once per occurrence in ``df``, so
    a frequency count over the group reflects how often each spelling was
    actually used. (Previously repeated occurrences were dropped, which made
    any "most common spelling" vote meaningless.)

    Args:
        df: Frame with a 'words' column (one row per occurrence) and a
            'soundex' column.
        threshold: Maximum Levenshtein distance to the seed word.

    Returns:
        A list of groups, each a list of words.
    """
    groups = []
    for _, bucket in df.groupby('soundex'):
        # Counter keeps first-seen order and records how often each spelling occurs.
        frequencies = Counter(bucket['words'])
        distinct_words = list(frequencies)
        assigned = set()

        for position, seed in enumerate(distinct_words):
            if seed in assigned:
                continue
            assigned.add(seed)
            members = [seed]
            for candidate in distinct_words[position + 1:]:
                if candidate not in assigned and lev.distance(seed, candidate) <= threshold:
                    members.append(candidate)
                    assigned.add(candidate)

            # Expand each member back to its number of occurrences.
            expanded = []
            for member in members:
                expanded.extend([member] * frequencies[member])
            groups.append(expanded)
    return groups

# Group the Latin tokens into spelling-variant clusters, using group_words'
# default Levenshtein threshold
word_groups = group_words(words_df)

# Function to determine the replacement word for each group
def get_replacement_word(group):
    """Pick the canonical spelling for a group of variants.

    The most frequent word in ``group`` wins; ties are broken by taking the
    alphabetically smallest of the tied words.
    """
    tally = Counter(group)
    highest_count = tally.most_common(1)[0][1]
    return min(word for word, count in tally.items() if count == highest_count)

# Map every grouped word to the representative spelling chosen for its group
replacement_map = {}
for group in word_groups:
    replacement_word = get_replacement_word(group)
    replacement_map.update(dict.fromkeys(group, replacement_word))

# Function to replace words in a sentence using the replacement map
def replace_words_in_sentence(sentence, replacement_map):
    """Rebuild ``sentence`` from its word tokens, swapping mapped words.

    The sentence is split into ``\\w+`` tokens, each token found in
    ``replacement_map`` is replaced by its mapped value, and the tokens are
    joined with single spaces. Characters that are not part of a token are
    therefore dropped.
    """
    tokens = re.findall(r'\b\w+\b', sentence)
    substituted = [replacement_map.get(token, token) for token in tokens]
    return ' '.join(substituted)

# Rewrite every Arabizi comment with the canonical spellings
df_arabizi['comment'] = df_arabizi['comment'].apply(
    replace_words_in_sentence, args=(replacement_map,)
)



In [None]:
# Display the Arabizi subset after abbreviation expansion and spelling unification
df_arabizi

Unnamed: 0,comment,Hate speech,category
0,قهوه بشيح ڨعد صحيح لطيح مقودين هه,no,arabizi
1,نريد امثال هاذ الحثاله فالقنوات انت عنوان لاحت...,yes,arabizi
2,ندعمو الصحراء الغربيه ودزو معاهم نتوما دعمتو ا...,yes,arabizi
3,مكاش سيف وڨالك احكي علي شيما الله يرحمها حاجه ...,no,arabizi
4,ريحا تعدم الشعب الجزايري وجه الشر tfo ya bil f,yes,arabizi
...,...,...,...
679,بدون دوله مدنيه _ دوله القانون الديمقراطيه الح...,no,arabizi
680,عايرتي قولتيهم كامل ياك وكي طلبتي سماح قلتي دي...,yes,arabizi
681,الصندوق اكثر يزعج الاقليه العلمانيه فرنسا اكبر...,yes,arabizi
682,رداءه القناه تفاهه الضيفه الغير المحترمه الغلط...,yes,arabizi


#Translation and transliteration


In [None]:
# Single googletrans client, reused by translate_text for every call
translator = Translator()

# Function to translate text
def translate_text(text, src_lang='auto', dest_lang='ar'):
    """Translate ``text`` with the shared ``translator`` client (best effort).

    Args:
        text: The text to translate.
        src_lang: Source language code passed to the translator as ``src``.
        dest_lang: Target language code passed to the translator as ``dest``.

    Returns:
        The translated text. The input is returned unchanged when it is not a
        non-blank string (no request is made) or when the translation call
        raises; in the latter case a ``RuntimeWarning`` reports the failure
        instead of hiding it.
    """
    import warnings

    # Nothing to translate: skip the network round-trip entirely.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        return translator.translate(text, src=src_lang, dest=dest_lang).text
    except Exception as exc:
        # Keep the best-effort contract, but make the failure visible.
        warnings.warn(
            f"translate_text: translation failed, keeping original text ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )
        return text

# Function to transliterate Arabic text
def transliterate_text(arabic_text):
    """Transliterate ``arabic_text`` via ``transliterate_word`` (best effort).

    Args:
        arabic_text: The text to send to ``transliterate_word`` with
            ``lang_code='ar'``.

    Returns:
        The first suggestion returned by ``transliterate_word``. The input is
        returned unchanged when it is not a non-blank string (no request is
        made), when no suggestion comes back, or when the call raises; in the
        latter case a ``RuntimeWarning`` reports the failure instead of hiding
        it.
    """
    import warnings

    # Nothing to transliterate: skip the request entirely.
    if not isinstance(arabic_text, str) or not arabic_text.strip():
        return arabic_text

    try:
        suggestions = transliterate_word(arabic_text, lang_code='ar')
        # Return the first transliteration, or the original text if none found
        return suggestions[0] if suggestions else arabic_text
    except Exception as exc:
        # Keep the best-effort contract, but make the failure visible.
        warnings.warn(
            f"transliterate_text: transliteration failed, keeping original text ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )
        return arabic_text

# Function to translate and then transliterate text
def translate_and_transliterate(text):
    """Feed ``text`` through translate_text and then transliterate_text."""
    return transliterate_text(translate_text(text))

# Translate and transliterate both subsets (Arabizi first), then show the Arabizi one
for latin_subset in (df_arabizi, df_french_english):
    latin_subset['comment'] = latin_subset['comment'].apply(translate_and_transliterate)
df_arabizi

Request failed with status code: 200
ERROR: ["FAILED_TO_PARSE_REQUEST_BODY"]


Unnamed: 0,comment,Hate speech,category
0,قهوه بشيح ڨعد صحيح لطيح مقودين هه,no,arabizi
1,نريد امثال هاذ الحثاله فالقنوات انت عنوان لاحت...,yes,arabizi
2,ندعمو الصحراء الغربيه ودزو معاهم نتوما دعمتو ا...,yes,arabizi
3,مكاش سيف وڨالك احكي علي شيما الله يرحمها حاجه ...,no,arabizi
4,ريحا تعدم الشعب الجزايري وجه الشر تف يا بال ف,yes,arabizi
...,...,...,...
679,بدون دوله مدنيه دوله القانون الديمقراطيه الحق...,no,arabizi
680,عايرتي قولتيهم كامل ياك وكي طلبتي سماح قلتي دي...,yes,arabizi
681,الصندوق اكثر يزعج الاقليه العلمانيه فرنسا اكبر...,yes,arabizi
682,رداءه القناه تفاهه الضيفه الغير المحترمه الغلط...,yes,arabizi


In [None]:
# Display the French/English subset after translation and transliteration
df_french_english

Unnamed: 0,comment,Hate speech,category
0,بف حتى واحد هوس فتش هادي عند عند إنكمنتص برودك...,no,french_english
1,ادعى الجميع قرآن تلاوة,no,french_english
2,سيصوت الناس لذاك يقولون إن للجéري سيصوت لأول م...,yes,french_english
3,أه جود فم فم امبéكل نصت فريس لفريق سود دéفلبéإ...,no,french_english
4,بعد سنوات من عدم وجودها دائمًا لعدم وجود فرنسا...,no,french_english
...,...,...,...
372,بف دوك هادي نحطها فب تنيك لينةéرêط,yes,french_english
373,هاهاها فئة فريمنة ريري بوليتلجل فية رéبéتر كوم...,no,french_english
374,وا الله أدوم لم ير السياسيون الجزائريون المزيد...,yes,french_english
375,بلال ديفيس إنفت ست مول بلاك بلص ستوديوس دونك ب...,no,french_english
