In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URLs
urls = [
    "https://www.aljazeera.net",  # Exemple d'un site d'actualités en arabe
    "https://www.alriyadh.com",  # Exemple d'un site saoudien
]

# Initialize an empty list to store data
data = []

# Loop through each URL
for url in urls:
    try:
        # Temporarily bypass SSL verification
        response = requests.get(url, verify=False)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract paragraphs (you can modify this based on your needs)
            articles = soup.find_all('p')  # Example: Extract paragraphs
            for article in articles:
                text = article.get_text(strip=True)
                if text:  # Ensure the text is not empty
                    data.append(text)
        else:
            print(f"Failed to fetch {url}: HTTP {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")

# Save the data to a DataFrame
df = pd.DataFrame(data, columns=['Text'])

# Export the dataset to a CSV file
df.to_csv("arabic_dataset.csv", index=False, encoding='utf-8-sig')

print("Dataset saved as arabic_dataset.csv")




Dataset saved as arabic_dataset.csv


In [6]:
import pandas as pd

# Charger les données depuis le fichier CSV
df = pd.read_csv("arabic_dataset.csv")

# Afficher les premières lignes pour vérifier les données
print(df.head())


                                                Text
0  لم يكن علماء السلف على مذهب فقهي واحد في معامل...
1  تناولت حلقة (2024/12/3) من برنامج ” الاتجاه ال...
2  أفاد تحقيق لهآرتس، بأن موجة معاداة السامية الت...
3  اتهم مدير مستشفى كمال عدوان بشمال غزة الجيش ال...
4  قالت لوموند إن الحكومة السورية تدفع ثمن رفضها ...


In [19]:
import nltk

# Téléchargez les ressources nécessaires
nltk.download('punkt')        # Cela téléchargera les modèles de tokenisation de base
nltk.download('stopwords')    # Téléchargez les stopwords
nltk.download('punkt_tab')    # Téléchargez le modèle de tokenisation pour d'autres langues

# Vérifiez si le téléchargement est effectué correctement
print("Téléchargement réussi !")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DeLL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DeLL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DeLL\AppData\Roaming\nltk_data...


Téléchargement réussi !


[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [20]:
import nltk
from nltk.corpus import stopwords
import string

# Exemple de texte arabe
text = "الذكاء الاصطناعي هو مجال في علوم الكمبيوتر يهتم بتطوير الأنظمة الذكية."

# Tokenisation manuelle
tokens = text.split()

# Charger la liste des stopwords en arabe
nltk.download('stopwords')
stop_words = set(stopwords.words('arabic'))

# Filtrer les tokens en supprimant les stopwords
filtered_tokens = [word for word in tokens if word not in stop_words]
print("Tokens sans stopwords:", filtered_tokens)

# Fonction pour supprimer la ponctuation
def remove_punctuation(tokens):
    return [word for word in tokens if word not in string.punctuation]

# Supprimer la ponctuation des tokens
cleaned_tokens = remove_punctuation(filtered_tokens)

print("Tokens nettoyés:", cleaned_tokens)


Tokens sans stopwords: ['الذكاء', 'الاصطناعي', 'مجال', 'علوم', 'الكمبيوتر', 'يهتم', 'بتطوير', 'الأنظمة', 'الذكية.']
Tokens nettoyés: ['الذكاء', 'الاصطناعي', 'مجال', 'علوم', 'الكمبيوتر', 'يهتم', 'بتطوير', 'الأنظمة', 'الذكية.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DeLL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
from nltk.stem import PorterStemmer

# Initialiser le stemmer
stemmer = PorterStemmer()

# Appliquer le stemming
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Tokens après stemming:", stemmed_tokens)


Tokens après stemming: ['الذكاء', 'الاصطناعي', 'مجال', 'علوم', 'الكمبيوتر', 'يهتم', 'بتطوير', 'الأنظمة', 'الذكية.']


In [22]:
from nltk.stem import WordNetLemmatizer

# Initialiser le lemmatiseur
lemmatizer = WordNetLemmatizer()

# Appliquer la lemmatisation
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Tokens après lemmatisation:", lemmatized_tokens)


Tokens après lemmatisation: ['الذكاء', 'الاصطناعي', 'مجال', 'علوم', 'الكمبيوتر', 'يهتم', 'بتطوير', 'الأنظمة', 'الذكية.']


In [23]:
from sklearn.preprocessing import KBinsDiscretizer

# Exemple de données numériques continues
data = [[1], [2], [3], [4], [5]]

# Initialiser le discrétiseur
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')

# Appliquer la discrétisation
discretized_data = discretizer.fit_transform(data)
print("Données discrétisées:", discretized_data)


Données discrétisées: [[0.]
 [0.]
 [1.]
 [2.]
 [2.]]


In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
import numpy as np

# Exemple de données textuelles
texts = ["Exemple de texte pour la tokenisation", "Un autre exemple de texte"]

# 1. Créer un tokenizer pour transformer les mots en indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# 2. Définir vocab_size : le nombre total de mots uniques
vocab_size = len(tokenizer.word_index) + 1  # +1 pour inclure le token de padding

# 3. Convertir les textes en séquences d'indices
sequences = tokenizer.texts_to_sequences(texts)

# 4. Définir max_sequence_length : la longueur maximale des séquences (par exemple, 10)
max_sequence_length = max(len(seq) for seq in sequences)

# 5. Appliquer du padding pour avoir des séquences de même longueur
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# 6. Définir les étiquettes (par exemple, classification binaire)
y_train = np.array([1, 0])  # Exemple d'étiquettes binaires pour deux exemples
y_val = np.array([1, 0])  # Exemple d'étiquettes binaires pour la validation

# Afficher vocab_size et max_sequence_length
print("Vocab Size:", vocab_size)
print("Max Sequence Length:", max_sequence_length)

# Créer et entraîner le modèle RNN
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
model_rnn.add(SimpleRNN(64, return_sequences=False))
model_rnn.add(Dense(1, activation='sigmoid'))  # Si c'est un problème de classification binaire

# Compiler et entraîner le modèle
model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_rnn.fit(padded_sequences, y_train, epochs=5, batch_size=32, validation_data=(padded_sequences, y_val))


Vocab Size: 9
Max Sequence Length: 6
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 1.0000 - loss: 0.6366 - val_accuracy: 1.0000 - val_loss: 0.5538
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - accuracy: 1.0000 - loss: 0.5538 - val_accuracy: 1.0000 - val_loss: 0.4777
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 1.0000 - loss: 0.4777 - val_accuracy: 1.0000 - val_loss: 0.4073
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 1.0000 - loss: 0.4073 - val_accuracy: 1.0000 - val_loss: 0.3423
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - accuracy: 1.0000 - loss: 0.3423 - val_accuracy: 1.0000 - val_loss: 0.2828


<keras.src.callbacks.history.History at 0x1f2bc6e7190>

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, SimpleRNN, Dense, Embedding

# ===============================
# 2. MODELE BIDIRECTIONNEL RNN
# ===============================
model_bidir_rnn = Sequential()
model_bidir_rnn.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
model_bidir_rnn.add(Bidirectional(SimpleRNN(64, return_sequences=False)))
model_bidir_rnn.add(Dense(1, activation='sigmoid'))

model_bidir_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("\nTraining Bidirectional RNN model:")
model_bidir_rnn.fit(padded_sequences, y_train, epochs=5, batch_size=32, validation_data=(padded_sequences, y_val))



Training Bidirectional RNN model:
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 1.0000 - loss: 0.6695 - val_accuracy: 1.0000 - val_loss: 0.5597
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 261ms/step - accuracy: 1.0000 - loss: 0.5597 - val_accuracy: 1.0000 - val_loss: 0.4634
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 1.0000 - loss: 0.4634 - val_accuracy: 1.0000 - val_loss: 0.3785
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 1.0000 - loss: 0.3785 - val_accuracy: 1.0000 - val_loss: 0.3041
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 1.0000 - loss: 0.3041 - val_accuracy: 1.0000 - val_loss: 0.2396


<keras.src.callbacks.history.History at 0x1f2bc60f5d0>

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding

# ===============================
# 3. MODELE GRU
# ===============================
model_gru = Sequential()
model_gru.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
model_gru.add(GRU(64, return_sequences=False))
model_gru.add(Dense(1, activation='sigmoid'))

model_gru.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("\nTraining GRU model:")
model_gru.fit(padded_sequences, y_train, epochs=5, batch_size=32, validation_data=(padded_sequences, y_val))



Training GRU model:
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.5000 - loss: 0.6989 - val_accuracy: 1.0000 - val_loss: 0.6860
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 1.0000 - loss: 0.6860 - val_accuracy: 1.0000 - val_loss: 0.6733
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 1.0000 - loss: 0.6733 - val_accuracy: 1.0000 - val_loss: 0.6607
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 1.0000 - loss: 0.6607 - val_accuracy: 1.0000 - val_loss: 0.6479
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 1.0000 - loss: 0.6479 - val_accuracy: 1.0000 - val_loss: 0.6349


<keras.src.callbacks.history.History at 0x1f2c00efb10>

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# ===============================
# 4. MODELE LSTM
# ===============================
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
model_lstm.add(LSTM(64, return_sequences=False))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("\nTraining LSTM model:")
model_lstm.fit(padded_sequences, y_train, epochs=5, batch_size=32, validation_data=(padded_sequences, y_val))



Training LSTM model:
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 0.6958 - val_accuracy: 1.0000 - val_loss: 0.6898
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 1.0000 - loss: 0.6898 - val_accuracy: 1.0000 - val_loss: 0.6837
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 1.0000 - loss: 0.6837 - val_accuracy: 1.0000 - val_loss: 0.6775
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - accuracy: 1.0000 - loss: 0.6775 - val_accuracy: 1.0000 - val_loss: 0.6711
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - accuracy: 1.0000 - loss: 0.6711 - val_accuracy: 1.0000 - val_loss: 0.6645


<keras.src.callbacks.history.History at 0x1f2c519e8d0>

In [40]:
from sklearn.metrics import classification_report
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
