In [73]:
import requests
from bs4 import BeautifulSoup

# Function to scrape text from a website
def scrape_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Modify this based on the structure of the webpage to extract relevant text
    text = ' '.join([p.text for p in soup.find_all('p')])
    return text

# Define URLs of Arabic websites related to your topic
urls = [
    'https://alkhalilarabic.com/',
    'https://guidetoarabic.net/ar'
]

# Define a function to calculate text score based on some criteria
def calculate_text_score(text):
    return len(text) / 1000  # Just a simple scaling factor for demonstration

# Create a dictionary to store text and corresponding scores
data = {}
texts =[]
scores =[]

# Scrape text from each URL and calculate the score
for url in urls:
    text = scrape_text(url)
    texts.append(text)
    score = calculate_text_score(text)
    scores.append(score)
    data[url] = {'text': text, 'score': score}

# Print the data
for url, info in data.items():
    print(f"URL: {url}")
    print(f"Text:\n{info['text']}")
    print(f"Score: {info['score']}\n")
    
print(texts)
print(scores)

URL: https://alkhalilarabic.com/
Text:
الخليل لتعليم اللغة العربية اكتسب المهارات اللغوية وتعرف على الثقافة العربية لتفتح عالمك مع منصة الخليل. تعلم العربية في دورات عن بعد على أيدي خبراء انطلق في تعلم العربية وصقل مهاراتك اللغوية في القراءة والكتابة والمحادثة والاستماع من خلال الدورات الأساسية على منصة الخليل تعلم اللغة العربية من الخبراء في اللغة الناطقين بالعربية، وانغمس في ثقافتها، وتعرف على قيمها وحضارتها. تعلم اللغة العربية باحترافية وطوّر مهاراتك اللغوية عبر منصتنا التعليمية المتطورة وأنت في أي موقع من العالم. بإمكانك ممارسة وتطوير مهارة المحادثة لساعات إضافية مع أفضل المدرسين العرب، وفي الأوقات التي تناسبك. اكتسب مهارات لغوية جديدة ترتبط بمجال عملك أو دراستك أو اهتمامك اللغوي في المجالات الدبلوماسية أو الإعلامية أو الأكاديمية أو غيرها أحد مشاريع شركة مؤتلف (إبانة)
Score: 0.743

URL: https://guidetoarabic.net/ar
Text:
اللغة العربية هي لغة القرآن الكريم، وهي وسيلة لفهم النصوص الشرعية، والاستنباط الصحيح من النصوص، وتعلم اللغة العربية واجب على المسلمين لفهم القرآن الكريم والسنة الن

In [74]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from torch.nn.utils.rnn import pad_sequence

# Define your RNN-based model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out, _ = self.rnn(x)
        # Ensure that the output has the correct shape
        if len(out.shape) == 3:
            # Take the last output of the sequence
            out = out[:, -1, :]
        else:
            # Reshape the output if it's 2-dimensional
            out = out.squeeze(1)
        out = self.fc(out)
        return out


# Define your dataset class
class ArabicTextDataset(Dataset):
    def __init__(self, texts, scores):
        self.texts = texts
        self.scores = scores
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.scores[idx]

# Preprocessing functions
def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('arabic'))
    return [word for word in tokens if word.lower() not in stop_words]

# Preprocessing pipeline
processed_texts = []
for text in texts:
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    processed_texts.append(tokens)

# Convert text to numerical representation (dummy representation)
word_to_idx = {word: idx for idx, word in enumerate(set(np.hstack(processed_texts)))}
numerical_texts = [[word_to_idx[word] for word in text] for text in processed_texts]

print(word_to_idx)

# Pad sequences to ensure consistent length
padded_sequences = pad_sequence([torch.tensor(text) for text in numerical_texts], batch_first=True, padding_value=0)
# Ensure all sequences have the same length (pad to length 10)
padded_sequences = nn.functional.pad(padded_sequences, (0, 184 - padded_sequences.size(1)))

# Convert to PyTorch tensors
inputs = padded_sequences.float()
targets = torch.tensor(scores, dtype=torch.float32)

# Define input_size after padding
input_size = len(word_to_idx)

# Split data into train and test sets
train_inputs, test_inputs, train_targets, test_targets = train_test_split(inputs, targets, test_size=0.2, random_state=42)

# Define hyperparameters
hidden_size = 128
output_size = 1
learning_rate = 0.001
num_epochs = 10
batch_size = 1

# Create DataLoader
train_dataset = ArabicTextDataset(train_inputs, train_targets)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize model, loss function, and optimizer
model = RNNModel(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
with torch.no_grad():
    test_outputs = model(test_inputs)
    # Reshape the target tensor to match the output tensor size
    test_targets_resized = test_targets.view(-1, 1)
    test_loss = criterion(test_outputs, test_targets_resized)
    print(f'Test Loss: {test_loss.item():.4f}')


{'الخبراء': 0, 'وأفكار،': 1, 'الجامعات': 2, 'الدورات': 3, 'العالم،': 4, 'النبوية،': 5, 'وطرق': 6, 'اللغوي': 7, 'النصوص،': 8, 'الاحتياج': 9, 'للعاملين': 10, 'أفضل': 11, 'مؤتلف': 12, 'والاستماع': 13, 'التعليمية': 14, 'الإعجاز،': 15, 'الأسباب': 16, 'الدليل': 17, 'ممارسة': 18, 'وعلم': 19, 'دلائل': 20, 'أقرب': 21, 'يتعلق': 22, 'المجال،': 23, 'عالمك': 24, 'العرب،': 25, 'بد': 26, 'الأكاديمية': 27, 'اللغة': 28, 'عملك': 29, 'الإعلامية': 30, 'المشكلات': 31, 'ترشيد': 32, 'ومعاهد': 33, 'والطُّرق': 34, 'لتعلمها': 35, 'لساعات': 36, '(': 37, 'ومكانتها': 38, 'وعناوين': 39, 'القراءة': 40, 'لتفتح': 41, 'إبانة': 42, 'مهاراتك': 43, 'تظهر': 44, 'معها': 45, 'البيان': 46, 'العلوم': 47, 'دراستك': 48, 'والسنة': 49, 'ترتبط': 50, 'دراسة': 51, 'نوفِّر': 52, 'أصول': 53, 'المتَّبعة': 54, 'وانغمس': 55, 'لخدمات': 56, 'الجميع': 57, 'تدعوك': 58, 'إضاءات': 59, 'وتطوير': 60, 'كيفية': 61, 'الشرعية،': 62, 'أعمالهم،': 63, 'الرقمية': 64, 'دليل': 65, 'أيدي': 66, 'والوقوف': 67, 'إضافية': 68, 'القسم': 69, 'والمحادثة': 70, 'عبر'

  return F.mse_loss(input, target, reduction=self.reduction)


In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from torch.nn.utils.rnn import pad_sequence

# Define your Bidirectional RNN-based model with GRU
class BiGRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BiGRUModel, self).__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # multiplied by 2 because of bidirectional GRU
    
    def forward(self, x):
        out, _ = self.rnn(x)
        # Ensure that the output has the correct shape
        if len(out.shape) == 3:
            # Take the last output of the sequence
            out = out[:, -1, :]
        else:
            # Reshape the output if it's 2-dimensional
            out = out.squeeze(1)
        out = self.fc(out)
        return out


# Define your dataset class
class ArabicTextDataset(Dataset):
    def __init__(self, texts, scores):
        self.texts = texts
        self.scores = scores
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.scores[idx]

# Preprocessing functions
def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('arabic'))
    return [word for word in tokens if word.lower() not in stop_words]

# Preprocessing pipeline
processed_texts = []
for text in texts:
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    processed_texts.append(tokens)

# Convert text to numerical representation (dummy representation)
word_to_idx = {word: idx for idx, word in enumerate(set(np.hstack(processed_texts)))}
numerical_texts = [[word_to_idx[word] for word in text] for text in processed_texts]

# Pad sequences to ensure consistent length
padded_sequences = pad_sequence([torch.tensor(text) for text in numerical_texts], batch_first=True, padding_value=0)
# Ensure all sequences have the same length (pad to length 10)
padded_sequences = nn.functional.pad(padded_sequences, (0, 184 - padded_sequences.size(1)))

# Convert to PyTorch tensors
inputs = padded_sequences.float()
targets = torch.tensor(scores, dtype=torch.float32)

# Define input_size after padding
input_size = len(word_to_idx)

# Split data into train and test sets
train_inputs, test_inputs, train_targets, test_targets = train_test_split(inputs, targets, test_size=0.2, random_state=42)

# Define hyperparameters
hidden_size = 128
output_size = 1
learning_rate = 0.001
num_epochs = 10
batch_size = 1

# Create DataLoader
train_dataset = ArabicTextDataset(train_inputs, train_targets)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize model, loss function, and optimizer
model = BiGRUModel(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
with torch.no_grad():
    test_outputs = model(test_inputs)
    # Reshape the target tensor to match the output tensor size
    test_targets_resized = test_targets.view(-1, 1)
    test_loss = criterion(test_outputs, test_targets_resized)
    print(f'Test Loss: {test_loss.item():.4f}')

Epoch [1/10], Loss: 0.9155
Epoch [2/10], Loss: 0.0563
Epoch [3/10], Loss: 0.1458
Epoch [4/10], Loss: 0.0191
Epoch [5/10], Loss: 0.0189
Epoch [6/10], Loss: 0.0204
Epoch [7/10], Loss: 0.0191
Epoch [8/10], Loss: 0.0159
Epoch [9/10], Loss: 0.0116
Epoch [10/10], Loss: 0.0073
Test Loss: 1.6315


In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from torch.nn.utils.rnn import pad_sequence

# Define your Bidirectional RNN-based model with LSTM
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BiLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # multiplied by 2 because of bidirectional LSTM
    
    def forward(self, x):
        out, _ = self.lstm(x)
        # Ensure that the output has the correct shape
        if len(out.shape) == 3:
            # Take the last output of the sequence
            out = out[:, -1, :]
        else:
            # Reshape the output if it's 2-dimensional
            out = out.squeeze(1)
        out = self.fc(out)
        return out


# Define your dataset class
class ArabicTextDataset(Dataset):
    def __init__(self, texts, scores):
        self.texts = texts
        self.scores = scores
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.scores[idx]

# Preprocessing functions
def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('arabic'))
    return [word for word in tokens if word.lower() not in stop_words]

# Preprocessing pipeline
processed_texts = []
for text in texts:
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    processed_texts.append(tokens)

# Convert text to numerical representation (dummy representation)
word_to_idx = {word: idx for idx, word in enumerate(set(np.hstack(processed_texts)))}
numerical_texts = [[word_to_idx[word] for word in text] for text in processed_texts]

# Pad sequences to ensure consistent length
padded_sequences = pad_sequence([torch.tensor(text) for text in numerical_texts], batch_first=True, padding_value=0)
# Ensure all sequences have the same length (pad to length 10)
padded_sequences = nn.functional.pad(padded_sequences, (0, 184 - padded_sequences.size(1)))

# Convert to PyTorch tensors
inputs = padded_sequences.float()
targets = torch.tensor(scores, dtype=torch.float32)

# Define input_size after padding
input_size = len(word_to_idx)

# Split data into train and test sets
train_inputs, test_inputs, train_targets, test_targets = train_test_split(inputs, targets, test_size=0.2, random_state=42)

# Define hyperparameters
hidden_size = 128
output_size = 1
learning_rate = 0.001
num_epochs = 10
batch_size = 1

# Create DataLoader
train_dataset = ArabicTextDataset(train_inputs, train_targets)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize model, loss function, and optimizer
model = BiLSTMModel(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
with torch.no_grad():
    test_outputs = model(test_inputs)
    # Reshape the target tensor to match the output tensor size
    test_targets_resized = test_targets.view(-1, 1)
    test_loss = criterion(test_outputs, test_targets_resized)
    print(f'Test Loss: {test_loss.item():.4f}')

Epoch [1/10], Loss: 1.6645
Epoch [2/10], Loss: 0.2008
Epoch [3/10], Loss: 0.0281
Epoch [4/10], Loss: 0.0001
Epoch [5/10], Loss: 0.0015
Epoch [6/10], Loss: 0.0038
Epoch [7/10], Loss: 0.0061
Epoch [8/10], Loss: 0.0082
Epoch [9/10], Loss: 0.0096
Epoch [10/10], Loss: 0.0104
Test Loss: 2.0562
