# <span style="color:#FF8888;">🚀 Enhancing Graph-Based Arabic Extractive Text Summarization</span> using <span style="color:#1E90FF;">Semantic</span> and <span style="color:#32CD32;">Statistical</span> Features

# 📥 Install Libraries

In [1]:
!pip install camel-tools
!pip install PyArabic
!pip install KeyBERT
!pip install transformers

Collecting camel-tools
  Downloading camel_tools-1.5.6-py3-none-any.whl.metadata (10 kB)
Collecting docopt (from camel-tools)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<4.44.0,>=4.0 (from camel-tools)
  Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyrsistent (from camel-tools)
  Downloading pyrsistent-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting muddler (from camel-tools)
  Downloading muddler-0.1.3-py3-none-any.whl.metadata (7.5 kB)
Collecting camel-kenlm>=2025.4.8 (from camel-tools)
  Downloading camel-kenlm-2025.4.8.zip (556 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.5/556.5 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  G

# 📚 Import Libraries

In [2]:
import os
import json
import re
import nltk
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
import math
from collections import defaultdict
import pyarabic.araby as araby
import torch
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm import tqdm
import logging
import warnings

# 📂 Dataset Path

In [3]:
data_path = "/kaggle/input/escs-dataset"
# Ensure the NLTK stopwords are downloaded
try:
    arabic_stopwords = set(stopwords.words("arabic"))
except:
    nltk.download("stopwords")
    arabic_stopwords = set(stopwords.words("arabic"))

# Output folders
original_sentences_folder = "original_sentences"
preprocessed_dl_folder = "preprocessed_dl"
preprocessed_classical_folder = "preprocessed_classical"

# Ensure directories exist
os.makedirs(original_sentences_folder, exist_ok=True)
os.makedirs(preprocessed_dl_folder, exist_ok=True)
os.makedirs(preprocessed_classical_folder, exist_ok=True)


# 📊 Load the Data

In [4]:
# Load all files
texts = []
file_names = sorted(os.listdir(data_path))  # Ensure correct file order

for file_name in file_names:
    file_path = os.path.join(data_path, file_name)
    with open(file_path, "r", encoding="utf-8") as file:
        texts.append((file_name, file.read()))  # Store filename and content together

print(f"Loaded {len(texts)} documents.")

Loaded 153 documents.


# 🔧 Stemming and Normalization Initalization

In [5]:
def ISRI_Stemmer(text):
    #making an object
    stemmer = ISRIStemmer()
    
    #stemming each word
    text = stemmer.stem(text)
    text = stemmer.pre32(text)
    text = stemmer.suf32(text)
    
    return text

# Arabic normalization function
def normalize_arabic(text):
    text = text.strip()
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    
    #remove repetetions
    text = re.sub("[إأٱآا]", "ا", text)
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ى')
    text = text.replace('ييي', 'ى')
    text = text.replace('اا', 'ا')

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    
    # Remove longation
    text = re.sub(r'(.)\1+', r"\1\1", text) 
    
    #Strip vowels from a text, include Shadda.
    text = araby.strip_tashkeel(text)
    
    #Strip diacritics from a text, include harakats and small lettres The striped marks are
    text = araby.strip_diacritics(text)
    return text

# ⚙️ Pre-Processing Step

In [6]:
# Main processing loop
for file_name, text in tqdm(texts, total=len(texts), desc="Processing documents"):
    paragraphs = text.split("\n")  # Paragraph segmentation
    
    # Filter out empty paragraphs
    paragraphs = [p.strip() for p in paragraphs if p.strip()]
    
    original_sentences = {}
    preprocessed_dl_sentences = {}
    preprocessed_classical_sentences = {}
    
    for p_idx, paragraph in enumerate(paragraphs):
        sentences = re.split(r"[.?!]", paragraph)  # Sentence segmentation
        sentences = [s.strip() for s in sentences if s.strip()]  # Remove empty sentences
        
        # Skip paragraphs with no valid sentences
        if not sentences:
            continue
            
        original_sentences[str(p_idx)] = {str(s_idx): s for s_idx, s in enumerate(sentences)}
        
        # Normalize sentences for DL processing
        normalized_sentences = {str(s_idx): normalize_arabic(s) for s_idx, s in enumerate(sentences)}
        preprocessed_dl_sentences[str(p_idx)] = normalized_sentences
        
        # Remove stop words and apply stemming for classical representation
        classical_sentences = {
            str(s_idx): " ".join(
                [ISRI_Stemmer(word) for word in s.split() if word not in arabic_stopwords]
            )
            for s_idx, s in normalized_sentences.items()
        }
        preprocessed_classical_sentences[str(p_idx)] = classical_sentences
    
    # Skip files with no valid content
    if not original_sentences:
        print(f"⚠️ Skipping empty file: {file_name}")
        continue
        
    # Save outputs with the same name as the original file, but with .json extension
    base_filename = file_name.replace(".txt", ".json")  # Replace .txt with .json
    
    with open(os.path.join(original_sentences_folder, base_filename), "w", encoding="utf-8") as f:
        json.dump(original_sentences, f, ensure_ascii=False, indent=4)
    with open(os.path.join(preprocessed_dl_folder, base_filename), "w", encoding="utf-8") as f:
        json.dump(preprocessed_dl_sentences, f, ensure_ascii=False, indent=4)
    with open(os.path.join(preprocessed_classical_folder, base_filename), "w", encoding="utf-8") as f:
        json.dump(preprocessed_classical_sentences, f, ensure_ascii=False, indent=4)

print("Preprocessing completed successfully!")

# Print the content of file1.json after preprocessing
file_to_print = "file54.json"

# Load and print DL-preprocessed version
with open(os.path.join(preprocessed_dl_folder, file_to_print), "r", encoding="utf-8") as f:
    dl_data = json.load(f)
    print("\n📘 Preprocessed for Deep Learning (AraBERT) - " + file_to_print)
    print(json.dumps(dl_data, ensure_ascii=False, indent=4))

# Load and print Classical-preprocessed version
with open(os.path.join(preprocessed_classical_folder, file_to_print), "r", encoding="utf-8") as f:
    classical_data = json.load(f)
    print("\n📗 Preprocessed for Classical Representation (TF-IDF etc.) - " + file_to_print)
    print(json.dumps(classical_data, ensure_ascii=False, indent=4))

Processing documents: 100%|██████████| 153/153 [00:00<00:00, 156.87it/s]

Preprocessing completed successfully!

📘 Preprocessed for Deep Learning (AraBERT) - file54.json
{
    "0": {
        "0": "ارتفاع الدولار"
    },
    "1": {
        "0": "ارتفع الدولار امس بعد ان اظهر تقرير انخفاضا غير متوقع في العجز التجاري الامريكي في شهر مارس"
    },
    "2": {
        "0": "وتقلص العجز الي 54",
        "1": "99 مليار دولار في مارس من الرقم المعدل في فبراير وهو 60",
        "2": "57 مليار دولار وجاء اقل بكثير من توقعات الاقتصادىن بعجز حجمه 61",
        "3": "5 مليار دولار"
    },
    "3": {
        "0": "وعلي مدي السنوات الثلاث الماضيه تقريبا ظل العجز التجاري مصدر ضغط كبير علي الدولار",
        "1": "واذا واصل الامريكيون شراء بضاءع اجنبيه بمعدل اسرع من قدره الشركات الامريكيه علي بيع بضاءعها وخدماتها في الخارج فستظل حركه الدولار الي الخارج قويه مما يفرض ضغوطا علي العمله"
    },
    "4": {
        "0": "وهبط اليورو الي ادني مستويات الجلسه عند 1",
        "1": "2821 دولار بانخفاض حاد عن 1",
        "2": "2875 دولار قبل صدور البيانات بقليل وبانخفاض 0",
        "3": "4% 




# 🗝️ Keyphrase Extraction

In [7]:
# Suppress verbose output
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Suppress tokenizer warnings
warnings.filterwarnings('ignore')  # Suppress general warnings
logging.getLogger("transformers").setLevel(logging.ERROR)  # Suppress transformers logs

# Path to preprocessed files
preprocessed_folder = "/kaggle/working/preprocessed_dl"
output_folder = "/kaggle/working/sentence_scores"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load AraBERT model silently
with tqdm(total=1, desc="Loading AraBERT model", leave=False) as pbar:
    model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02", 
                                     return_dict=True)
    kw_extractor = KeyBERT(model)
    pbar.update(1)

# Get all JSON files in the folder
json_files = [f for f in os.listdir(preprocessed_folder) if f.endswith('.json')]

# Process each file
for json_file in tqdm(json_files, desc="Processing files", leave=True):
    file_path = os.path.join(preprocessed_folder, json_file)
    
    # Load preprocessed sentences
    with open(file_path, "r", encoding="utf-8") as f:
        preprocessed_data = json.load(f)
    
    # Flatten sentences while keeping original IDs
    # Skip the first paragraph (title)
    flattened_sentences = {}
    for para_index, sentences in preprocessed_data.items():
        # Skip paragraph 0 (title)
        if para_index == "0":
            continue
            
        for sent_index, sentence in sentences.items():
            sentence_id = f"P{para_index}-S{sent_index}"
            flattened_sentences[sentence_id] = sentence
    
    # Total number of sentences (N) in this document
    N = len(flattened_sentences)
    if N == 0:
        continue
    
    # Calculate sentence scores directly from KeyBERT scores
    sentence_scores = {}
    for sent_id, sentence in flattened_sentences.items():
        if not sentence or len(sentence.strip()) == 0:
            sentence_scores[sent_id] = 0
            continue
            
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            keywords = kw_extractor.extract_keywords(
                sentence, 
                keyphrase_ngram_range=(1, 3), 
                top_n=5, 
                use_mmr=True, 
                diversity=0.7
            )
        
        # Apply threshold of 0.5 to remove weak keyphrases
        filtered_keywords = [kw for kw in keywords if kw[1] >= 0.5]
        
        # Sum the scores of filtered keyphrases
        score = sum(kw[1] for kw in filtered_keywords)
        
        sentence_scores[sent_id] = score
    
    # Normalize scores for this document
    if sentence_scores:
        max_score = max(sentence_scores.values())
        min_score = min(sentence_scores.values())
        
        # Avoid division by zero during normalization
        score_range = max_score - min_score
        normalized_scores = {}
        
        if score_range > 0:
            for sent_id, score in sentence_scores.items():
                normalized_scores[sent_id] = (score - min_score) / score_range
        else:
            # If all scores are the same, assign a default normalized value
            for sent_id in sentence_scores:
                normalized_scores[sent_id] = 0.5 if max_score > 0 else 0
    else:
        normalized_scores = {}
    
    # Prepare final result with only normalized scores, keeping paragraph/sentence structure
    results = {}
    for para_index, sentences in preprocessed_data.items():
        # Skip paragraph 0 (title)
        if para_index == "0":
            continue
            
        results[para_index] = {}
        for sent_index, _ in sentences.items():
            sent_id = f"P{para_index}-S{sent_index}"
            results[para_index][sent_index] = normalized_scores.get(sent_id, 0)
    
    # Save results to output folder
    output_file = os.path.join(output_folder, json_file)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

print("✅ Processing completed. Normalized scores saved to:", output_folder)
# Print the content of file1.json after processing
file1_path = os.path.join(output_folder, "file1.json")
if os.path.exists(file1_path):
    with open(file1_path, "r", encoding="utf-8") as f:
        file1_content = json.load(f)
    print("📄 Content of file1.json after processing:")
    print(json.dumps(file1_content, ensure_ascii=False, indent=2))
else:
    print("⚠️ file1.json not found in output folder.")

Loading AraBERT model:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing files: 100%|██████████| 153/153 [12:39<00:00,  4.97s/it] 

✅ Processing completed. Normalized scores saved to: /kaggle/working/sentence_scores
📄 Content of file1.json after processing:
{
  "1": {
    "0": 0.022355814390381354,
    "1": 0.39531279353747883,
    "2": 0.09952094683449182,
    "3": 0.3387187676122488
  },
  "2": {
    "0": 0.7082941950028179,
    "1": 0.69007138831486
  },
  "3": {
    "0": 0.03386248356190121,
    "1": 1.0,
    "2": 0.30922412173586317
  },
  "4": {
    "0": 0.04086041705804996,
    "1": 0.10046026676686079,
    "2": 0.11370467781326318,
    "3": 0.02977644185609619,
    "4": 0.7734360323126058,
    "5": 0.038793913206838254
  },
  "5": {
    "0": 0.4143809881645689,
    "1": 0.04936126244598909,
    "2": 0.0,
    "3": 0.29978395641555516,
    "4": 0.3916964117978584,
    "5": 0.36408040578621076,
    "6": 0.15015029118917902
  },
  "6": {
    "0": 0.0015498778884087778,
    "1": 0.3529964305842569,
    "2": 0.02174525643434154,
    "3": 0.036774375352244995,
    "4": 0.09247604734172457,
    "5": 0.4743095998497




# 📏 Sentence Length Score

In [8]:
def calculate_entropy(text):
    """Calculate Shannon entropy of a text string"""
    # Count the frequency of each character
    chars = {}
    for char in text:
        if char in chars:
            chars[char] += 1
        else:
            chars[char] = 1
            
    # Calculate entropy
    length = len(text)
    entropy = 0
    for count in chars.values():
        probability = count / length
        entropy -= probability * math.log2(probability)
    
    return entropy

def calculate_sentence_length_scores(file_path):
    """Calculate sentence length scores for a specific file"""
    with open(file_path, "r", encoding="utf-8") as f:
        preprocessed_data = json.load(f)
    
    # Find the longest sentence in the file (excluding title)
    max_word_count = 0
    for para_index, sentences in preprocessed_data.items():
        # Skip paragraph 0 (title)
        if para_index == "0":
            continue
            
        for _, sentence in sentences.items():
            word_count = len(sentence.split())
            max_word_count = max(max_word_count, word_count)
    
    # Calculate sentence length scores
    paragraph_scores = {}
    
    # Store all scores for normalization
    all_scores = []
    
    for para_index, sentences in preprocessed_data.items():
        # Skip paragraph 0 (title)
        if para_index == "0":
            continue
            
        sentence_scores = {}
        for sent_index, sentence in sentences.items():
            if not sentence or len(sentence.strip()) == 0:
                sentence_scores[sent_index] = 0
                continue
                
            word_count = len(sentence.split())
            entropy = calculate_entropy(sentence)
            
            # Calculate length score = (word count / max word count) * entropy
            length_score = (word_count / max_word_count) * entropy if max_word_count > 0 else 0
            
            sentence_scores[sent_index] = length_score
            all_scores.append(length_score)
            
        paragraph_scores[para_index] = sentence_scores
    
    # Normalize scores within the file
    if all_scores:
        min_score = min(all_scores)
        max_score = max(all_scores)
        score_range = max_score - min_score
        
        # Normalize each score
        if score_range > 0:
            for para_index in paragraph_scores:
                for sent_index in paragraph_scores[para_index]:
                    old_score = paragraph_scores[para_index][sent_index]
                    normalized_score = (old_score - min_score) / score_range
                    paragraph_scores[para_index][sent_index] = normalized_score
        else:
            # If all scores are the same, assign a default normalized value
            default_value = 0.5 if max_score > 0 else 0
            for para_index in paragraph_scores:
                for sent_index in paragraph_scores[para_index]:
                    paragraph_scores[para_index][sent_index] = default_value
    
    return paragraph_scores

# Process all files in the directory
preprocessed_folder = "/kaggle/working/preprocessed_classical"
json_files = [f for f in os.listdir(preprocessed_folder) if f.endswith('.json')]

# Store all results in this dictionary
all_length_scores = {}

# Process each file
for json_file in tqdm(json_files, desc="Calculating sentence length scores"):
    file_path = os.path.join(preprocessed_folder, json_file)
    
    # Calculate length scores for this file
    scores = calculate_sentence_length_scores(file_path)
    
    # Store results in dictionary
    all_length_scores[json_file] = scores

print(f"✅ Completed sentence length scoring for {len(json_files)} files")
print(f"Dictionary structure: {len(all_length_scores)} files with sentence length scores")

# Print sentence length scores for file1.json if it exists
file_name = "file1.json"
if file_name in all_length_scores:
   print(f"\n📂 Sentence length scores for {file_name}:")
   print(json.dumps(all_length_scores[file_name], indent=4, ensure_ascii=False))
else:
   print(f"\n⚠️ {file_name} not found in all_length_scores.")

Calculating sentence length scores: 100%|██████████| 153/153 [00:00<00:00, 1980.99it/s]

✅ Completed sentence length scoring for 153 files
Dictionary structure: 153 files with sentence length scores

📂 Sentence length scores for file1.json:
{
    "1": {
        "0": 0.15496433742876028,
        "1": 0.1012784645871125,
        "2": 0.003521049352458828,
        "3": 0.0298484974453274
    },
    "2": {
        "0": 0.20462843473223988,
        "1": 0.017562921583262454
    },
    "3": {
        "0": 0.24771776956673425,
        "1": 0.0017208755876479136,
        "2": 0.0
    },
    "4": {
        "0": 0.33733690638358843,
        "1": 0.21672022092956875,
        "2": 0.22566138069294261,
        "3": 1.0,
        "4": 0.128471952189861,
        "5": 0.1412206641517197
    },
    "5": {
        "0": 0.32707616210297885,
        "1": 0.4378944863193773,
        "2": 0.0702149055094712,
        "3": 0.29907952062884097,
        "4": 0.11482904026653701,
        "5": 0.11933947984560252,
        "6": 0.07604993227117007
    },
    "6": {
        "0": 0.502609108543862,
     




# 📍 Sentence Location Score

In [9]:
def calculate_sentence_location_scores(file_path):
    """Calculate sentence location scores for a specific file"""
    with open(file_path, "r", encoding="utf-8") as f:
        preprocessed_data = json.load(f)
    
    paragraph_scores = {}
    all_scores = []  # To collect all scores for normalization
    
    for para_index, sentences in preprocessed_data.items():
        # Skip paragraph 0 (title)
        if para_index == "0":
            continue
            
        # Convert paragraph index to integer and add 1 to avoid division by zero
        p_idx = int(para_index)
        
        sentence_scores = {}
        for sent_index, _ in sentences.items():
            # Convert sentence index to integer
            s_idx = int(sent_index) + 1
            
            # Calculate location score based on position
            if s_idx == 1:  # First sentence in paragraph
                location_score = 1 / p_idx
            else:  # Not the first sentence
                location_score = 1 / (p_idx * s_idx)
            
            sentence_scores[sent_index] = location_score
            all_scores.append(location_score)  # Add to collection for normalization
            
        paragraph_scores[para_index] = sentence_scores
    
    # Normalize scores within the file
    if all_scores:
        min_score = min(all_scores)
        max_score = max(all_scores)
        score_range = max_score - min_score
        
        # Normalize each score
        if score_range > 0:
            for para_index in paragraph_scores:
                for sent_index in paragraph_scores[para_index]:
                    old_score = paragraph_scores[para_index][sent_index]
                    normalized_score = (old_score - min_score) / score_range
                    paragraph_scores[para_index][sent_index] = normalized_score
        else:
            # If all scores are the same, assign a default normalized value
            default_value = 0.5 if max_score > 0 else 0
            for para_index in paragraph_scores:
                for sent_index in paragraph_scores[para_index]:
                    paragraph_scores[para_index][sent_index] = default_value
    
    return paragraph_scores

# Process all files in the directory (using either preprocessed_dl or preprocessed_classical)
preprocessed_folder = "/kaggle/working/preprocessed_classical"
json_files = [f for f in os.listdir(preprocessed_folder) if f.endswith('.json')]

# Store all results in this dictionary
all_location_scores = {}

# Process each file
for json_file in tqdm(json_files, desc="Calculating sentence location scores"):
    file_path = os.path.join(preprocessed_folder, json_file)
    
    # Calculate location scores for this file
    scores = calculate_sentence_location_scores(file_path)
    
    # Store results in dictionary
    all_location_scores[json_file] = scores

print(f"✅ Completed sentence location scoring for {len(json_files)} files")
print(f"Dictionary structure: {len(all_location_scores)} files with sentence location scores")

# Print sentence length scores for file1.json if it exists
file_name = "file1.json"
if file_name in all_location_scores:
   print(f"\n📂 Sentence location scores for {file_name}:")
   print(json.dumps(all_location_scores[file_name], indent=4, ensure_ascii=False))
else:
   print(f"\n⚠️ {file_name} not found in all_location_scores.")

Calculating sentence location scores: 100%|██████████| 153/153 [00:00<00:00, 10863.13it/s]

✅ Completed sentence location scoring for 153 files
Dictionary structure: 153 files with sentence location scores

📂 Sentence location scores for file1.json:
{
    "1": {
        "0": 1.0,
        "1": 0.4893617021276596,
        "2": 0.3191489361702128,
        "3": 0.23404255319148937
    },
    "2": {
        "0": 0.4893617021276596,
        "1": 0.23404255319148937
    },
    "3": {
        "0": 0.3191489361702128,
        "1": 0.14893617021276595,
        "2": 0.0921985815602837
    },
    "4": {
        "0": 0.23404255319148937,
        "1": 0.10638297872340427,
        "2": 0.06382978723404256,
        "3": 0.04255319148936171,
        "4": 0.029787234042553196,
        "5": 0.02127659574468085
    },
    "5": {
        "0": 0.18297872340425533,
        "1": 0.08085106382978725,
        "2": 0.04680851063829788,
        "3": 0.029787234042553196,
        "4": 0.019574468085106385,
        "5": 0.012765957446808512,
        "6": 0.007902735562310031
    },
    "6": {
        "0":




# 🔑 Cue Words Score

In [10]:
def load_cue_words(file_path):
    """Load cue words from file and normalize them"""
    with open(file_path, "r", encoding="utf-8") as f:
        cue_words = [normalize_arabic(line.strip()) for line in f.readlines()]
    return cue_words

def count_cue_words(text, cue_words):
    """Count cue words in normalized text"""
    count = 0
    for cue_word in cue_words:
        # Count occurrences of the cue word in the normalized text
        count += text.count(cue_word)
    return count

def calculate_cue_word_scores(file_path, cue_words):
    """Calculate cue word scores for a specific file"""
    with open(file_path, "r", encoding="utf-8") as f:
        preprocessed_data = json.load(f)
    
    paragraph_scores = {}
    all_scores = []  # To collect all scores for normalization
    
    for para_index, sentences in preprocessed_data.items():
        # Skip paragraph 0 (title)
        if para_index == "0":
            continue
            
        # Count total cue words in the paragraph
        paragraph_cue_count = 0
        for _, sentence in sentences.items():
            paragraph_cue_count += count_cue_words(sentence, cue_words)
        
        sentence_scores = {}
        for sent_index, sentence in sentences.items():
            # Count cue words in this sentence
            sentence_cue_count = count_cue_words(sentence, cue_words)
            
            # Calculate cue word score
            if paragraph_cue_count > 0:
                cue_score = sentence_cue_count / paragraph_cue_count
            else:
                cue_score = 0
            
            sentence_scores[sent_index] = cue_score
            all_scores.append(cue_score)  # Add to collection for normalization
            
        paragraph_scores[para_index] = sentence_scores
    
    # Normalize scores within the file
    if all_scores:
        min_score = min(all_scores)
        max_score = max(all_scores)
        score_range = max_score - min_score
        
        # Normalize each score
        if score_range > 0:
            for para_index in paragraph_scores:
                for sent_index in paragraph_scores[para_index]:
                    old_score = paragraph_scores[para_index][sent_index]
                    normalized_score = (old_score - min_score) / score_range
                    paragraph_scores[para_index][sent_index] = normalized_score
        else:
            # If all scores are the same, assign a default normalized value
            default_value = 0.5 if max_score > 0 else 0
            for para_index in paragraph_scores:
                for sent_index in paragraph_scores[para_index]:
                    paragraph_scores[para_index][sent_index] = default_value
    
    return paragraph_scores

# Load cue words and normalize them
cue_words_path = "/kaggle/input/cue-words-list/arabic_cue_words.txt"
cue_words = load_cue_words(cue_words_path)
print(f"Loaded {len(cue_words)} normalized cue words")

# For cue words, let's use the original_sentences folder since we want to match exact phrases
# This will give us the most accurate count of cue words
preprocessed_folder = "/kaggle/working/preprocessed_dl"
json_files = [f for f in os.listdir(preprocessed_folder) if f.endswith('.json')]

# Store all results in this dictionary
all_cue_scores = {}

# Process each file
for json_file in tqdm(json_files, desc="Calculating cue word scores"):
    file_path = os.path.join(preprocessed_folder, json_file)
    
    # Calculate cue word scores for this file
    scores = calculate_cue_word_scores(file_path, cue_words)
    
    # Store results in dictionary
    all_cue_scores[json_file] = scores

print(f"✅ Completed cue word scoring for {len(json_files)} files")
print(f"Dictionary structure: {len(all_cue_scores)} files with cue word scores")

# Print sentence length scores for file1.json if it exists
file_name = "file13.json"
if file_name in all_cue_scores:
    print(f"\n📂 Sentence cue word scores for {file_name}:")
    print(json.dumps(all_cue_scores[file_name], indent=4, ensure_ascii=False))
else:
    print(f"\n⚠️ {file_name} not found in all_cue_scores.")

Loaded 80 normalized cue words


Calculating cue word scores: 100%|██████████| 153/153 [00:00<00:00, 1000.17it/s]

✅ Completed cue word scoring for 153 files
Dictionary structure: 153 files with cue word scores

📂 Sentence cue word scores for file13.json:
{
    "1": {
        "0": 0.3333333333333333,
        "1": 0.3333333333333333,
        "2": 0.3333333333333333
    },
    "2": {
        "0": 1.0,
        "1": 0.0
    },
    "3": {
        "0": 1.0,
        "1": 0.0,
        "2": 0.0,
        "3": 0.0
    },
    "4": {
        "0": 0.0,
        "1": 0.0,
        "2": 0.5,
        "3": 0.5
    },
    "5": {
        "0": 1.0
    },
    "6": {
        "0": 0.0
    },
    "7": {
        "0": 0.0
    }
}





# 🔢 Numarical Data Score

In [11]:
def count_numerical_data(text):
    """Count numerical data in text (both Arabic and English digits)"""
    # Match both Arabic and English digits
    pattern = r'[\u0660-\u0669\d]+'
    matches = re.findall(pattern, text)
    return len(matches)

def calculate_numerical_data_scores(file_path):
    """Calculate numerical data scores for a specific file"""
    with open(file_path, "r", encoding="utf-8") as f:
        preprocessed_data = json.load(f)
    
    paragraph_scores = {}
    all_scores = []  # To collect all scores for normalization
    
    for para_index, sentences in preprocessed_data.items():
        # Skip paragraph 0 (title)
        if para_index == "0":
            continue
            
        # Count total numerical data in the paragraph
        paragraph_numerical_count = 0
        for _, sentence in sentences.items():
            paragraph_numerical_count += count_numerical_data(sentence)
        
        sentence_scores = {}
        for sent_index, sentence in sentences.items():
            # Count numerical data in this sentence
            sentence_numerical_count = count_numerical_data(sentence)
            
            # Calculate numerical data score
            if paragraph_numerical_count > 0:
                numerical_score = sentence_numerical_count / paragraph_numerical_count
            else:
                numerical_score = 0
            
            sentence_scores[sent_index] = numerical_score
            all_scores.append(numerical_score)  # Add to collection for normalization
            
        paragraph_scores[para_index] = sentence_scores
    
    # Normalize scores within the file
    if all_scores:
        min_score = min(all_scores)
        max_score = max(all_scores)
        score_range = max_score - min_score
        
        # Normalize each score
        if score_range > 0:
            for para_index in paragraph_scores:
                for sent_index in paragraph_scores[para_index]:
                    old_score = paragraph_scores[para_index][sent_index]
                    normalized_score = (old_score - min_score) / score_range
                    paragraph_scores[para_index][sent_index] = normalized_score
        else:
            # If all scores are the same, assign a default normalized value
            default_value = 0.5 if max_score > 0 else 0
            for para_index in paragraph_scores:
                for sent_index in paragraph_scores[para_index]:
                    paragraph_scores[para_index][sent_index] = default_value
    
    return paragraph_scores

# Process all files in the directory
preprocessed_folder = "/kaggle/working/original_sentences"
json_files = [f for f in os.listdir(preprocessed_folder) if f.endswith('.json')]

# Store all results in this dictionary
all_numerical_scores = {}

# Process each file
for json_file in tqdm(json_files, desc="Calculating numerical data scores"):
    file_path = os.path.join(preprocessed_folder, json_file)
    
    # Calculate numerical data scores for this file
    scores = calculate_numerical_data_scores(file_path)
    
    # Store results in dictionary
    all_numerical_scores[json_file] = scores

print(f"✅ Completed numerical data scoring for {len(json_files)} files")
print(f"Dictionary structure: {len(all_numerical_scores)} files with numerical data scores")

# Print sentence length scores for file1.json if it exists
file_name = "file1.json"
if file_name in all_numerical_scores:
    print(f"\n📂 Sentence numerical data scores for {file_name}:")
    print(json.dumps(all_numerical_scores[file_name], indent=4, ensure_ascii=False))
else:
    print(f"\n⚠️ {file_name} not found in all_numerical_scores.")

Calculating numerical data scores: 100%|██████████| 153/153 [00:00<00:00, 3360.49it/s]

✅ Completed numerical data scoring for 153 files
Dictionary structure: 153 files with numerical data scores

📂 Sentence numerical data scores for file1.json:
{
    "1": {
        "0": 0.5,
        "1": 0.0,
        "2": 0.0,
        "3": 0.5
    },
    "2": {
        "0": 0.0,
        "1": 0.0
    },
    "3": {
        "0": 0.0,
        "1": 0.0,
        "2": 1.0
    },
    "4": {
        "0": 0.8,
        "1": 0.2,
        "2": 0.0,
        "3": 0.0,
        "4": 0.0,
        "5": 0.0
    },
    "5": {
        "0": 1.0,
        "1": 0.0,
        "2": 0.0,
        "3": 0.0,
        "4": 0.0,
        "5": 0.0,
        "6": 0.0
    },
    "6": {
        "0": 1.0,
        "1": 0.0,
        "2": 0.0,
        "3": 0.0,
        "4": 0.0,
        "5": 0.0,
        "6": 0.0,
        "7": 0.0
    }
}





# 📊 Final Statistical Score

In [12]:
# Combine all statistical scores (length, location, cue, numerical) and normalize
def combine_and_normalize_statistical_scores():
    """
    Combine length, location, cue word, and numerical data scores.
    Then normalize the combined scores within each document.
    """
    # Dictionary to store all combined scores
    all_combined_scores = {}
    
    # Get all files from one of the dictionaries (they should all have the same files)
    json_files = all_length_scores.keys()
    
    # Process each file
    for json_file in tqdm(json_files, desc="Combining statistical scores"):
        combined_scores = {}
        
        # Process paragraphs and sentences
        for para_index in all_length_scores[json_file]:
            combined_scores[para_index] = {}
            
            for sent_index in all_length_scores[json_file][para_index]:
                # Sum all four statistical scores
                length_score = all_length_scores[json_file][para_index].get(sent_index, 0)
                location_score = all_location_scores[json_file][para_index].get(sent_index, 0)
                cue_score = all_cue_scores[json_file][para_index].get(sent_index, 0)
                numerical_score = all_numerical_scores[json_file][para_index].get(sent_index, 0)
                
                # Calculate total score (simple sum)
                total_score = length_score + location_score + cue_score + numerical_score
                
                # Store the combined score
                combined_scores[para_index][sent_index] = total_score
        
        # Normalize scores for this document
        # Find min and max scores across all sentences in the document
        # min_score = float('inf')
        # max_score = float('-inf')
        
        # for para_scores in combined_scores.values():
        #     for score in para_scores.values():
        #         min_score = min(min_score, score)
        #         max_score = max(max_score, score)
        
        # Normalize scores (handle case where all scores are the same)
        # normalized_scores = {}
        # score_range = max_score - min_score
        
        # for para_index, para_scores in combined_scores.items():
        #     normalized_scores[para_index] = {}
            
        #     for sent_index, score in para_scores.items():
        #         if score_range > 0:
        #             normalized_score = (score - min_score) / score_range
        #         else:
        #             # If all scores are the same, assign a default value
        #             normalized_score = 0.5 if max_score > 0 else 0
                
        #         normalized_scores[para_index][sent_index] = normalized_score
        
        # Store normalized scores in the final dictionary
        all_combined_scores[json_file] = combined_scores
    
    return all_combined_scores

# Calculate combined and normalized statistical scores
all_statistical_scores = combine_and_normalize_statistical_scores()

print(f"✅ Completed combining statistical scores for {len(all_statistical_scores)} files")
print(f"Dictionary structure: {len(all_statistical_scores)} files with combined statistical scores")

# Print sentence length scores for file1.json if it exists
file_name = "file1.json"
if file_name in all_statistical_scores:
    print(f"\n📂 Sentence  statistical scores for {file_name}:")
    print(json.dumps(all_statistical_scores[file_name], indent=4, ensure_ascii=False))
else:
    print(f"\n⚠️ {file_name} not found in all_statistical_scores.")

Combining statistical scores: 100%|██████████| 153/153 [00:00<00:00, 39512.87it/s]

✅ Completed combining statistical scores for 153 files
Dictionary structure: 153 files with combined statistical scores

📂 Sentence  statistical scores for file1.json:
{
    "1": {
        "0": 1.6549643374287604,
        "1": 0.5906401667147722,
        "2": 0.3226699855226716,
        "3": 0.7638910506368168
    },
    "2": {
        "0": 0.6939901368598995,
        "1": 0.2516054747747518
    },
    "3": {
        "0": 0.566866705736947,
        "1": 0.15065704580041386,
        "2": 1.0921985815602837
    },
    "4": {
        "0": 1.371379459575078,
        "1": 0.523103199652973,
        "2": 0.28949116792698515,
        "3": 1.0425531914893618,
        "4": 0.1582591862324142,
        "5": 0.16249725989640054
    },
    "5": {
        "0": 2.3100548855072343,
        "1": 1.3187455501491647,
        "2": 0.11702341614776907,
        "3": 0.32886675467139415,
        "4": 0.13440350835164339,
        "5": 0.532105437292411,
        "6": 0.0839526678334801
    },
    "6": {
      




# 🗝️📊 Store both Keyphrase and statistical Scores

In [13]:
# Add statistical scores to the existing keyphrase score files
def combine_keyphrase_and_statistical_scores():
    """
    Read keyphrase score files, add statistical scores, and save the updated files.
    Each sentence will have two scores: keyphrase score and statistical score.
    """
    # Path to the folder containing keyphrase scores
    keyphrase_scores_folder = "/kaggle/working/sentence_scores"
    
    # Create output folder if it doesn't exist (we'll use the same folder)
    os.makedirs(keyphrase_scores_folder, exist_ok=True)
    
    # Get all JSON files in the keyphrase scores folder
    json_files = [f for f in os.listdir(keyphrase_scores_folder) if f.endswith('.json')]
    
    # Process each file
    for json_file in tqdm(json_files, desc="Adding statistical scores to keyphrase files"):
        # Skip files that don't exist in the statistical scores dictionary
        if json_file not in all_statistical_scores:
            print(f"⚠️ Skipping {json_file} - not found in statistical scores.")
            continue
        
        # Load the keyphrase scores file
        file_path = os.path.join(keyphrase_scores_folder, json_file)
        with open(file_path, "r", encoding="utf-8") as f:
            keyphrase_scores = json.load(f)
        
        # Create a new dictionary to store both scores
        combined_scores = {}
        
        # Process each paragraph and sentence
        for para_index, para_keyphrase_scores in keyphrase_scores.items():
            combined_scores[para_index] = {}
            
            for sent_index, keyphrase_score in para_keyphrase_scores.items():
                # Get the statistical score for this sentence
                statistical_score = all_statistical_scores.get(json_file, {}).get(para_index, {}).get(sent_index, 0)
                
                # Store both scores in a dictionary
                combined_scores[para_index][sent_index] = {
                    "keyphrase_score": keyphrase_score,
                    "statistical_score": statistical_score
                }
        
        # Save the updated scores to the same file
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(combined_scores, f, ensure_ascii=False, indent=2)
    
    return f"✅ Added statistical scores to {len(json_files)} keyphrase score files."

# Execute the function
result_message = combine_keyphrase_and_statistical_scores()
print(result_message)

# Verify structure of the updated files
sample_file = [f for f in os.listdir("/kaggle/working/sentence_scores") if f.endswith('.json')][0]
with open(os.path.join("/kaggle/working/sentence_scores", sample_file), "r", encoding="utf-8") as f:
    sample_data = json.load(f)
# Verify structure of file1.json after adding statistical scores
file1_path = os.path.join("/kaggle/working/sentence_scores", "file1.json")
if os.path.exists(file1_path):
    with open(file1_path, "r", encoding="utf-8") as f:
        file1_content = json.load(f)
    print("📄 Content of file1.json after adding statistical scores:")
    print(json.dumps(file1_content, ensure_ascii=False, indent=2))
else:
    print("⚠️ file1.json not found in sentence_scores folder.")

Adding statistical scores to keyphrase files: 100%|██████████| 153/153 [00:00<00:00, 2574.15it/s]

✅ Added statistical scores to 153 keyphrase score files.
📄 Content of file1.json after adding statistical scores:
{
  "1": {
    "0": {
      "keyphrase_score": 0.022355814390381354,
      "statistical_score": 1.6549643374287604
    },
    "1": {
      "keyphrase_score": 0.39531279353747883,
      "statistical_score": 0.5906401667147722
    },
    "2": {
      "keyphrase_score": 0.09952094683449182,
      "statistical_score": 0.3226699855226716
    },
    "3": {
      "keyphrase_score": 0.3387187676122488,
      "statistical_score": 0.7638910506368168
    }
  },
  "2": {
    "0": {
      "keyphrase_score": 0.7082941950028179,
      "statistical_score": 0.6939901368598995
    },
    "1": {
      "keyphrase_score": 0.69007138831486,
      "statistical_score": 0.2516054747747518
    }
  },
  "3": {
    "0": {
      "keyphrase_score": 0.03386248356190121,
      "statistical_score": 0.566866705736947
    },
    "1": {
      "keyphrase_score": 1.0,
      "statistical_score": 0.15065704580041




## 📄📌 Here the folder "/kaggle/working/sentence_scores" must have both keyphrase scores and statistical scores for each sentence in each file