<a href="https://colab.research.google.com/github/hussainezzi/Arabic-NLP/blob/main/knowledge_Graph_of_Ashaar_Arabic_Peotry_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers torch stanza networkx pyvis arabic-reshaper

import re
import networkx as nx
from pyvis.network import Network
from datasets import load_dataset
import stanza
from transformers import pipeline

# Load the Ashaar dataset and keep only the 'poem verses' field of every
# training record (each entry is iterated verse-by-verse further down).
dataset = load_dataset("arbml/ashaar")
poem_verses = [poem['poem verses'] for poem in dataset['train']]

# Initialize Arabic NLP processors
stanza.download('ar')
# 'pos' and 'lemma' feed the entity/relation extraction; 'depparse' provides
# the dependency labels used to build relations.
nlp = stanza.Pipeline('ar', processors='tokenize,mwt,pos,lemma,depparse,ner')

# Initialize NER pipeline for Arabic.
# The previously used "asafaya/bert-base-arabic" is a base checkpoint without
# a token-classification head: transformers initializes the classifier layer
# randomly and warns that the model must be trained first, so its NER tags
# are noise. Use a checkpoint that is already fine-tuned for Arabic NER.
# No aggregation strategy is set so each result keeps the 'word', 'entity'
# and 'score' keys.
ner_pipeline = pipeline("ner", model="CAMeL-Lab/bert-base-arabic-camelbert-msa-ner")

def preprocess_text(text):
    """Clean and normalize Arabic text.

    Strips tatweel (U+0640) and the combining marks in U+064B-U+065F, then
    removes every character that is neither a word character nor whitespace,
    and finally trims leading/trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string; it may be empty when the input contained nothing
        but marks, punctuation, or whitespace.
    """
    # Tatweel is a letter-category character, so the punctuation pass below
    # would keep it; remove it here so elongated spellings of a word collapse
    # to the same string.
    text = re.sub(r'[\u0640\u064B-\u065F]', '', text)  # Remove tatweel and diacritics
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.strip()

def extract_entities(verse):
    """Extract entities using both NER and linguistic patterns.

    Combines two sources:
      1. Results of ``ner_pipeline`` whose score is above 0.8, recorded as
         ``(word, entity_tag)``.
      2. Words that the Stanza pipeline tags as ``PROPN`` or ``NOUN`` and
         whose text has not already been recorded, stored as ``(text, upos)``.

    Args:
        verse: Text of a single verse.

    Returns:
        A list of unique ``(text, label)`` tuples in first-seen order. An
        empty or whitespace-only verse yields an empty list without invoking
        either model.
    """
    if not verse or not verse.strip():
        return []

    entities = []
    # Texts already recorded; kept as a set so the per-word membership test
    # does not rebuild a list on every iteration.
    seen_texts = set()

    # Using BERT NER
    for entity in ner_pipeline(verse):
        if entity['score'] > 0.8:
            entities.append((entity['word'], entity['entity']))
            seen_texts.add(entity['word'])

    # Using Stanza for linguistic analysis
    doc = nlp(verse)
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.upos in ('PROPN', 'NOUN') and word.text not in seen_texts:
                entities.append((word.text, word.upos))
                seen_texts.add(word.text)

    # dict.fromkeys de-duplicates identical tuples while keeping a
    # deterministic (insertion) order.
    return list(dict.fromkeys(entities))

def extract_relations(verse, entities):
    """Extract relationships using dependency parsing.

    For every word whose dependency relation is ``nsubj`` and that has a
    non-root head, emits a triple linking the subject to its head word.

    Args:
        verse: Text of a single verse.
        entities: Entities previously extracted for the verse. Currently
            unused; kept so existing callers keep working.

    Returns:
        A list of ``(subject_text, relation, head_text)`` tuples, where
        ``relation`` is the head word's lemma (or its surface text when no
        lemma is available). An empty or whitespace-only verse yields an
        empty list without invoking the parser.
    """
    if not verse or not verse.strip():
        return []

    relations = []
    doc = nlp(verse)

    for sentence in doc.sentences:
        for word in sentence.words:
            # head == 0 marks the sentence root, which has no governing word.
            if word.deprel != 'nsubj' or word.head <= 0:
                continue
            # Word ids are 1-based positions within the sentence.
            head_word = sentence.words[word.head - 1]
            # Fall back to the surface form so the relation is never None.
            relation = head_word.lemma or head_word.text
            relations.append((word.text, relation, head_word.text))

    return relations

# Build knowledge graph.
# A MultiDiGraph is used so several relations between the same pair of
# words are kept as separate edges.
kg = nx.MultiDiGraph()

for poem in poem_verses[:100]:  # Process first 100 poems for demo
    for verse in poem:  # Iterate through each verse in the poem
        cleaned_verse = preprocess_text(verse)
        if not cleaned_verse:
            # Nothing left after cleaning; skip instead of running the models.
            continue

        entities = extract_entities(cleaned_verse)
        relations = extract_relations(cleaned_verse, entities)

        # The graph is updated inside the verse loop so every verse
        # contributes; doing it after the loop would keep only the last
        # verse of each poem (and fail on a poem with no verses).

        # Add entities to graph
        # NOTE(review): `label` carries the entity tag rather than the word;
        # confirm this is the caption intended in the visualization.
        for entity, label in entities:
            kg.add_node(entity, label=label, title=entity)

        # Add relations to graph
        for source, relation, target in relations:
            kg.add_edge(source, target, title=relation, label=relation)

# Render the knowledge graph as an interactive HTML page
net = Network(
    height="800px",
    width="100%",
    notebook=True,
    cdn_resources='remote',
)
net.from_nx(kg)  # import the nodes and edges of the NetworkX graph
net.show_buttons(filter_=['physics'])  # show only the physics controls
net.show('arabic_poetry_kg.html')



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ar (Arabic) ...
INFO:stanza:File exists: /root/stanza_resources/ar/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!
Some weights of BertForTokenClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model h