<h2 style="text-align: center">- Lab 1 : Scraping and NLP
Pipeline -</h2>

# Scraping an Arabic website and storing the data in MongoDB

In [None]:
from bs4 import BeautifulSoup
import requests
import pymongo
import re

# Arabic Wikipedia article used as the raw text source for the whole lab.
url = 'https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D9%88%D9%84%D8%A7%D9%8A%D8%A7%D8%AA_%D8%A7%D9%84%D9%85%D8%AA%D8%AD%D8%AF%D8%A9'

# Fail fast on network problems: a timeout prevents the cell from hanging and
# raise_for_status() stops us from parsing (and storing) an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

soup = BeautifulSoup(html_content, 'html.parser')

paragraphs = soup.find_all('p')

# Extract text from paragraphs and preprocess
cleaned_paragraphs = []
for paragraph in paragraphs:
    # Keep only characters of the Arabic Unicode block (U+0600-U+06FF) and whitespace
    cleaned_text = re.sub(r'[^\u0600-\u06FF\s]', '', paragraph.text.strip())
    # Paragraphs with no Arabic content left carry no information downstream
    if cleaned_text.strip():
        cleaned_paragraphs.append(cleaned_text)

# MongoDB connection
# The database name must match the one the later cells read from ('arabic_news').
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['arabic_news']
collection = db['raw_data_1']

# Insert cleaned paragraphs into MongoDB
# NOTE: insert_many appends, so re-running this cell stores the paragraphs again.
data_to_insert = [{'paragraph': p} for p in cleaned_paragraphs]
if data_to_insert:
    # insert_many rejects an empty document list, hence the guard
    collection.insert_many(data_to_insert)

print(f'{len(cleaned_paragraphs)} paragraphs inserted into MongoDB.')


# Text cleaning, including: tokenization, stop-word removal, normalization, discretization, stemming, and lemmatization

In [2]:
import re
import qalsadi.lemmatizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import ISRIStemmer
from nltk.tokenize import word_tokenize
import pymongo

# Fetch the tokenizer models and stop-word lists used by the preprocessing below.
# Recent NLTK releases (3.9+) load the Punkt tokenizer from 'punkt_tab', while
# older releases use the pickled 'punkt' package, so both are downloaded.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Heavy NLP resources are built once and shared by every preprocess_text call
# instead of being re-created for each paragraph.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared stop-word set, stemmer and lemmatizer, creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('arabic'))
        _PREPROCESS_RESOURCES['stemmer'] = ISRIStemmer()
        _PREPROCESS_RESOURCES['lemmatizer'] = qalsadi.lemmatizer.Lemmatizer()
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Clean one paragraph and return its lemmas and stems.

    Steps, in order: digit runs are replaced by a placeholder, every
    character that is neither a word character nor whitespace is dropped,
    the text is lower-cased, tokenized with NLTK's ``word_tokenize`` and
    filtered against NLTK's Arabic stop-word list.  The surviving tokens
    are then stemmed (ISRI stemmer) and, independently, lemmatized
    (qalsadi).

    Args:
        text: The paragraph to process.

    Returns:
        A ``(lemmas, stemmed_tokens)`` tuple: the result of qalsadi's
        ``lemmatize_text`` on the filtered tokens, and the list of ISRI
        stems of those same tokens.
    """
    resources = _get_preprocess_resources()

    # Discretization
    discretized_text = re.sub(r'\d+', '<رقم>', text)

    # Text Cleaning
    # Note: this also strips the '<' and '>' around the number placeholder.
    cleaned_text = re.sub(r'[^\w\s]', '', discretized_text)
    cleaned_text = cleaned_text.lower()

    # Tokenization
    tokens = word_tokenize(cleaned_text)

    # Stop Words Removal
    stop_words = resources['stop_words']
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = resources['stemmer']
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    # Lemmatization using qalsadi library (works on the un-stemmed tokens)
    lemmatizer = resources['lemmatizer']
    lemmas = lemmatizer.lemmatize_text(' '.join(filtered_tokens))

    return lemmas, stemmed_tokens

# Number of preprocessed paragraphs to preview in the cell output.
PREVIEW_LIMIT = 5

# Read back the paragraphs stored by the scraping cell.
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['arabic_news']
collection = db['raw_data_1']

data_from_mongodb = [document['paragraph'] for document in collection.find()]

# Every paragraph is preprocessed; only the first few are printed below.
preprocessed_data = [preprocess_text(paragraph) for paragraph in data_from_mongodb]

# Print preprocessed data
for lemmas, stemmed_tokens in preprocessed_data[:PREVIEW_LIMIT]:
    print()
    print("(************ Lemmatization ************)")
    print(lemmas)
    print("(************    Stemming   ************)")
    print(stemmed_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



(************ Lemmatization ************)
['ولاية', 'متحد', 'أمريكي', 'اختصار', 'أمريكا', 'بالإنجليزية', 'يونايتد', 'ستيتس', 'وفى', 'أميركا', 'هي', 'جمهور', 'دستور', 'اتحاد', 'ضام', 'ولاية', 'منطق', 'عاصم', 'اتحاد', 'أقعى', 'معظم', 'بلاد', 'وسط', 'أمريكا', 'شمال', 'أقعى', 'ولاية', 'وواشنطن', 'عاصم', 'محيط', 'هادئ', 'محيط', 'أطلس', 'تحدى', 'كند', 'شمال', 'مكسيك', 'جنوب', 'أقعى', 'ولاية', 'ألاسكا', 'شمال', 'غرب', 'قار', 'تحدى', 'كند', 'شرق', 'روس', 'غرب', 'عبر', 'مضيق', 'بيرينغ', 'ولاية', 'هاواي', 'تعدي', 'أرخبيل', 'أقعى', 'منتصف', 'محيط', 'هادئ', 'ضام', 'دول', 'عدد', 'أراضي', 'جزر', 'كاريبي', 'محيط', 'هادئ']
(************    Stemming   ************)
['ولي', 'تحد', 'امر', 'واختصارا', 'امر', 'انجليزية', 'يتد', 'يتس', 'اوف', 'امر', 'وهي', 'جمهور', 'دستور', 'تحد', 'تضم', 'ولي', 'نطق', 'عصم', 'تحد', 'تقع', 'عظم', 'بلد', 'وسط', 'امر', 'شمل', 'تقع', 'ولي', 'شنط', 'عصم', 'حيط', 'هدئ', 'حيط', 'طلس', 'تحد', 'كند', 'شمل', 'كسك', 'جنب', 'تقع', 'ولي', 'لسك', 'شمل', 'غرب', 'قرة', 'تحد', 'كند', 'شرق'

#### <b>A brief comparison of stemming and lemmatization:</b>
-   Stemming strips affixes (prefixes and suffixes) from a word using fixed rules, which often leads to incorrect meanings and spelling.
-   Lemmatization takes context into account and converts the word into its meaningful base form, called the lemma.

<strong><b>Advantages and limitations:</b></strong>
-   Stemming is faster than lemmatization because it involves simple rule-based truncation of words.
-   It effectively reduces words to their root form, which can be useful for tasks where speed is crucial.
    However, it can produce stems that are not valid words in the language, leading to a potential loss of meaning.
-   It does not always take into account the grammatical context of words, which can lead to inaccuracies.
-   On the other hand, lemmatization produces valid words in the language by mapping words to their dictionary form, thus ensuring correctness.
-   It takes into account the grammatical context of words, leading to more meaningful results.

# Rule-based POS tagging using regular expressions:

In [3]:
import re
import pymongo

def pos_tagging_rule_based_arabic(text):
    """Tag each whitespace-separated word of ``text`` using regex heuristics.

    The rules are tried in the order noun, verb, adjective; each regex is
    anchored at the start of the word (``re.match``) and the first rule
    that matches decides the tag.  Words matching no rule are tagged
    ``'other'``.

    NOTE(review): both adjective alternatives begin with the same
    ``ال?[^\\s]+`` prefix as the noun rule, which is tried first, so words
    are very unlikely to ever receive the 'adjective' tag -- confirm the
    intended rule order.

    Args:
        text: The text to tag.

    Returns:
        A list of ``(word, tag)`` tuples in the order the words appear.
    """
    # Ordered (tag, regex) rules; earlier entries take precedence.
    rules = (
        ('noun', r'ال?[^\s]+ة?\b'),
        ('verb', r'(?:\bي[^\s]+|يت[^\s]+|ت[^\s]+|است[^\s]+|س[^\s]+|تن[^\s]+)\b'),
        ('adjective', r'(?:\bال?[^\s]+ي[^\s]+|ال?[^\s]+ة\b)'),
    )

    def classify(token):
        """Return the tag of the first rule matching ``token``, else 'other'."""
        for label, regex in rules:
            if re.match(regex, token):
                return label
        return 'other'

    return [(token, classify(token)) for token in text.split()]

# Number of paragraphs to tag and preview in the cell output.
PREVIEW_LIMIT = 5

# Read back the paragraphs stored by the scraping cell.
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['arabic_news']
collection = db['raw_data_1']

texts = [document['paragraph'] for document in collection.find()]

# Apply POS tagging on the first few texts only
for text in texts[:PREVIEW_LIMIT]:
    pos_tags = pos_tagging_rule_based_arabic(text)
    print("POS tags for the text:")
    print(pos_tags)

POS tags for the text:
[('الوِلَايَات', 'noun'), ('المُتَّحِدَة', 'noun'), ('الأَمرِيكِيَّة', 'noun'), ('واختصارًا', 'other'), ('أمريكا', 'other'), ('بالإنجليزية', 'other'), ('يونايتد', 'verb'), ('ستيتس', 'verb'), ('أوف', 'other'), ('أميركا،', 'other'), ('وهِي', 'other'), ('جُمهُورِيّة', 'other'), ('دُستُورِيّة', 'other'), ('اِتِّحادِيّة', 'noun'), ('تضمُّ', 'verb'), ('خمسِين', 'other'), ('وِلاية', 'other'), ('ومِنطقة', 'other'), ('العاصِمة', 'noun'), ('الاتّحادية', 'noun'), ('تقع', 'verb'), ('مُعظم', 'other'), ('البِلادِ', 'noun'), ('في', 'other'), ('وسط', 'other'), ('أَمريكا', 'other'), ('الشمالِيَّة،', 'noun'), ('حيثُ', 'other'), ('تقع', 'verb'), ('وِلاية،', 'other'), ('ووَاشِنطُن', 'other'), ('العاصِمة', 'noun'), ('بين', 'other'), ('المُحِيطُ', 'noun'), ('الهادِئ،', 'noun'), ('والمُحِيطُ', 'other'), ('الأطلسي،', 'noun'), ('وتحُدُّها', 'other'), ('كندا', 'other'), ('شمالًا', 'other'), ('والمَكْسِيك', 'other'), ('جنُوبًا', 'other'), ('تقع', 'verb'), ('وِلاية', 'other'), ('أَلاسْكا', 

# POS tagging with the Stanza library's pretrained Arabic model (machine-learning approach):

In [4]:
import pymongo
import stanza

# Connect to the local MongoDB instance and select the scraped-paragraph collection.
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['arabic_news']
collection = db['raw_data_1']

# Pull the 'paragraph' field of every stored document into a plain list.
texts_from_db = list(doc['paragraph'] for doc in collection.find())

def pos_tagging_stanza(texts):
    """POS-tag every text with a Stanza Arabic pipeline.

    A single pipeline (tokenize + pos processors) is created per call and
    reused for all texts.

    Args:
        texts: Iterable of strings to tag.

    Returns:
        One list per input text, each holding ``(word_text, upos)`` tuples
        for every word of every sentence Stanza found in that text.
    """
    pipeline = stanza.Pipeline(lang='ar', processors='tokenize,pos')

    def tag_document(raw_text):
        """Run the pipeline on one text and flatten its sentences into tag pairs."""
        parsed = pipeline(raw_text)
        tagged = []
        for sentence in parsed.sentences:
            for word in sentence.words:
                tagged.append((word.text, word.upos))
        return tagged

    return [tag_document(raw_text) for raw_text in texts]

# Number of tagged texts to preview in the cell output.
PREVIEW_LIMIT = 5

# All texts are tagged; only the first few results are printed below.
pos_tags_stanza = pos_tagging_stanza(texts_from_db)

for text_number, text_pos_tags in enumerate(pos_tags_stanza[:PREVIEW_LIMIT], start=1):
    print(f"POS Tags for Text {text_number}:")
    print(text_pos_tags)

2024-04-09 17:21:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-09 17:21:49 INFO: Downloaded file to C:\Users\user\stanza_resources\resources.json
2024-04-09 17:21:53 INFO: Loading these models for language: ar (Arabic):
| Processor | Package     |
---------------------------
| tokenize  | padt        |
| mwt       | padt        |
| pos       | padt_charlm |

2024-04-09 17:21:53 INFO: Using device: cpu
2024-04-09 17:21:53 INFO: Loading: tokenize
2024-04-09 17:22:10 INFO: Loading: mwt
2024-04-09 17:22:10 INFO: Loading: pos
2024-04-09 17:22:12 INFO: Done loading processors!


POS Tags for Text 1:
[('الوِلَايَات', 'X'), ('المُتَّحِدَة', 'ADJ'), ('الأَمرِيكِيَّة', 'ADJ'), ('و', 'CCONJ'), ('اختصارً', 'NOUN'), ('أمريكا', 'X'), ('ب', 'ADP'), ('الإنجليزية', 'NOUN'), ('يونايتد', 'X'), ('س', 'X'), ('تيتس', 'X'), ('أوف', 'X'), ('أمير', 'X'), ('كا', 'X'), ('،', 'PUNCT'), ('و', 'CCONJ'), ('ه<UNK>ي', 'NOUN'), ('جُمهُورِيّة', 'X'), ('دُستُورِيّة', 'X'), ('اِتِّحادِيّة', 'X'), ('تضمُّ', 'VERB'), ('خمسِين', 'NOUN'), ('و', 'CCONJ'), ('<UNK>لاية', 'NOUN'), ('و', 'CCONJ'), ('م<UNK>نطقة', 'NOUN'), ('العاصِمة', 'X'), ('الاتّحادية', 'ADJ'), ('تقع', 'VERB'), ('مُعظم', 'NOUN'), ('البِلادِ', 'NOUN'), ('في', 'ADP'), ('وسط', 'NOUN'), ('أَمريكا', 'X'), ('الشمالِيَّة', 'ADJ'), ('،', 'PUNCT'), ('حيثُ', 'CCONJ'), ('تقع', 'VERB'), ('وِلاية', 'X'), ('،', 'PUNCT'), ('و', 'CCONJ'), ('وَاش<UNK>نطُ', 'NOUN'), ('العاصِمة', 'X'), ('بين', 'ADP'), ('المُحِيطُ', 'NOUN'), ('الهادِئ', 'ADJ'), ('،', 'PUNCT'), ('و', 'CCONJ'), ('المح<UNK><UNK>يط', 'NOUN'), ('الأطلسي', 'X'), ('،', 'PUNCT'), ('و', 'CCONJ

# Named Entity Recognition (NER) using the Flair library

In [5]:
from flair.models import SequenceTagger
from flair.data import Sentence
import pymongo

# The pre-trained model is expensive to load, so it is loaded once and then
# shared by every ner_tagging_arabic call.
_NER_MODEL_NAME = 'megantosh/flair-arabic-multi-ner'
_NER_TAGGER_CACHE = {}


def _get_ner_tagger():
    """Return the shared Flair NER tagger, loading the model on first use only."""
    if 'tagger' not in _NER_TAGGER_CACHE:
        _NER_TAGGER_CACHE['tagger'] = SequenceTagger.load(_NER_MODEL_NAME)
    return _NER_TAGGER_CACHE['tagger']


def ner_tagging_arabic(text):
    """Run the Flair Arabic NER model on ``text``.

    Args:
        text: The text to analyse.

    Returns:
        A list of ``(entity_text, label)`` tuples, one per span the model
        tagged under 'ner'.  Blank input yields an empty list without
        invoking the model.
    """
    # Nothing to tag: skip model loading/prediction for empty paragraphs.
    if not text or not text.strip():
        return []

    tagger = _get_ner_tagger()

    sentence = Sentence(text)

    # predict() annotates the sentence in place
    tagger.predict(sentence)

    ner_tags = [(entity.text, entity.labels[0].value) for entity in sentence.get_spans('ner')]

    return ner_tags

# Number of paragraphs to fetch and run NER on; the query limit and the
# number of processed texts are kept in sync through this single value.
PREVIEW_LIMIT = 5

client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['arabic_news']
collection = db['raw_data_1']

# Only the 'paragraph' field is needed, so '_id' is excluded by the projection.
cursor = collection.find({}, {"_id": 0, "paragraph": 1}).limit(PREVIEW_LIMIT)
texts_from_mongodb = [doc['paragraph'] for doc in cursor]

# Apply NER tagging on each text
for text in texts_from_mongodb:
    ner_tags_arabic = ner_tagging_arabic(text)
    print(ner_tags_arabic)


2024-04-09 17:29:15,910 SequenceTagger predicts: Dictionary with 15 tags: <unk>, O, B-PER, I-PER, B-MISC, I-MISC, B-ORG, I-ORG, B-LOC, I-LOC, B-SPANISH, B-ENGLISH, , <START>, <STOP>
[('أمريكا', 'LOC'), ('يونايتد ستيتس', 'PER'), ('أَمريكا', 'LOC'), ('ووَاشِنطُن', 'LOC'), ('كندا', 'LOC'), ('أَلاسْكا', 'LOC'), ('كندا', 'LOC'), ('بيِرينغ،', 'LOC'), ('هاواي،', 'LOC'), ('الكارِيبِي', 'LOC')]
2024-04-09 17:29:30,232 SequenceTagger predicts: Dictionary with 15 tags: <unk>, O, B-PER, I-PER, B-MISC, I-MISC, B-ORG, I-ORG, B-LOC, I-LOC, B-SPANISH, B-ENGLISH, , <START>, <STOP>
[]
2024-04-09 17:30:11,173 SequenceTagger predicts: Dictionary with 15 tags: <unk>, O, B-PER, I-PER, B-MISC, I-MISC, B-ORG, I-ORG, B-LOC, I-LOC, B-SPANISH, B-ENGLISH, , <START>, <STOP>
[('بيِرينغ', 'LOC'), ('آسيَا', 'LOC'), ('أَلاسْكا', 'LOC'), ('للولايات المتحدة', 'LOC'), ('إليزابيث الأولى،', 'PER')]
2024-04-09 17:30:33,766 SequenceTagger predicts: Dictionary with 15 tags: <unk>, O, B-PER, I-PER, B-MISC, I-MISC, B-ORG, I-ORG

## Lab synthesis

This lab provided valuable hands-on experience in natural language processing techniques, including text preprocessing, POS tagging, NER, and database integration.
We delved into the intricate world of natural language processing (NLP), exploring various techniques to analyze and extract insights from textual data. We began by learning how to retrieve data from web sources using libraries like Requests and Beautiful Soup, storing the data efficiently in a MongoDB database. Next, we tackled the crucial step of text preprocessing, where we cleaned and prepared the data by removing noise, tokenizing, and applying techniques like stemming and lemmatization. Armed with preprocessed text, we ventured into the realm of Part of Speech (POS) tagging, experimenting with both rule-based and machine learning approaches to annotate words with their respective grammatical categories. Finally, we explored Named Entity Recognition (NER), employing powerful pretrained models to identify and classify entities within the text.