In [48]:
from bs4 import BeautifulSoup
import requests
import csv
from gensim.models import FastText
import random

In [49]:
# Root address of the site being crawled.
base_url = 'https://fatabyyano.net'

# The landing page comes first, followed by the paginated listing
# (pages 2 through 178 inclusive).
urls = [base_url]
urls.extend(f'{base_url}/page/{page_number}/' for page_number in range(2, 179))

In [50]:
# Seed vocabulary used as the FastText training corpus: one inner list per
# topic, each holding ten Arabic keywords. Going by the first entry of each
# list the topics are, in order: politics, sports, health, technology.
# A couple of entries (in the sports and technology lists) contain a space;
# each is still a single list element, i.e. one training token.
data = [
    ["سياسة", "انتخابات", "حكومة", "قانون", "سياسي", "ديمقراطية", "قرار", "رئيس", "برلمان", "تصويت"],
    ["رياضة", "كرة القدم", "تنس", "رياضي", "بطولة", "فريق", "مباراة", "مدرب", "لعبة", "جائزة"],
    ["صحة", "طب", "مرض", "علاج", "مستشفى", "دواء", "وباء", "نفسي", "صحي", "تغذية"],
    ["تكنولوجيا", "ابتكار", "تقنية", "برمجة", "إنترنت", "جهاز", "تطبيق", "شبكة", "ذكاء اصطناعي", "تحديث"]
]

In [51]:
# Collapse the per-topic keyword lists into one flat list and shuffle it.
# NOTE(review): `flattened_data` is not passed to FastText below -- the model
# is trained on the nested `data` lists, one list per "sentence".
flattened_data = sum(data, [])
random.shuffle(flattened_data)

# Skip-gram (sg=1) FastText model; min_count=1 keeps every keyword even though
# each one occurs only once in the corpus.
model = FastText(
    sentences=data,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)

# Persist the trained model to the current working directory.
model.save('fasttext_model.model')

In [52]:
def scrape_page(url, timeout=30):
    """Scrape one listing page and return its fact-check entries.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error instead of blocking indefinitely. Defaults to 30.

    Returns
    -------
    list of dict
        One dict per article with the keys ``"title"``, ``"time"``,
        ``"fake or real"`` and ``"topic"``. Articles missing an ``h2``,
        ``time`` or ``span`` element are skipped rather than raising.
    """
    # Labels whose presence in an article's first <span> marks it as fake.
    # Built once per call instead of once per card.
    fake_keywords = ["مضلل", "زائف", "زائف جزئي"]

    # Without a timeout a single stalled connection would hang the whole crawl.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'lxml')
    news_details = []
    news_cards = soup.find_all("div", {'class': 'vc_col-sm-8 wpb_column vc_column_container'})

    for news_card in news_cards:
        # Only the card's first child is searched for articles; an empty card
        # has no first child, so skip it instead of raising IndexError.
        if not news_card.contents:
            continue

        for news in news_card.contents[0].find_all("article"):
            title_tag = news.find('h2')
            time_tag = news.find('time')
            span_tag = news.find('span')
            # find() returns None when the element is absent; such an article
            # cannot be turned into a complete record, so it is skipped.
            if title_tag is None or time_tag is None or span_tag is None:
                continue

            title_news = title_tag.text.strip()
            time_news = time_tag.text.strip()
            span_news = span_tag.text.strip()

            classification = "fake" if any(keyword in span_news for keyword in fake_keywords) else "real"

            # Predict topic using FastText model
            topic = predict_topic(title_news)

            news_details.append({
                "title": title_news,
                "time": time_news,
                "fake or real": classification,
                "topic": topic
            })

    return news_details

In [53]:
def predict_topic(text):
    """Return the model vocabulary entry closest to the mean vector of ``text``.

    The text is split on whitespace, the FastText vector of every token the
    model can represent is averaged, and the single nearest vocabulary key
    (as ranked by ``similar_by_vector``) is returned.

    Parameters
    ----------
    text : str
        Text to classify, e.g. a news title.

    Returns
    -------
    str or None
        The nearest vocabulary key, or ``None`` when ``text`` has no tokens
        or none of its tokens has a vector (previously these cases raised
        ``ZeroDivisionError`` or passed a scalar ``0`` to the similarity
        lookup).
    """
    tokens = text.split()
    # Empty or whitespace-only text: there is nothing to average.
    if not tokens:
        return None

    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return None

    # Average over the vectors actually summed, not over all tokens, so that
    # skipped tokens do not shrink the mean.
    text_vector = sum(vectors) / len(vectors)
    topic, _ = model.wv.similar_by_vector(text_vector, topn=1)[0]
    return topic

In [54]:
# Walk the listing pages in order and pool every scraped record into one list.
all_news_details = []
for url in urls:
    print(url)  # progress trace: the page about to be fetched
    all_news_details += scrape_page(url)

https://fatabyyano.net
https://fatabyyano.net/page/2/
https://fatabyyano.net/page/3/
https://fatabyyano.net/page/4/
https://fatabyyano.net/page/5/
https://fatabyyano.net/page/6/
https://fatabyyano.net/page/7/
https://fatabyyano.net/page/8/
https://fatabyyano.net/page/9/
https://fatabyyano.net/page/10/
https://fatabyyano.net/page/11/
https://fatabyyano.net/page/12/
https://fatabyyano.net/page/13/
https://fatabyyano.net/page/14/
https://fatabyyano.net/page/15/
https://fatabyyano.net/page/16/
https://fatabyyano.net/page/17/
https://fatabyyano.net/page/18/
https://fatabyyano.net/page/19/
https://fatabyyano.net/page/20/
https://fatabyyano.net/page/21/
https://fatabyyano.net/page/22/
https://fatabyyano.net/page/23/
https://fatabyyano.net/page/24/
https://fatabyyano.net/page/25/
https://fatabyyano.net/page/26/
https://fatabyyano.net/page/27/
https://fatabyyano.net/page/28/
https://fatabyyano.net/page/29/
https://fatabyyano.net/page/30/
https://fatabyyano.net/page/31/
https://fatabyyano.net/pa

In [55]:
# Column order follows the keys of the first scraped record.
keys = all_news_details[0].keys()

# newline='' hands line-ending control to the csv module (otherwise some
# platforms emit an extra blank line per row); the explicit UTF-8 encoding
# keeps non-ASCII text in the scraped fields independent of the platform's
# default encoding.
with open('/home/kawther/Documents/S2/NLP/new/BS/datasets/fatabyyano.csv', 'w',
          newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_news_details)

# Reported only after the file has been flushed and closed.
print("File created successfully.")

File created successfully.
