In [1]:
import requests
import textstat # To compute readability scores (Flesch-Kincaid Grade Level)


In [6]:
def fetch_category_articles(category, limit=500):
    """
    Fetch the members of a Wikipedia category through the MediaWiki API.

    Args:
        category: Category name without the "Category:" prefix.
        limit: Value sent as ``cmlimit`` (maximum members per request).

    Returns:
        A list of strings formatted as "Page ID: <id>, Title: <title>".
        An empty list is returned (after printing a message) when the
        request fails, the server answers with a non-200 status, or the
        reply carries no member list.
    """
    # Let requests build the query string so that category names containing
    # spaces, "&" or other reserved characters are encoded correctly.
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": limit,
        "format": "json",
    }
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(
            "https://en.wikipedia.org/w/api.php", params=params, timeout=30
        )
    except requests.RequestException as exc:
        print(f"Failed to fetch pages for category {category}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch pages for category {category}: {response.status_code}")
        return []

    data = response.json()
    # A 200 reply without "query" (e.g. an API-level "error" object) must not
    # raise KeyError; report it like any other failed fetch.
    pages = data.get("query", {}).get("categorymembers")
    if pages is None:
        reason = data.get("error", "unexpected response")
        print(f"Failed to fetch pages for category {category}: {reason}")
        return []

    return [f"Page ID: {page['pageid']}, Title: {page['title']}" for page in pages]
def fetch_page_content(pageid):
    """
    Fetch the plain-text extract of a Wikipedia page by its page ID.

    Args:
        pageid: Page ID of the article (sent as ``pageids``).

    Returns:
        The page's plain-text extract; "No content available" when the reply
        holds no extract for this page; or a string starting with
        "Failed to fetch content: " when the request itself fails.
    """
    params = {
        "action": "query",
        "pageids": pageid,
        "prop": "extracts",
        "explaintext": 1,  # flag parameter: request plain text instead of HTML
        "format": "json",
    }
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(
            "https://en.wikipedia.org/w/api.php", params=params, timeout=30
        )
    except requests.RequestException as exc:
        return f"Failed to fetch content: {exc}"

    if response.status_code != 200:
        return f"Failed to fetch content: {response.status_code}"

    data = response.json()
    # Walk the reply defensively: a 200 answer without "query"/"pages"/<id>
    # is treated as "no content" instead of raising KeyError.
    page = data.get("query", {}).get("pages", {}).get(str(pageid), {})
    return page.get("extract", "No content available")
# Wikipedia categories whose members are listed below
categories = [
    "Medicine",
    "Medical_specialties",
    "Health_sciences",
    "Human_anatomy",
    "Natural_sciences",
    "Social_sciences",
    "Psychology",
]
# Print every member of each category in turn
for category in categories:
    print(f"\nFetching articles for category: {category}")
    articles = fetch_category_articles(category)
    if not articles:  # nothing came back for this category
        print(f"No articles found for category: {category}")
        continue
    for article in articles:
        print(article)



Fetching articles for category: Medicine
Page ID: 18957, Title: Medicine
Page ID: 14471564, Title: Outline of medicine
Page ID: 60606212, Title: Terminology of alternative medicine
Page ID: 23607241, Title: Anti-aging medicine
Page ID: 75840465, Title: Anti-asthmatic agent
Page ID: 73402513, Title: Breastmilk medicine
Page ID: 78136945, Title: Cancer exodus hypothesis
Page ID: 37019670, Title: Clinical handover
Page ID: 76454282, Title: Confocal endoscopy
Page ID: 75164769, Title: Diabetes self-management
Page ID: 76439689, Title: Dorsal pancreatic agenesis
Page ID: 78013627, Title: Drone-Enhanced Emergency Medical Services
Page ID: 1966031, Title: Isotropic bands
Page ID: 77353600, Title: LAMA2 related congenital muscular dystrophy
Page ID: 288156, Title: List of forms of alternative medicine
Page ID: 77815981, Title: LY-2365109
Page ID: 1173670, Title: Pediatric endocrinology
Page ID: 78237247, Title: Poison exon
Page ID: 77333746, Title: RNU4-2 syndrome
Page ID: 76556199, Title: Ur

### Save each article to a CSV file


In [7]:
import os
import csv
import requests

def fetch_category_articles(category, limit=500):
    """
    Fetch the members of a Wikipedia category through the MediaWiki API.

    Args:
        category: Category name without the "Category:" prefix.
        limit: Value sent as ``cmlimit`` (maximum members per request).

    Returns:
        A list of dicts, each with the keys 'pageid' and 'title'.
        An empty list is returned (after printing a message) when the
        request fails, the server answers with a non-200 status, or the
        reply carries no member list.
    """
    # Let requests build the query string so that category names containing
    # spaces, "&" or other reserved characters are encoded correctly.
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": limit,
        "format": "json",
    }
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(
            "https://en.wikipedia.org/w/api.php", params=params, timeout=30
        )
    except requests.RequestException as exc:
        print(f"Failed to fetch pages for category {category}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch pages for category {category}: {response.status_code}")
        return []

    data = response.json()
    # A 200 reply without "query" (e.g. an API-level "error" object) must not
    # raise KeyError; report it like any other failed fetch.
    pages = data.get("query", {}).get("categorymembers")
    if pages is None:
        reason = data.get("error", "unexpected response")
        print(f"Failed to fetch pages for category {category}: {reason}")
        return []

    return [{'pageid': page['pageid'], 'title': page['title']} for page in pages]

def fetch_page_content(pageid):
    """
    Fetch the plain-text extract of a Wikipedia page by its page ID.

    Args:
        pageid: Page ID of the article (sent as ``pageids``).

    Returns:
        The page's plain-text extract; "No content available" when the reply
        holds no extract for this page; or a string starting with
        "Failed to fetch content: " when the request itself fails.
    """
    params = {
        "action": "query",
        "pageids": pageid,
        "prop": "extracts",
        "explaintext": 1,  # flag parameter: request plain text instead of HTML
        "format": "json",
    }
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(
            "https://en.wikipedia.org/w/api.php", params=params, timeout=30
        )
    except requests.RequestException as exc:
        return f"Failed to fetch content: {exc}"

    if response.status_code != 200:
        return f"Failed to fetch content: {response.status_code}"

    data = response.json()
    # Walk the reply defensively: a 200 answer without "query"/"pages"/<id>
    # is treated as "no content" instead of raising KeyError.
    page = data.get("query", {}).get("pages", {}).get(str(pageid), {})
    return page.get("extract", "No content available")

def sanitize_filename(title):
    """
    Turn an article title into a string that is safe to use as a filename.

    Each of the characters / \\ : * ? " < > | is replaced by an underscore;
    every other character is returned unchanged.
    """
    forbidden = '/\\:*?"<>|'
    # One translation table maps every forbidden character to '_'
    replacements = str.maketrans(forbidden, '_' * len(forbidden))
    return title.translate(replacements)

def save_article_to_csv(article, folder="data/original_data"):
    """
    Write one article to its own CSV file inside ``folder``.

    Args:
        article: Mapping with the keys 'title' and 'content'.
        folder: Target directory; created (including parents) if missing.

    The file is named after the sanitized title plus ".csv" and holds a
    header row ("Title", "Content") followed by one data row. An existing
    file with the same name is overwritten.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs(folder, exist_ok=True)

    # Build the target path from the sanitized title
    file_path = os.path.join(folder, sanitize_filename(article['title']) + ".csv")

    # newline='' leaves line-ending handling to the csv module
    with open(file_path, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Content"])
        writer.writerow([article['title'], article['content']])

# Wikipedia categories whose articles are downloaded
categories = [
    "Medicine",
    "Medical_specialties",
    "Health_sciences",
    "Human_anatomy",
    "Natural_sciences",
    "Social_sciences",
    "Psychology",
]

# Download every member article of each category and store it as a CSV file
for category in categories:
    print(f"\nFetching articles for category: {category}")
    articles = fetch_category_articles(category)

    if not articles:  # nothing came back for this category
        print(f"No articles found for category: {category}")
        continue

    for article in articles:
        print(f"Fetching content for article: {article['title']}")
        content = fetch_page_content(article['pageid'])
        article_data = {
            'title': article['title'],
            'content': content,
        }
        save_article_to_csv(article_data, folder="data/original_data")
        print(f"Article '{article['title']}' saved to CSV.")




Fetching articles for category: Medicine
Fetching content for article: Medicine
Article 'Medicine' saved to CSV.
Fetching content for article: Outline of medicine
Article 'Outline of medicine' saved to CSV.
Fetching content for article: Terminology of alternative medicine
Article 'Terminology of alternative medicine' saved to CSV.
Fetching content for article: Anti-aging medicine
Article 'Anti-aging medicine' saved to CSV.
Fetching content for article: Anti-asthmatic agent
Article 'Anti-asthmatic agent' saved to CSV.
Fetching content for article: Breastmilk medicine
Article 'Breastmilk medicine' saved to CSV.
Fetching content for article: Cancer exodus hypothesis
Article 'Cancer exodus hypothesis' saved to CSV.
Fetching content for article: Clinical handover
Article 'Clinical handover' saved to CSV.
Fetching content for article: Confocal endoscopy
Article 'Confocal endoscopy' saved to CSV.
Fetching content for article: Diabetes self-management
Article 'Diabetes self-management' saved 

In [None]:
import os
import csv
import requests

def fetch_category_articles(category, limit=500):
    """
    List the pages that belong to a Wikipedia category (MediaWiki API).

    Args:
        category: Category name without the "Category:" prefix.
        limit: Value sent as ``cmlimit`` (maximum members per request).

    Returns:
        A list of dicts, each with the keys 'pageid' and 'title'.
        An empty list is returned (after printing a message) when the
        request fails, the server answers with a non-200 status, or the
        reply carries no member list.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    # Passing the query as a dict makes requests percent-encode the category
    # name, so names with spaces, "&" or similar characters stay intact.
    query = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": limit,
        "format": "json",
    }
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(api_url, params=query, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch pages for category {category}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch pages for category {category}: {response.status_code}")
        return []

    data = response.json()
    # A 200 reply without "query" (e.g. an API-level "error" object) must not
    # raise KeyError; report it like any other failed fetch.
    members = data.get("query", {}).get("categorymembers")
    if members is None:
        reason = data.get("error", "unexpected response")
        print(f"Failed to fetch pages for category {category}: {reason}")
        return []

    return [{'pageid': member['pageid'], 'title': member['title']} for member in members]

def fetch_page_content(pageid):
    """
    Retrieve a Wikipedia page's plain-text extract given its page ID.

    Args:
        pageid: Page ID of the article (sent as ``pageids``).

    Returns:
        The page's plain-text extract; "No content available" when the reply
        holds no extract for this page; or a string starting with
        "Failed to fetch content: " when the request itself fails.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    query = {
        "action": "query",
        "pageids": pageid,
        "prop": "extracts",
        "explaintext": 1,  # flag parameter: request plain text instead of HTML
        "format": "json",
    }
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(api_url, params=query, timeout=30)
    except requests.RequestException as exc:
        return f"Failed to fetch content: {exc}"

    if response.status_code != 200:
        return f"Failed to fetch content: {response.status_code}"

    data = response.json()
    # Walk the reply defensively: a 200 answer without "query"/"pages"/<id>
    # is treated as "no content" instead of raising KeyError.
    pages = data.get("query", {}).get("pages", {})
    return pages.get(str(pageid), {}).get("extract", "No content available")

def sanitize_filename(title):
    """
    Make an article title usable as a filename.

    The characters / \\ : * ? " < > | are each swapped for an underscore;
    all other characters pass through untouched.
    """
    unsafe = set('/\\:*?"<>|')
    return ''.join('_' if ch in unsafe else ch for ch in title)

def save_article_to_csv(article, folder="data"):
    """
    Store a single article as a two-row CSV file in ``folder``.

    Args:
        article: Mapping with the keys 'title' and 'content'.
        folder: Target directory; created (including parents) if missing.

    The file is named after the sanitized title plus ".csv" and holds a
    header row ("Title", "Content") followed by one data row. An existing
    file with the same name is overwritten.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs(folder, exist_ok=True)

    # Build the target path from the sanitized title
    filename = sanitize_filename(article['title']) + ".csv"
    file_path = os.path.join(folder, filename)

    # newline='' leaves line-ending handling to the csv module
    with open(file_path, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Content"])
        writer.writerow([article['title'], article['content']])

# Wikipedia categories whose articles are downloaded
categories = [
    "Medicine",
    "Medical_specialties",
    "Health_sciences",
    "Human_anatomy",
    "Natural_sciences",
    "Social_sciences",
    "Psychology",
]

# Download every member article of each category and store it as a CSV file
for category in categories:
    print(f"\nFetching articles for category: {category}")
    articles = fetch_category_articles(category)

    if not articles:  # nothing came back for this category
        print(f"No articles found for category: {category}")
        continue

    for article in articles:
        print(f"Fetching content for article: {article['title']}")
        content = fetch_page_content(article['pageid'])
        article_data = {
            'title': article['title'],
            'content': content,
        }
        save_article_to_csv(article_data, folder="data")
        print(f"Article '{article['title']}' saved to CSV.")



# 1. Text Preprocessing
- Remove irrelevant sections such as references and external links





In [2]:
# Extract Relationships from Text Using Dependency Parsing (for GNNs)



In [3]:
# Generate Logical Multiple-Choice Questions Using BERT


In [4]:

# Create MCQs (Multiple-Choice Questions)


In [5]:

# Save Questions and MCQs to CSV


In [None]:






 # To compute readability scores (Flesch-Kincaid Grade Level)

def summarize_and_categorize(content):
    """
    Build a short summary of ``content`` and label its reading difficulty.

    Args:
        content: Article text.

    Returns:
        A ``(summary, category)`` tuple.
        ``category`` is 'Easy' for a Flesch-Kincaid grade level of 6 or
        below, 'Medium' for a level above 6 up to 12, and 'Hard' otherwise.
        ``summary`` is the first three sentences when the text has more than
        three, otherwise the text itself, unchanged.
    """
    # Readability score (Flesch-Kincaid Grade Level)
    grade_level = textstat.flesch_kincaid_grade(content)
    if grade_level <= 6:
        category = 'Easy'
    elif grade_level <= 12:  # the first branch already excluded values <= 6
        category = 'Medium'
    else:
        category = 'Hard'

    # Naive sentence split on '.'. Each piece is stripped and empty pieces are
    # dropped so that (a) re-joining with '. ' does not double the spaces and
    # (b) a trailing period does not count as an extra, empty sentence.
    sentences = [piece.strip() for piece in content.split('.')]
    sentences = [piece for piece in sentences if piece]

    if len(sentences) > 3:
        summary = '. '.join(sentences[:3]) + '.'
    else:
        summary = content
    return summary, category
# Wikipedia categories whose articles are summarised
categories = [
    "Medicine",
    "Medical_specialties",
    "Health_sciences",
    "Human_anatomy",
    "Natural_sciences",
    "Social_sciences",
    "Psychology",
]
# For every member article of each category: fetch, summarise, rate, print
for category in categories:
    print(f"\nFetching articles for category: {category}")
    articles = fetch_category_articles(category)
    if not articles:  # nothing came back for this category
        print(f"No articles found for category: {category}")
        continue
    for article in articles:
        title, page_id = article['title'], article['pageid']
        print(f"\nProcessing article: {title}")
        content = fetch_page_content(page_id)
        summary, difficulty = summarize_and_categorize(content)
        # Report the result for this article
        print(f"Summary: {summary}")
        print(f"Difficulty Level: {difficulty}")