In [5]:
import requests
import textstat # To compute readability scores (Flesch-Kincaid Grade Level)


In [6]:
def fetch_category_articles(category, limit=500):
    """
    Fetch articles from a specific Wikipedia category using the API.
    """
    url = f"https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:{category}&cmlimit={limit}&format=json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        # Extract page IDs and titles
        pages = data['query']['categorymembers']
        # Collect the page details in a list
        page_info = []
        for page in pages:
            page_info.append(f"Page ID: {page['pageid']}, Title: {page['title']}")
        return page_info  # Return the list of page details
    else:
        print(f"Failed to fetch pages for category {category}: {response.status_code}")
        return []
def fetch_page_content(pageid):
    """
    Fetch content of a Wikipedia page using the pageid.
    """
    url = f"https://en.wikipedia.org/w/api.php?action=query&pageids={pageid}&prop=extracts&explaintext&format=json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        page = data['query']['pages'][str(pageid)]
        return page.get('extract', "No content available")
    else:
        return f"Failed to fetch content: {response.status_code}"
# Categories to fetch
categories = ["Medicine", "Medical_specialties", "Health_sciences", "Human_anatomy", "Natural_sciences", "Social_sciences", "Psychology"]
# Loop through each category and fetch article details
for category in categories:
    print(f"\nFetching articles for category: {category}")
    articles = fetch_category_articles(category)
    if articles:  # Only proceed if articles were fetched successfully
        for article in articles:
            print(article)
    else:
        print(f"No articles found for category: {category}")



Fetching articles for category: Medicine
Page ID: 18957, Title: Medicine
Page ID: 14471564, Title: Outline of medicine
Page ID: 60606212, Title: Terminology of alternative medicine
Page ID: 23607241, Title: Anti-aging medicine
Page ID: 75840465, Title: Anti-asthmatic agent
Page ID: 73402513, Title: Breastmilk medicine
Page ID: 78136945, Title: Cancer exodus hypothesis
Page ID: 37019670, Title: Clinical handover
Page ID: 76454282, Title: Confocal endoscopy
Page ID: 75164769, Title: Diabetes self-management
Page ID: 76439689, Title: Dorsal pancreatic agenesis
Page ID: 78013627, Title: Drone-Enhanced Emergency Medical Services
Page ID: 1966031, Title: Isotropic bands
Page ID: 77353600, Title: LAMA2 related congenital muscular dystrophy
Page ID: 288156, Title: List of forms of alternative medicine
Page ID: 77815981, Title: LY-2365109
Page ID: 1173670, Title: Pediatric endocrinology
Page ID: 78237247, Title: Poison exon
Page ID: 77333746, Title: RNU4-2 syndrome
Page ID: 76556199, Title: Ur

### save in csv file


In [8]:
import os
import csv
import requests

def sanitize_filename(title):
    """
    Sanitize the article title to create a valid filename.
    """
    invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']
    for char in invalid_chars:
        title = title.replace(char, '_')
    return title

def save_article_to_csv(article, folder="data/original_data"):
    """
    Save a single article's title and content to a CSV file.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
    
    filename = sanitize_filename(article['title']) + ".csv"
    file_path = os.path.join(folder, filename)
    
    with open(file_path, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Content"])  
        writer.writerow([article['title'], article['content']])  # Write title and content

def fetch_category_articles(category, limit=500):
    """
    Fetch articles from a specific Wikipedia category using the API.
    """
    url = f"https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:{category}&cmlimit={limit}&format=json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        pages = data['query']['categorymembers']
        page_info = []
        for page in pages:
            page_info.append({'pageid': page['pageid'], 'title': page['title']})
        return page_info  # Return a list of dictionaries with 'pageid' and 'title'
    else:
        print(f"Failed to fetch pages for category {category}: {response.status_code}")
        return []

def fetch_page_content(pageid):
    """
    Fetch content of a Wikipedia page using the pageid.
    """
    url = f"https://en.wikipedia.org/w/api.php?action=query&pageids={pageid}&prop=extracts&explaintext&format=json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        page = data['query']['pages'][str(pageid)]
        return page.get('extract', "No content available")
    else:
        return f"Failed to fetch content: {response.status_code}"

# Categories to fetch
categories = ["Medicine", "Medical_specialties", "Health_sciences", "Human_anatomy", "Natural_sciences", "Social_sciences", "Psychology"]

# Loop through each category and fetch article details
for category in categories:
    print(f"\nFetching articles for category: {category}")
    articles = fetch_category_articles(category)
    
    if articles:  # Only proceed if articles were fetched successfully
        for article in articles:
            print(f"Fetching content for article: {article['title']}")
            content = fetch_page_content(article['pageid'])
            article_data = {'title': article['title'], 'content': content}
            save_article_to_csv(article_data, folder="data/original_data")
            print(f"Article '{article['title']}' saved to CSV.")
    else:
        print(f"No articles found for category: {category}")



Fetching articles for category: Medicine
Fetching content for article: Medicine
Article 'Medicine' saved to CSV.
Fetching content for article: Outline of medicine
Article 'Outline of medicine' saved to CSV.
Fetching content for article: Terminology of alternative medicine
Article 'Terminology of alternative medicine' saved to CSV.
Fetching content for article: Anti-aging medicine
Article 'Anti-aging medicine' saved to CSV.
Fetching content for article: Anti-asthmatic agent
Article 'Anti-asthmatic agent' saved to CSV.
Fetching content for article: Breastmilk medicine
Article 'Breastmilk medicine' saved to CSV.
Fetching content for article: Cancer exodus hypothesis
Article 'Cancer exodus hypothesis' saved to CSV.
Fetching content for article: Clinical handover
Article 'Clinical handover' saved to CSV.
Fetching content for article: Confocal endoscopy
Article 'Confocal endoscopy' saved to CSV.
Fetching content for article: Diabetes self-management
Article 'Diabetes self-management' saved 

# 1. Text Preprocessing
- remove irrelevant section like refrence, external links





In [12]:
import os
import pandas as pd
import re

def clean_content(content):
    """
    Function to clean unwanted sections from the scraped Wikipedia content.
    """
    if not isinstance(content, str):
        return ""  # Return an empty string if content is not a string
    
    # Remove unwanted sections like 'See also', 'References', and 'External links'
    unwanted_patterns = [
        r"== See also ==.*",  # Remove 'See also' sections
        r"== References ==.*",  # Remove 'References' sections
        r"== External links ==.*",  # Remove 'External links' sections
        r"Media related to .* at Wikimedia Commons",  # Remove media-related phrases
        r"== Additional images ==.*", # Remove additional images
        r"== Further reading ==.*",  # Remove 'Further reading' sections
        r"== Notes ==.*",  # Remove 'Notes' sections
        r"== Bibliography ==.*",  # Remove 'Bibliography' sections
        r"== Citations ==.*"  # Remove 'Citations' sections
    ]

    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content, flags=re.DOTALL)
    
    # Additional cleaning: Remove excessive whitespace, line breaks, and non-alphanumeric characters
    content = re.sub(r'\s+', ' ', content)  # Replace multiple whitespaces with a single space
    content = re.sub(r'[^\w\s]', '', content)  # Remove non-alphanumeric characters (except spaces)
    
    # Return cleaned content
    return content.strip()

def clean_csv(input_file, output_file):
    """
    Read a CSV file, clean its content, and save the cleaned data to a new CSV file.
    """
    # Read the CSV file
    df = pd.read_csv(input_file)
    
    # Check if the 'Content' column exists in the DataFrame
    if 'Content' not in df.columns:
        print(f"No 'Content' column found in {input_file}. Skipping this file.")
        return
    
    # Clean the 'Content' column
    df['Content'] = df['Content'].apply(clean_content)
    
    # Optionally, remove rows where content is empty after cleaning
    df = df[df['Content'].str.strip() != '']
    
    # Save the cleaned data to a new CSV
    df.to_csv(output_file, index=False)
    print(f"Cleaned CSV saved to: {output_file}")

def clean_all_csvs(input_folder, output_folder):
    """
    Clean all CSV files in the input folder and save cleaned files to the output folder.
    """
    # Check if output folder exists, if not create it
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate over all CSV files in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith('.csv'):
            input_file = os.path.join(input_folder, file_name)
            output_file = os.path.join(output_folder, file_name)
            
            # Clean the CSV file
            clean_csv(input_file, output_file)

# Define the input and output folders
input_folder = 'data/original_data'
output_folder = 'data/clean_data'

# Clean all CSV files in the original_data folder
clean_all_csvs(input_folder, output_folder)


Cleaned CSV saved to: data/clean_data\Abdomen.csv
Cleaned CSV saved to: data/clean_data\Achilles tendon.csv
Cleaned CSV saved to: data/clean_data\Adductor hiatus.csv
Cleaned CSV saved to: data/clean_data\Administrative history.csv
Cleaned CSV saved to: data/clean_data\Allergist.csv
Cleaned CSV saved to: data/clean_data\Alveolus.csv
Cleaned CSV saved to: data/clean_data\Amniotic sac.csv
Cleaned CSV saved to: data/clean_data\Anatomical terminology.csv
Cleaned CSV saved to: data/clean_data\Anatomical terms of bone.csv
Cleaned CSV saved to: data/clean_data\Anatomical terms of muscle.csv
Cleaned CSV saved to: data/clean_data\Anatomical terms of neuroanatomy.csv
Cleaned CSV saved to: data/clean_data\Anatomy Charts of the Arabs.csv
Cleaned CSV saved to: data/clean_data\Anatomy of human.csv
Cleaned CSV saved to: data/clean_data\Animal studies.csv
Cleaned CSV saved to: data/clean_data\Anorectal canal.csv
Cleaned CSV saved to: data/clean_data\Anthropocene Working Group.csv
Cleaned CSV saved to: 

In [13]:
import os
import pandas as pd
import shutil

def remove_empty_csvs_and_copy(input_folder, output_folder):
    """
    Remove empty CSV files (with only Title and Content columns and no data) from the input folder
    and copy the valid ones to the output folder.
    """
    # Check if output folder exists, if not create it
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate over all CSV files in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith('.csv'):
            input_file = os.path.join(input_folder, file_name)
            df = pd.read_csv(input_file)

            # Check if the CSV file contains only the 'Title' and 'Content' columns and has no data
            if df.empty or df['Content'].isnull().all() or df['Content'].str.strip().eq('').all():
                print(f"Deleting empty file: {input_file}")
                os.remove(input_file)  # Remove the file if it's empty or has no data
            else:
                # Copy the valid file to the output folder
                output_file = os.path.join(output_folder, file_name)
                shutil.copy(input_file, output_file)  # Copy instead of move
                print(f"Copied valid file to: {output_file}")

# Define the input and output folders
input_folder = 'data/clean_data'
output_folder = 'data/content_data'

# Remove empty CSVs and copy valid ones to the content_data folder
remove_empty_csvs_and_copy(input_folder, output_folder)


Copied valid file to: data/content_data\Abdomen.csv
Copied valid file to: data/content_data\Achilles tendon.csv
Copied valid file to: data/content_data\Adductor hiatus.csv
Copied valid file to: data/content_data\Administrative history.csv
Copied valid file to: data/content_data\Allergist.csv
Copied valid file to: data/content_data\Alveolus.csv
Copied valid file to: data/content_data\Amniotic sac.csv
Copied valid file to: data/content_data\Anatomical terminology.csv
Copied valid file to: data/content_data\Anatomical terms of bone.csv
Copied valid file to: data/content_data\Anatomical terms of muscle.csv
Copied valid file to: data/content_data\Anatomical terms of neuroanatomy.csv
Copied valid file to: data/content_data\Anatomy Charts of the Arabs.csv
Deleting empty file: data/clean_data\Anatomy of human.csv
Copied valid file to: data/content_data\Animal studies.csv
Copied valid file to: data/content_data\Anorectal canal.csv
Copied valid file to: data/content_data\Anthropocene Working Gro

# Summarize the content
To handle long text, we can:

- Split the content into smaller chunks.
- Summarize each chunk individually.
- Combine the summaries for each chunk into one final summary.

In [30]:
import os
import pandas as pd
from transformers import pipeline

# Initialize the Hugging Face summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text_into_chunks(text, max_tokens=512):
    """
    Splits the text into chunks that fit within the token limit.
    """
    # Tokenize the text and split into chunks
    words = text.split()
    chunks = []
    current_chunk = []

    # Add words until we reach the max token limit
    for word in words:
        current_chunk.append(word)
        if len(' '.join(current_chunk)) > max_tokens:
            chunks.append(' '.join(current_chunk[:-1]))  # Add the chunk without the last word
            current_chunk = [current_chunk[-1]]  # Start a new chunk with the last word

    if current_chunk:  # Add any remaining chunk
        chunks.append(' '.join(current_chunk))

    return chunks



In [31]:
def summarize_text(text):
    """
    Summarizes the given text by splitting it into smaller chunks if necessary.
    Dynamically adjusts the max_length based on the length of the input.
    """
    # Calculate the length of the input text in terms of tokens/words
    input_length = len(text.split())
    
    # Set max_length to be at most 50% of input length, but ensure it's not too large
    max_summary_length = min(150, max(50, input_length // 2))  # Prevents too long summaries

    # If the text is too long (more than 512 words), split it into smaller chunks
    if input_length > 512:
        chunks = split_text_into_chunks(text)
        summaries = []
        for chunk in chunks:
            summary = summarizer(chunk, max_length=max_summary_length, min_length=50, do_sample=False)[0]['summary_text']
            summaries.append(summary)
        return ' '.join(summaries)  # Combine all chunk summaries
    else:
        # If text is short enough, summarize it directly
        return summarizer(text, max_length=max_summary_length, min_length=50, do_sample=False)[0]['summary_text']


In [None]:
def summarize_csv(input_folder, output_folder):
    """
    Read CSV files, apply summarization to the content, and save the summarized data to new CSV files.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate through all CSV files in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith('.csv'):
            input_file = os.path.join(input_folder, file_name)
            df = pd.read_csv(input_file)

            # Check if the 'Content' column exists
            if 'Content' in df.columns:
                # Apply summarization to each content entry
                df['Content'] = df['Content'].apply(summarize_text)
                
                # Save the summarized data to a new CSV
                output_file = os.path.join(output_folder, f"summarized_{file_name}")
                df.to_csv(output_file, index=False)
                print(f"Summarized CSV saved to: {output_file}")
            else:
                print(f"No 'Content' column in {input_file}. Skipping.")

# Define input and output folders
input_folder = 'data/content_data'  # Folder with content_data
output_folder = 'data/summarized_data'  # Folder where summarized data will be stored

# Summarize all content in the CSV files
summarize_csv(input_folder, output_folder)


Your max_length is set to 150, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


Your max_length is set to 150, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 150, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 150, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 150, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)
Y

Summarized CSV saved to: data/summarized_data\summarized_Abdomen.csv


Your max_length is set to 150, but your input_length is only 107. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 150, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 150, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 150, but your input_length is only 110. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Y

Summarized CSV saved to: data/summarized_data\summarized_Achilles tendon.csv
Summarized CSV saved to: data/summarized_data\summarized_Adductor hiatus.csv
Summarized CSV saved to: data/summarized_data\summarized_Administrative history.csv
Summarized CSV saved to: data/summarized_data\summarized_Allergist.csv


Your max_length is set to 150, but your input_length is only 114. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


Summarized CSV saved to: data/summarized_data\summarized_Alveolus.csv


Your max_length is set to 150, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Your max_length is set to 150, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 150, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 150, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Summarized CSV saved to: data/summarized_data\summarized_Amniotic sac.csv


Your max_length is set to 150, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 150, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 150, but your input_length is only 110. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Your max_length is set to 150, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Yo

In [None]:
# Generate Logical Multiple-Choice Questions Using BERT


In [None]:

# Create MCQs (Multiple-Choice Questions)


In [None]:

# Save Questions and MCQs to CSV


In [None]:






 # To compute readability scores (Flesch-Kincaid Grade Level)

def summarize_and_categorize(content):
    """
    Summarize the article content and categorize based on readability.
    """
    # Get the readability score (Flesch-Kincaid Grade Level)
    grade_level = textstat.flesch_kincaid_grade(content)
    # Simplified categorization based on the Flesch-Kincaid Grade Level
    if grade_level <= 6:
        category = 'Easy'
    elif 6 < grade_level <= 12:
        category = 'Medium'
    else:
        category = 'Hard'
    # Simplified summary: just take the first 3 sentences as a basic summary
    sentences = content.split('.')
    summary = '. '.join(sentences[:3]) + '.' if len(sentences) > 3 else content
    return summary, category
# Categories to fetch
categories = ["Medicine", "Medical_specialties", "Health_sciences", "Human_anatomy", "Natural_sciences", "Social_sciences", "Psychology"]
# Loop through each category and fetch article details
for category in categories:
    print(f"\nFetching articles for category: {category}")
    articles = fetch_category_articles(category)
    if articles:  # Only proceed if articles were fetched successfully
        for article in articles:
            title = article['title']
            page_id = article['pageid']
            print(f"\nProcessing article: {title}")
            content = fetch_page_content(page_id)
            summary, difficulty = summarize_and_categorize(content)
            # Output the result
            print(f"Summary: {summary}")
            print(f"Difficulty Level: {difficulty}")
    else:
        print(f"No articles found for category: {category}")