In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
import random
import os


In [2]:
def get_wikipedia_content(topic):
    """
    Fetch content from Wikipedia on a specific topic
    """
    url = f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Get all paragraphs from the main content
    paragraphs = soup.select('#mw-content-text p')
    
    # Extract and clean text
    cleaned_paragraphs = []
    for p in paragraphs:
        text = p.get_text()
        # Remove citations [1], [2], etc.
        text = re.sub(r'\[\d+\]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Filter out short paragraphs, empty ones, or those that are too long
        word_count = len(text.split())
        if 100 <= word_count <= 120:
            cleaned_paragraphs.append({
                'text': text,
                'word_count': word_count,
                'source': url,
                'topic': topic
            })
    
    return cleaned_paragraphs

In [3]:
def get_science_health_nutrition_topics():
    """
    Return a list of topics related to science, health and nutrition
    """
    science_topics = [
        'Biology', 'Chemistry', 'Physics', 'Astronomy', 'Neuroscience',
        'Quantum_mechanics', 'Genetics', 'Microbiology', 'Cell_biology',
        'Ecology', 'Evolution', 'Climate_change', 'Biotechnology',
        'Artificial_intelligence', 'Machine_learning', 'Computer_science',
        'Immunology', 'Geology', 'Oceanography', 'Mathematics'
    ]
    
    health_topics = [
        'Medicine', 'Public_health', 'Epidemiology', 'Mental_health',
        'Physical_fitness', 'Disease', 'Cancer', 'Diabetes', 'Heart_disease',
        'Vaccine', 'Pandemic', 'Preventive_healthcare', 'Surgery',
        'Telemedicine', 'Healthcare', 'Medical_research', 'Exercise',
        'Sleep', 'Stress_management', 'Wellness'
    ]
    
    nutrition_topics = [
        'Nutrition', 'Diet_(nutrition)', 'Mediterranean_diet', 'Vitamin',
        'Mineral_(nutrient)', 'Protein', 'Carbohydrate', 'Fat', 'Dietary_fiber',
        'Antioxidant', 'Food_group', 'Calorie', 'Malnutrition', 'Metabolic_health',
        'Vegetarianism', 'Veganism', 'Whole_food', 'Gut_microbiome', 'Food_science',
        'Nutritional_supplement'
    ]
    
    # Also include subcategories and related topics
    extended_topics = [
        'Molecular_biology', 'Physiology', 'Anatomy', 'Pharmacology',
        'Cardiovascular_system', 'Immune_system', 'Digestive_system',
        'Endocrine_system', 'Metabolism', 'Nutrigenomics', 'Plant-based_diet',
        'Functional_food', 'Probiotics', 'Intermittent_fasting', 'Ketogenic_diet',
        'Micronutrient', 'Macronutrient', 'Food_pyramid_(nutrition)',
        'Nutritional_epidemiology', 'Biochemistry', 'Organic_chemistry'
    ]

    # Additional Science Topics
    additional_science_topics = [
        'Astrophysics', 'Cosmology', 'String_theory', 'Particle_physics', 'Nuclear_physics',
        'Thermodynamics', 'Fluid_dynamics', 'Relativity', 'Optics', 'Acoustics',
        'Materials_science', 'Nanotechnology', 'Robotics', 'Cryptography', 'Quantum_computing',
        'Systems_biology', 'Computational_biology', 'Bioinformatics', 'Genomics', 'Proteomics',
        'Developmental_biology', 'Marine_biology', 'Zoology', 'Botany', 'Mycology',
        'Paleontology', 'Archaeology', 'Anthropology', 'Seismology', 'Mineralogy',
        'Plate_tectonics', 'Volcanology', 'Meteorology', 'Atmospheric_science', 'Environmental_science',
        'Renewable_energy', 'Nuclear_energy', 'Data_science', 'Network_theory', 'Graph_theory',
        'Statistics', 'Linear_algebra', 'Calculus', 'Number_theory', 'Topology',
        'Chaos_theory', 'Complexity_theory', 'Cybernetics', 'Systems_theory', 'Information_theory'
    ]

    # Additional Health Topics
    additional_health_topics = [
        'Neurology', 'Cardiology', 'Oncology', 'Pediatrics', 'Geriatrics',
        'Orthopedics', 'Dermatology', 'Ophthalmology', 'Psychiatry', 'Psychology',
        'Obstetrics', 'Gynecology', 'Urology', 'Nephrology', 'Endocrinology',
        'Gastroenterology', 'Pulmonology', 'Rheumatology', 'Hematology', 'Infectious_disease',
        'Autoimmune_disease', 'Allergy', 'Medical_imaging', 'Laboratory_medicine', 'Pathology',
        'Emergency_medicine', 'Intensive_care', 'Palliative_care', 'Rehabilitation_medicine', 'Sports_medicine',
        'Occupational_health', 'Environmental_health', 'Global_health', 'Health_policy', 'Health_economics',
        'Health_informatics', 'Telehealth', 'Personalized_medicine', 'Pharmacogenomics', 'Gene_therapy',
        'Stem_cell_therapy', 'Organ_transplantation', 'Artificial_organs', 'Prosthetics', 'Biomedical_engineering',
        'Health_psychology', 'Cognitive_behavioral_therapy', 'Mindfulness', 'Health_education', 'Health_literacy'
    ]

    # Additional Nutrition Topics
    additional_nutrition_topics = [
        'Nutrition_psychology', 'Dietary_patterns', 'Blue_zone_diet', 'DASH_diet', 'Low-carbohydrate_diet',
        'Gluten-free_diet', 'Food_allergy', 'Food_intolerance', 'Celiac_disease', 'Nutritional_deficiency',
        'Vitamin_D', 'Omega-3_fatty_acid', 'Essential_amino_acid', 'Branched-chain_amino_acid', 'Phytonutrient',
        'Polyphenol', 'Flavonoid', 'Carotenoid', 'Isoflavone', 'Lycopene',
        'Resveratrol', 'Curcumin', 'Fermented_food', 'Prebiotic', 'Dietary_supplement',
        'Sports_nutrition', 'Hydration', 'Electrolyte', 'Glycemic_index', 'Glycemic_load',
        'Insulin_resistance', 'Metabolic_syndrome', 'Nutritional_therapy', 'Nutrition_during_pregnancy', 'Pediatric_nutrition',
        'Geriatric_nutrition', 'Enteral_nutrition', 'Parenteral_nutrition', 'Food_safety', 'Food_processing',
        'Culinary_medicine', 'Food_as_medicine', 'Traditional_dietary_practices', 'Cultural_food_practices', 'Sustainable_nutrition'
    ]

    # Additional Extended Topics
    additional_extended_topics = [
        'Epigenetics', 'Circadian_rhythm', 'Chronobiology', 'Human_microbiome', 'Neuroplasticity',
        'Psychoneuroimmunology', 'Behavioral_medicine', 'Biofeedback', 'Neurogenesis', 'Synaptic_plasticity',
        'Regenerative_medicine', 'Precision_medicine', 'Digital_health', 'Health_wearables', 'Brain-computer_interface',
        'Medical_nanotechnology', 'Health_applications_of_AI', 'Virtual_reality_in_healthcare', 'Computational_medicine', 'Systems_medicine',
        'Network_medicine', 'Exposome', 'Social_determinants_of_health', 'Planetary_health', 'One_Health',
        'Zoonotic_disease', 'Antimicrobial_resistance', 'Biosecurity', 'Health_resilience', 'Disaster_medicine',
        'Trauma_medicine', 'Addiction_medicine', 'Pain_management', 'Sleep_medicine', 'Autonomic_nervous_system',
        'Neuroendocrinology', 'Psychosomatic_medicine', 'Nutritional_genomics', 'Nutriepigenomics', 'Medicinal_chemistry',
        'Pharmaceutical_biotechnology', 'Drug_delivery_systems', 'Nanomedicine', 'Translational_research', 'Health_services_research'
    ]
    
    # Combine all topics
    all_topics = science_topics + health_topics + nutrition_topics + extended_topics
    all_additional_topics = additional_science_topics + additional_health_topics + additional_nutrition_topics + additional_extended_topics
    return all_additional_topics

In [4]:
def main():
    topics = get_science_health_nutrition_topics()
    all_paragraphs = []
    target_count = 500
    
    # Shuffle topics to get diverse content
    random.shuffle(topics)
    
    print(f"Starting to collect paragraphs from {len(topics)} topics...")
    
    for topic in topics:
        if len(all_paragraphs) >= target_count:
            break
            
        print(f"Fetching content for: {topic}")
        paragraphs = get_wikipedia_content(topic)
        print(f"Found {len(paragraphs)} suitable paragraphs (100-120 words)")
        
        all_paragraphs.extend(paragraphs)
        
        # Respect Wikipedia's servers by adding a delay
        time.sleep(2)
        
        # Status update
        print(f"Total paragraphs collected so far: {len(all_paragraphs)}/{target_count}")
    
    # Create dataset
    df = pd.DataFrame(all_paragraphs)
    
    filename = 'wikipedia_dataset.csv'
    file_exists = os.path.isfile(filename)
    
    if file_exists:
        # If file exists, append without writing the header
        print(f"Appending {len(df)} entries to existing file {filename}")
        df.to_csv(filename, mode='a', header=False, index=False)
    else:
        # If file doesn't exist, create it with header
        print(f"Creating new file {filename} with {len(df)} entries")
        df.to_csv(filename, index=False)
    
    # Show sample statistics
    print("\nWord count statistics:")
    print(f"Average: {df['word_count'].mean():.2f} words")
    print(f"Min: {df['word_count'].min()} words")
    print(f"Max: {df['word_count'].max()} words")
    
    # Distribution of topics
    print("\nTopic distribution:")
    topic_counts = df['topic'].value_counts()
    for topic, count in topic_counts.items():
        print(f"{topic}: {count} paragraphs")

In [5]:
if __name__ == "__main__":
    main()

Starting to collect paragraphs from 190 topics...
Fetching content for: Nanotechnology
Found 5 suitable paragraphs (100-120 words)
Total paragraphs collected so far: 5/500
Fetching content for: Food_safety
Found 7 suitable paragraphs (100-120 words)
Total paragraphs collected so far: 12/500
Fetching content for: Intensive_care
Found 2 suitable paragraphs (100-120 words)
Total paragraphs collected so far: 14/500
Fetching content for: Branched-chain_amino_acid
Found 0 suitable paragraphs (100-120 words)
Total paragraphs collected so far: 14/500
Fetching content for: Dermatology
Found 2 suitable paragraphs (100-120 words)
Total paragraphs collected so far: 16/500
Fetching content for: Medical_imaging
Found 9 suitable paragraphs (100-120 words)
Total paragraphs collected so far: 25/500
Fetching content for: Pediatric_nutrition
Failed to fetch https://en.wikipedia.org/wiki/Pediatric_nutrition
Found 0 suitable paragraphs (100-120 words)
Total paragraphs collected so far: 25/500
Fetching cont