In [None]:
!pip install wikipedia

import wikipedia
import re
import random
import time




In [None]:
# make sure text has only alphanumeric data
def remove_extra_characters(text):
    text = re.sub(r'[^a-zA-Z0-9 ]+', '', text)
    return text

In [None]:
# find count of keywords match
def relevance_score(text, keywords):
    score = sum(text.lower().count(keyword.lower()) for keyword in keywords)
    return score

In [None]:
# scrape a single Wikipedia page
def scrape_wikipedia_pages(title, topic, keywords):
    try:
        content = wikipedia.page(title, auto_suggest=False)
        summary = remove_extra_characters(content.summary)
        revision_id = content.revision_id
        title = content.title
        url = content.url

        document = {
            "revision_id": revision_id,
            "title": title,
            "url": url,
            "summary": summary,
            "topic": topic
        }

        score = relevance_score(summary, keywords) + relevance_score(title, keywords)
        document["score"] = score

        if len(summary) > 200 and score >= 1:
            return document, content.links
        else:
            return None, []

    except wikipedia.exceptions.DisambiguationError:
        print(f"disambiguation error for: {title}")
        return None, []
    except wikipedia.exceptions.PageError:
        print(f"page is not found: {title}")
        return None, []
    except Exception as e:
        print(f"some unexpected error occurred: {e}")
        return None, []

In [None]:
# scrape wikipedia data
def scrape_wikipedia_subtopics(main_topic, sub_topics, keywords, min_docs_per_topic):
    results = []
    extra_links = []
    unique_urls = set()
    print(f"Scraping summaries for main topic: {main_topic}")

    for sub_topic in sub_topics:
        print(f"  Scraping sub-topic: {sub_topic}")
        search_results = []
        attempts = 0
        max_attempts = 3
        while attempts < max_attempts:
            try:
                search_results = wikipedia.search(sub_topic, results=300)
                print(len(search_results))
                break
            except wikipedia.exceptions.HTTPTimeoutError:
                print("HTTP timeout error occurred. attempt num: ", attempts)
                attempts += 1
                wait_time = 2 ** attempts
                time.sleep(wait_time)
            except wikipedia.exceptions.WikipediaException as e:
                print(f"error occured during search: {e}")
                break

        for page_title in search_results:
            # print(f"current scraping page: {page_title}")
            document, links = scrape_wikipedia_pages(page_title, main_topic, keywords)
            if document and document['url'] not in unique_urls:
                results.append(document)
                unique_urls.add(document['url'])
                if links:
                  extra_links.extend(links)

            if len(results) >= min_docs_per_topic:
                break
        print(f"total documents scraped till topic: {len(results)}, {sub_topic}")
        if len(results) >= min_docs_per_topic:
            break
    # Scrape additional pages if needed
    print("total documents scrapped: ", len(results))
    print("total extra links: ", len(extra_links))
    while len(results) < min_docs_per_topic and len(extra_links) > 0:
        page_title = extra_links.pop()
        try:
            document, links = scrape_wikipedia_pages(page_title, main_topic, keywords)
            if document and document['url'] not in unique_urls:
                unique_urls.add(document['url'])
                results.append(document)
                # extra_links.extend(links)

            if len(results) >= min_docs_per_topic:  # Stop if we have enough documents
                break

        except wikipedia.exceptions.DisambiguationError:
            continue
        except Exception as e:
            continue

    # print("final total documents scrapped: ", len(results))
    results.sort(key=lambda x: x.get('score', 0), reverse=True)  # Sort directly on results
    # remove score term in document
    for doc in results:
        doc.pop('score', None)
    # send top 510 results
    return results

In [None]:
topics_list = {
    "Health": ["Common diseases", "Infectious diseases", "Mental health trends", "Health care system", "Global Health statistics"],
    "Environment": ["Global warming", "Climate Change", "Waste Management", "Greenhouse gases", "Deforestation rates", "Endangered species"],
    "Technology": ["Emerging technologies", "AI advancements", "Software Development", "Could Computing issues", "Computer programming"],
    "Economy": ["Stock market performance", "Job markets", "Cryptocurrency trends", "Bitcoin trends", "Trading strategies", "Currency Exchange rates"],
    "Entertainment": ["Music industry", "Popular cultural events", "Streaming platforms", "Film industry", "Digital media"],
    "Sports": ["Major sporting events", "Sports analytics", "Olympic Games", "Cricket test matches", "football tournaments"],
    "Politics": ["Elections", "Public policy analysis", "International relations", "Political parties"],
    "Education": ["Literacy rates", "Online education trends", "Student loan data", "higher education", "science education", "Education policy" ],
    "Travel": ["Top tourist destinations", "Airline industry data", "Travel trends", "International tourism", "World Tour", "Adventure travel", "Beach destinations", "Travel deals", "Business trip"],
    "Food": ["Organic Farming","Crop yield statistics", "Global hunger", "Food security", "food quality", "Protein food", "healthy drinks", "healthy food", "Fruits", "malnutrition", "dairy products"]
}


# keywords for each main topic
keywords = {
    "Health": ["common health diseases", "common disease", "common diseases", "diseases", "Infectious diseases", "health statistics", "Global health statistics", "health statistics", "global health","healthcare system", "Health care system", "health system", "mental health", "mental health trends"],
    "Environment": ["global warming", "climate change", "waste management", "endangered species","deforestation", "deforestation rates", "greenhouse gases", "co2", "manage waste"],
    "Technology": ["artificial intelligence", "machine learning", "blockchain", "AI advancements", "Artificial Intelligence Advancements", "Emerging technologies","quantum computing", "AI trends", "Artificial Intelligence trends", "Web development", "Software development", "API", "Could computing", "Software engineering", "Computer", "programming"],
    "Economy": ["stock market performance","stocks", "job","recession", "job loss", "trading", "trading strategy","trading strategies","currency", "currency exchange", "currency exchange rate" "Job markets", "Job market", "cryptocurrency trends", "cryptocurrency", "Bitcoin", "Bitcoin trends", "stock market"],
    "Entertainment": ["Music industry", "Popular events","cultural events", "cultural activities", "Music", "online streaming platforms", "Streaming platforms", "Film industry", "Film industries", "Music industries", "digital media"],
    "Sports": ["sports events", "Major sporting events", "sporting events", "Sports analytics", "Olympic games", "Olympic sports", "games", "Olympics", "test matches", "cricket", "cricket matches", "football tournaments", "football"],
    "Politics": ["Elections", "Public policy analysis", "International relations", "Political parties", "Public policy"],
    "Education": [ "Literacy rates", "Education policy","Online education trend", "Online education trends", "Student loan data", "Students loans data", "higher education", "student loan", "Online learning trends", "science", "science education"],
    "Travel": ["Top tourist destinations", "Airline industry data", "Travel trends", "International tourism","International travel", "Airline data", "tourist destinations", "best tourist places", "top tourist places", "World tour", "Adventure travel", "Beach destinations","Beaches", "travel deals", "Business trip"],
    "Food": ["Organic Farming","Crop yield statistics", "Global hunger", "Food security", "food quality", "Protein rich food", "protein food", "high protein food", "healthy drinks", "healthy food", "Fruits", "malnutrition",  "dairy products"]
}


In [None]:
total_documents = {}

In [None]:
# Scrap Health data
health_results = scrape_wikipedia_subtopics("Health", topics_list["Health"], keywords["Health"], 520)
total_documents["Health"] = health_results
print(len(health_results))

Scraping summaries for main topic: Health
  Scraping sub-topic: Common diseases
300




  lis = BeautifulSoup(html).find_all('li')


disambiguation error for: Paget's disease
total documents scraped till topic: 144, Common diseases
  Scraping sub-topic: Infectious diseases
300
disambiguation error for: National Institute of Infectious Diseases
total documents scraped till topic: 270, Infectious diseases
  Scraping sub-topic: Mental health trends
300
total documents scraped till topic: 413, Mental health trends
  Scraping sub-topic: Health care system
300
disambiguation error for: Health care (disambiguation)
total documents scraped till topic: 520, Health care system
total documents scrapped:  520
total extra links:  147333
520


In [None]:
environment_results = scrape_wikipedia_subtopics("Environment", topics_list["Environment"], keywords["Environment"], 650)
total_documents["Environment"] = environment_results
print(len(environment_results))

Scraping summaries for main topic: Environment
  Scraping sub-topic: Global warming
300
disambiguation error for: Global warming (disambiguation)
disambiguation error for: Unstoppable global warming
disambiguation error for: Climate change and agriculture
disambiguation error for: Climate change (disambiguation)
total documents scraped till topic: 209, Global warming
  Scraping sub-topic: Climate Change
300
disambiguation error for: Climate change (disambiguation)
disambiguation error for: Climate change and agriculture
disambiguation error for: Climate Change Bill
disambiguation error for: Climate change in Georgia
disambiguation error for: Climate policy
total documents scraped till topic: 393, Climate Change
  Scraping sub-topic: Waste Management
300
disambiguation error for: Waste management (disambiguation)
disambiguation error for: Radioactive waste disposal
disambiguation error for: Dumping
total documents scraped till topic: 517, Waste Management
  Scraping sub-topic: Greenhous

In [None]:
technology_results = scrape_wikipedia_subtopics("Technology", topics_list["Technology"], keywords["Technology"], 520)
total_documents["Technology"] = technology_results
print(len(technology_results))

Scraping summaries for main topic: Technology
  Scraping sub-topic: Emerging technologies
300
disambiguation error for: ICCT
disambiguation error for: The Net
total documents scraped till topic: 185, Emerging technologies
  Scraping sub-topic: AI advancements
300
total documents scraped till topic: 354, AI advancements
  Scraping sub-topic: Software Development
300
disambiguation error for: Development
total documents scraped till topic: 520, Software Development
total documents scrapped:  520
total extra links:  114395
520


In [None]:
economy_results = scrape_wikipedia_subtopics("Economy", topics_list["Economy"], keywords["Economy"], 650)
total_documents["Economy"] = economy_results
print(len(economy_results))

Scraping summaries for main topic: Economy
  Scraping sub-topic: Stock market performance
300
disambiguation error for: Hang Seng (disambiguation)
total documents scraped till topic: 179, Stock market performance
  Scraping sub-topic: Job markets
300
disambiguation error for: Job loss
disambiguation error for: EJM
disambiguation error for: Bax
total documents scraped till topic: 322, Job markets
  Scraping sub-topic: Cryptocurrency trends
300
total documents scraped till topic: 407, Cryptocurrency trends
  Scraping sub-topic: Bitcoin trends
300
total documents scraped till topic: 436, Bitcoin trends
  Scraping sub-topic: Trading strategies
300
disambiguation error for: Spread
disambiguation error for: Jelly roll
disambiguation error for: Strategy (disambiguation)
disambiguation error for: Basis
disambiguation error for: Gut
disambiguation error for: Reversal
disambiguation error for: Strip
disambiguation error for: Roll
total documents scraped till topic: 590, Trading strategies
  Scra

In [None]:
entertainment_results = scrape_wikipedia_subtopics("Entertainment", topics_list["Entertainment"], keywords["Entertainment"], 650)
total_documents["Entertainment"] = entertainment_results
print(len(entertainment_results))

Scraping summaries for main topic: Entertainment
  Scraping sub-topic: Music industry
300




  lis = BeautifulSoup(html).find_all('li')


disambiguation error for: Music Industry Arts
disambiguation error for: Industry
total documents scraped till topic: 267, Music industry
  Scraping sub-topic: Popular cultural events
300
total documents scraped till topic: 337, Popular cultural events
  Scraping sub-topic: Streaming platforms
300
disambiguation error for: Stream (disambiguation)
total documents scraped till topic: 507, Streaming platforms
  Scraping sub-topic: Film industry
300
disambiguation error for: Tollywood
disambiguation error for: Bengali film
disambiguation error for: Industry
total documents scraped till topic: 650, Film industry
total documents scrapped:  650
total extra links:  271292
650


In [None]:
sports_results = scrape_wikipedia_subtopics("Sports", topics_list["Sports"], keywords["Sports"], 650)
total_documents["Sports"] = sports_results
print(len(sports_results))

Scraping summaries for main topic: Sports
  Scraping sub-topic: Major sporting events
300
disambiguation error for: Postgame
disambiguation error for: Overlay
disambiguation error for: Pregame
total documents scraped till topic: 157, Major sporting events
  Scraping sub-topic: Sports analytics
300
disambiguation error for: SSAC
disambiguation error for: GA
disambiguation error for: Uba
total documents scraped till topic: 280, Sports analytics
  Scraping sub-topic: Olympic Games
300
disambiguation error for: Paris Olympics
disambiguation error for: London Olympics
disambiguation error for: Olympic
disambiguation error for: Greek Olympics
total documents scraped till topic: 542, Olympic Games
  Scraping sub-topic: Cricket test matches
300
disambiguation error for: Test match
total documents scraped till topic: 650, Cricket test matches
total documents scrapped:  650
total extra links:  298865
650


In [None]:
politics_results = scrape_wikipedia_subtopics("Politics", topics_list["Politics"], keywords["Politics"], 520)
total_documents["Politics"] = politics_results
print(len(politics_results))

Scraping summaries for main topic: Politics
  Scraping sub-topic: Elections
300




  lis = BeautifulSoup(html).find_all('li')


disambiguation error for: 2005 Iraqi elections
disambiguation error for: Elections in Ireland
total documents scraped till topic: 273, Elections
  Scraping sub-topic: Public policy analysis
300
disambiguation error for: Institute for Policy Research
total documents scraped till topic: 418, Public policy analysis
  Scraping sub-topic: International relations
300
disambiguation error for: Committee on International Relations
disambiguation error for: Institute of International Relations
disambiguation error for: International affairs (disambiguation)
disambiguation error for: Realism
total documents scraped till topic: 520, International relations
total documents scrapped:  520
total extra links:  285343
520


In [None]:
education_results = scrape_wikipedia_subtopics("Education", topics_list["Education"], keywords["Education"], 520)
total_documents["Education"] = education_results
print(len(education_results))

Scraping summaries for main topic: Education
  Scraping sub-topic: Literacy rates
300
total documents scraped till topic: 44, Literacy rates
  Scraping sub-topic: Online education trends
300
total documents scraped till topic: 127, Online education trends
  Scraping sub-topic: Student loan data
300
total documents scraped till topic: 218, Student loan data
  Scraping sub-topic: higher education
300




  lis = BeautifulSoup(html).find_all('li')


disambiguation error for: Higher Education Commission
disambiguation error for: Higher Education (disambiguation)
disambiguation error for: Minister for Higher Education
disambiguation error for: Higher Education Act
disambiguation error for: National Council for Higher Education
total documents scraped till topic: 409, higher education
  Scraping sub-topic: science education
300
disambiguation error for: Centre for Science Education
disambiguation error for: Ministry of Education, Science and Culture
disambiguation error for: Ministry of Education, Science and Technology
disambiguation error for: Education and Science Workers' Union
total documents scraped till topic: 520, science education
total documents scrapped:  520
total extra links:  159274
520


In [None]:
travel_results = scrape_wikipedia_subtopics("Travel", topics_list["Travel"], keywords["Travel"], 520)
total_documents["Travel"] = travel_results
print(len(travel_results))

Scraping summaries for main topic: Travel
  Scraping sub-topic: Top tourist destinations
300
total documents scraped till topic: 85, Top tourist destinations
  Scraping sub-topic: Airline industry data
300
disambiguation error for: ADR
disambiguation error for: Hermes (disambiguation)
disambiguation error for: PT
total documents scraped till topic: 89, Airline industry data
  Scraping sub-topic: Travel trends
300
total documents scraped till topic: 104, Travel trends
  Scraping sub-topic: International tourism
300
total documents scraped till topic: 168, International tourism
  Scraping sub-topic: World Tour
300
disambiguation error for: World Tour
total documents scraped till topic: 355, World Tour
  Scraping sub-topic: Adventure travel
300
disambiguation error for: Tim Cahill (disambiguation)
disambiguation error for: KE
disambiguation error for: Gap
disambiguation error for: Blue (disambiguation)
disambiguation error for: Outpost
disambiguation error for: Oat (disambiguation)
disamb

In [None]:
food_results = scrape_wikipedia_subtopics("Food", topics_list["Food"], keywords["Food"], 520)
total_documents["Food"] = food_results
print(len(food_results))

Scraping summaries for main topic: Food
  Scraping sub-topic: Organic Farming
300
disambiguation error for: Organic
disambiguation error for: Modern agriculture
total documents scraped till topic: 94, Organic Farming
  Scraping sub-topic: Crop yield statistics
300
disambiguation error for: Q (disambiguation)
total documents scraped till topic: 117, Crop yield statistics
  Scraping sub-topic: Global hunger
300
disambiguation error for: GHI
total documents scraped till topic: 170, Global hunger
  Scraping sub-topic: Food security
300
disambiguation error for: Security (disambiguation)
disambiguation error for: Food shortage
disambiguation error for: IPC
total documents scraped till topic: 294, Food security
  Scraping sub-topic: food quality
300
total documents scraped till topic: 344, food quality
  Scraping sub-topic: Protein food
300
total documents scraped till topic: 382, Protein food
  Scraping sub-topic: healthy drinks
300
disambiguation error for: Healthy food
total documents scr

In [139]:
import json

sw = common_stopwords = [
    'i', 'me', 'my', 'we', 'you', 'he', 'she', 'it', 'they',
    'what', 'which', 'who', 'this', 'that', 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'do', 'does', 'did', 'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'with', 'for',
    'on', 'at', 'by', 'about', 'to', 'from', 'up', 'in', 'out'
]
# removing stopwords
def remove_stop_words(summary):
    words = summary.split()
    filtered_words = [word for word in words if word.lower() not in sw]
    return ' '.join(filtered_words)

final_data = {}

for topic in topics_list:
    documents = total_documents[topic]
    if len(documents) > 520:
        documents = documents[:520]
    preprocessed_documents = []
    for document in documents:
      preprocessed_summary = remove_stop_words(document['summary'])
      preprocessed_document = document.copy()
      preprocessed_document['summary'] = preprocessed_summary
      # convert revision_id to string
      preprocessed_document['revision_id'] = str(preprocessed_document['revision_id'])
      preprocessed_documents.append(preprocessed_document)

    final_data[topic] = preprocessed_documents


for topic in final_data:
    print(f"Total documents for {topic}: {len(final_data[topic])}")
# save the data to json file
with open('preprocessed_documents.json', 'w') as json_file:
    json.dump(final_data, json_file, indent=4)


Total documents for Health: 520
Total documents for Environment: 520
Total documents for Technology: 520
Total documents for Economy: 520
Total documents for Entertainment: 520
Total documents for Sports: 520
Total documents for Politics: 520
Total documents for Education: 520
Total documents for Travel: 520
Total documents for Food: 520


In [198]:
import json

# Data to be written to the JSON file
sub_data = {
    "ip": "34.85.253.117",
    "port": "8983",
    "core": "IRF24P1",
    "ubit": "hemaboka"
}

# Specify the file name
file_name = 'hemaboka_p1.json'

# Write data to the JSON file
with open(file_name, 'w') as json_file:
    json.dump(sub_data, json_file, indent=4)
