<a href="https://colab.research.google.com/github/jpereira2015/medport/blob/main/Wikicleanup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import bz2

def decompress_bz2_chunked(input_path, output_path, chunk_size=1024*1024):
    """Stream-decompress a bzip2 archive into a plain file.

    The archive is never loaded into memory as a whole: it is read in pieces
    of at most ``chunk_size`` decompressed bytes (1 MB by default) and each
    piece is written to the destination straight away.

    Args:
        input_path: Path of the ``.bz2`` file to read.
        output_path: Path of the file to create (overwritten if it exists).
        chunk_size: Maximum number of decompressed bytes handled per read.
    """
    with bz2.open(input_path, 'rb') as compressed, open(output_path, 'wb') as plain:
        # read() returns b'' once the archive is exhausted; that empty bytes
        # object is the sentinel that ends the iteration.
        for block in iter(lambda: compressed.read(chunk_size), b''):
            plain.write(block)

# Location of the compressed dump (.xml.bz2) and of the decompressed XML to create.
# NOTE(review): both paths sit under /content/drive/MyDrive, presumably a mounted
# Google Drive — no mount call is visible in this notebook, so confirm the drive
# is mounted before running this cell.
input_path = '/content/drive/MyDrive/Colab_files/ptwiki-20240401-pages-articles-multistream.xml.bz2'
output_path = '/content/drive/MyDrive/Colab_files/output_file.xml'

# Decompress the archive with the default 1 MB chunk size; output_path is
# opened in 'wb' mode, so an existing file there is overwritten.
decompress_bz2_chunked(input_path, output_path)


In [None]:
import re
import nltk
from lxml import etree
from nltk.tokenize import PunktSentenceTokenizer

# Fetch NLTK's 'punkt' sentence-tokenizer data and load its Portuguese model.
# `tokenizer` is a module-level global: process_xml() below calls
# tokenizer.tokenize() without receiving it as an argument, so this cell must
# run before process_xml() is called.
# NOTE(review): the resource is loaded from a .pickle path — confirm this path
# still exists in the NLTK version installed in your environment.
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

def clean_text(text):
    """Normalize raw article text with a fixed sequence of regex substitutions.

    Steps, in order:
      1. Remove HTML/XML-style tags (``<...>``).
      2. Replace e-mail addresses with the token ``EMAIL``.
      3. Replace standalone digit runs with ``0``.
      4. Replace ``http(s)://`` and ``www.`` URLs with the token ``URL``.
      5. Map curly/low double quotes to ``"`` and curly single quotes and
         backticks to ``'``.
      6. Map en/em dashes to a plain hyphen.
      7. Remove bracketed spans: ``[[...]]``, ``{{...}}``, ``[...]``,
         ``{...}`` and ``(...)``.

    Tags are stripped *before* URL replacement because the URL pattern
    consumes every non-whitespace character: for ``<a href="http://x">t</a>``
    it would swallow the closing ``">`` and leave an unmatched ``<a href="URL``
    that the tag pattern can no longer remove.

    Doubled wiki brackets are tried before single ones; otherwise the
    non-greedy single-bracket patterns stop at the first closer and leave a
    stray ``]`` or ``}`` behind.

    Limitation: ``.`` does not match newlines here, so a bracketed span that
    runs across several lines is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Remove HTML tags first so URLs inside attributes vanish with their tag.
    text = re.sub(r'<[^>]+>', '', text)
    # Email replacement
    text = re.sub(r'\b[\w.-]+@[\w.-]+\.\w+\b', 'EMAIL', text)
    # Number replacement
    text = re.sub(r'\b\d+\b', '0', text)
    # URL replacement
    text = re.sub(r'https?://\S+|www\.\S+', 'URL', text)
    # Standardize different quotes to simple quotes
    text = re.sub(r'[“”„]', '"', text)
    text = re.sub(r"[‘’`]", "'", text)
    # Standardize hyphens
    text = re.sub(r'[-–—]', '-', text)
    # Remove text within brackets; doubled wiki delimiters are matched first so
    # that no dangling closing bracket or brace is left over.
    text = re.sub(r'\[\[.*?\]\]|\{\{.*?\}\}|\[.*?\]|\{.*?\}|\(.*?\)', '', text)
    return text

def process_xml(file_path, output_path):
    """Extract cleaned sentences from a MediaWiki XML export, one per line.

    The dump is parsed incrementally. For every ``<text>`` element that has
    content, the text is passed through ``clean_text``, split into sentences
    with the module-level ``tokenizer``, and each sentence of at least four
    whitespace-separated words is written to ``output_path`` followed by a
    newline.

    Parsing is driven by completed ``<page>`` elements rather than by
    ``<text>`` elements. Pruning at ``<text>`` level only removes siblings
    inside the enclosing ``<revision>``, which leaves every emptied
    ``<page>``/``<revision>`` shell attached to the root, so memory grows with
    the number of pages. Pruning at ``<page>`` level detaches each finished
    page from the root.

    Args:
        file_path: Path of the decompressed XML dump.
        output_path: Path of the UTF-8 text file to create (overwritten).
    """
    ns = '{http://www.mediawiki.org/xml/export-0.10/}'
    context = etree.iterparse(file_path, events=('end',), tag=ns + 'page')
    with open(output_path, 'w', encoding='utf-8') as f:
        for _event, page in context:
            # Visit every <text> descendant of the page, in document order.
            for text_elem in page.iter(ns + 'text'):
                if text_elem.text:
                    cleaned_text = clean_text(text_elem.text)
                    for sentence in tokenizer.tokenize(cleaned_text):
                        # Ensure each sentence has at least 4 words
                        if len(sentence.split()) >= 4:
                            f.write(sentence + '\n')
            # Drop the page's contents, then detach it and everything before
            # it from the root so already-processed pages can be freed.
            page.clear()
            while page.getprevious() is not None:
                del page.getparent()[0]

# Input: the decompressed XML dump (same path the decompression cell writes to).
# Output: the plain-text file that receives one cleaned sentence per line.
input_xml_path = '/content/drive/MyDrive/Colab_files/output_file.xml'
output_txt_path = '/content/drive/MyDrive/Colab_files/output_file.txt'

# Run the NLTK-based extraction; output_txt_path is opened in 'w' mode, so any
# existing file at that path is replaced.
process_xml(input_xml_path, output_txt_path)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
!pip install mwparserfromhell


Collecting mwparserfromhell
  Downloading mwparserfromhell-0.6.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.0/191.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mwparserfromhell
Successfully installed mwparserfromhell-0.6.6


In [2]:
pip install mwparserfromhell

Collecting mwparserfromhell
  Downloading mwparserfromhell-0.6.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.0/191.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mwparserfromhell
Successfully installed mwparserfromhell-0.6.6


In [3]:
import mwparserfromhell
import xml.etree.ElementTree as ET
import re

def clean_wikicode(text):
    """Convert wiki markup to plain text and scrub leftover markup fragments.

    The text is parsed with ``mwparserfromhell`` and reduced with
    ``strip_code()``; surrounding whitespace is trimmed. A series of regex
    passes then removes fragments that may still be present:

      * file/image links (``[[File:...]]`` and localized equivalents),
      * image-layout keywords followed by ``|`` or ``_`` (``thumb|``,
        ``250px|``, ``direita|`` ...),
      * bracketed external links (``[http...]``),
      * category links,
      * ``{{...}}`` template remnants without nested braces.

    The namespace names cover the Portuguese spellings (``Ficheiro``,
    ``Arquivo``, ``Imagem``, ``Categoria``) and the English ones, in addition
    to the Spanish ``Archivo``/``Categoría`` accepted previously; the
    accented Spanish form alone never matches the unaccented Portuguese
    ``Categoria``.

    Args:
        text: Raw wikitext of one page revision.

    Returns:
        The cleaned plain-text string.
    """
    wikicode = mwparserfromhell.parse(text)
    cleaned = wikicode.strip_code().strip()
    # Regex passes for markup that can survive strip_code().
    cleaned = re.sub(
        r'\[\[(File|Image|Archivo|Arquivo|Ficheiro|Imagem):[^\]]+\]\]',
        '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(
        r'(\d{1,4}px|miniaturadaimagem|thumb|thumbnail|direita|esquerda|centro)[\|_]',
        '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\[http[^\]]+\]', '', cleaned)
    cleaned = re.sub(
        r'\[\[Categor(?:ía|ia|y):[^\]]+\]\]',
        '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'{{[^}]+}}', '', cleaned)
    return cleaned

def process_xml_in_batches(file_path, output_path):
    """Write the title and cleaned text of every page in a MediaWiki XML export.

    The file is parsed incrementally. Whenever a ``<page>`` element is
    complete, its ``<title>`` and ``<revision>/<text>`` are looked up; if both
    exist and the text is non-empty, the text is run through
    ``clean_wikicode`` and a record of the form
    ``"Title: <title>\\n<cleaned text>\\n\\n"`` is appended to the output file.
    The page element and the root are cleared after every page so processed
    pages do not accumulate in memory.

    The XML namespace is taken from the root element's tag instead of being
    fixed to one export schema version, so exports declaring a different
    namespace (or none) are handled the same way. With a fixed namespace such
    files would produce no records while still being counted.

    A summary line with the number of ``<page>`` elements encountered is
    printed at the end; pages that were skipped (missing title or empty text)
    are included in that count.

    Args:
        file_path: Path of the decompressed XML dump.
        output_path: Path of the UTF-8 text file to create (overwritten).
    """
    context = iter(ET.iterparse(file_path, events=("start", "end")))
    # The first event is the "start" of the document's root element.
    _, root = next(context)

    # ElementTree spells namespaced tags as '{uri}local'; keep the '{uri}'
    # prefix (empty when the document has no namespace).
    root_tag = root.tag
    namespace = root_tag[:root_tag.index('}') + 1] if root_tag.startswith('{') else ''
    page_tag = f"{namespace}page"
    title_path = f".//{namespace}title"
    text_path = f".//{namespace}revision/{namespace}text"

    with open(output_path, 'w', encoding='utf-8') as output:
        page_count = 0
        for event, elem in context:
            # Only a fully parsed <page> carries its title and text.
            if event != "end" or elem.tag != page_tag:
                continue

            title_elem = elem.find(title_path)
            text_elem = elem.find(text_path)

            if title_elem is not None and text_elem is not None and text_elem.text:
                title = title_elem.text
                cleaned_text = clean_wikicode(text_elem.text)
                output.write(f"Title: {title}\n{cleaned_text}\n\n")

            elem.clear()  # Clear processed element
            root.clear()  # Also clear references from the root to the processed elements

            page_count += 1

        print(f"Processed {page_count} pages in total.")

# Input: the decompressed XML dump. Output: the text file of "Title: ..." records.
# NOTE(review): output_txt_path is the same path the earlier NLTK-based
# process_xml cell writes to, and the file is opened in 'w' mode, so running
# this cell replaces that earlier output — confirm this is intended.
input_xml_path = '/content/drive/MyDrive/Colab_files/output_file.xml'
output_txt_path = '/content/drive/MyDrive/Colab_files/output_file.txt'

# Run the mwparserfromhell-based extraction over the whole dump.
process_xml_in_batches(input_xml_path, output_txt_path)


Processed 2612651 pages in total.


In [1]:
def sample_text_file(file_path, num_lines=40):
    """Print the beginning of a UTF-8 text file.

    At most ``num_lines`` lines are echoed to standard output exactly as
    stored (each line keeps its own newline). Fewer lines are printed when the
    file is shorter. Problems are reported with a printed message rather than
    a raised exception.

    Args:
        file_path: Path of the text file to preview.
        num_lines: Maximum number of lines to print (40 by default).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            # zip() stops at whichever runs out first: the line budget or the file.
            for _, row in zip(range(num_lines), handle):
                print(row, end='')  # the line already ends with its newline
    except FileNotFoundError:
        print("File not found. Please check the file path.")
    except Exception as err:
        print(f"An error occurred: {err}")

# Path of the text file to preview (the output written by the processing cells).
file_path = '/content/drive/MyDrive/Colab_files/output_file.txt'

# Print the start of the file; no num_lines argument is passed, so the
# function's default of 40 lines applies.
sample_text_file(file_path)


Title: Astronomia
Formação estrelar na Grande Nuvem de Magalhães, uma galáxia irregular.
Mosaico da Nebulosa do Caranguejo, remanescente de uma supernova.
Astronomia é uma ciência natural que estuda corpos celestes (como estrelas, planetas, cometas, nebulosas, aglomerados de estrelas, galáxias) e fenômenos que se originam fora da atmosfera da Terra (como a radiação cósmica de fundo em micro-ondas). Preocupada com a evolução, a física e a química de objetos celestes, bem como a formação e o desenvolvimento do universo.

A astronomia é uma das mais antigas ciências. Culturas pré-históricas deixaram registrados vários artefatos astronômicos, como Stonehenge, os montes de Newgrange e os menires. As primeiras civilizações, como os babilônios, gregos, chineses, indianos, persas e maias realizaram observações metódicas do céu noturno. No entanto, a invenção do telescópio permitiu o desenvolvimento da astronomia moderna. Historicamente, a astronomia incluiu disciplinas tão diversas como astrom