In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
! pip install mwxml

Collecting mwxml
  Downloading mwxml-0.3.3-py2.py3-none-any.whl (32 kB)
Collecting mwcli>=0.0.2 (from mwxml)
  Downloading mwcli-0.0.3-py2.py3-none-any.whl (8.4 kB)
Collecting mwtypes>=0.3.0 (from mwxml)
  Downloading mwtypes-0.3.2-py2.py3-none-any.whl (21 kB)
Collecting para>=0.0.1 (from mwxml)
  Downloading para-0.0.8-py3-none-any.whl (6.5 kB)
Collecting docopt (from mwcli>=0.0.2->mwxml)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsonable>=0.3.0 (from mwtypes>=0.3.0->mwxml)
  Downloading jsonable-0.3.1-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=b7cb14b8901c98c2dc930437409c0e7b1537361bce2d204bf138c30f800bd93d
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Inst

In [6]:
import bz2
import mwxml
import pandas as pd
from tqdm import tqdm

# Paths
bz2_file = '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream.xml.bz2'
xml_file = '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream.xml'
csv_file = '/content/drive/MyDrive/학교/Dissertation/enwiki_quality_ratings.csv'

# Decompress the bz2 file
with bz2.open(bz2_file, 'rb') as f_in, open(xml_file, 'wb') as f_out:
    for data in iter(lambda: f_in.read(100 * 1024), b''):
        f_out.write(data)

# Define regex patterns to identify quality templates
import re
quality_patterns = {
    'FA': re.compile(r'\{\{FA\}\}'),
    'A': re.compile(r'\{\{A-Class\}\}'),
    'GA': re.compile(r'\{\{GA\}\}'),
    'B': re.compile(r'\{\{B-Class\}\}'),
    'C': re.compile(r'\{\{C-Class\}\}'),
    'Start': re.compile(r'\{\{Start-Class\}\}'),
    'Stub': re.compile(r'\{\{Stub-Class\}\}')
}

# Function to determine quality rating based on text
def get_quality_rating(text):
    for rating, pattern in quality_patterns.items():
        if pattern.search(text):
            return rating
    return 'Unassessed'

# Function to process each page and extract data
def process_page(page):
    title = page.title
    ns = page.namespace
    page_id = page.id

    # Extract the latest revision info
    if page.revisions:
        latest_revision = page.revisions[0]
        revision_id = latest_revision.id
        timestamp = latest_revision.timestamp
        contributor = latest_revision.contributor
        contributor_name = contributor.username if contributor.username else contributor.ip
        text = latest_revision.text or ""
        citation_count = len(re.findall(r'<ref[^>]*>', text))
        quality = get_quality_rating(text)

        return {
            'Title': title,
            'Namespace': ns,
            'PageID': page_id,
            'RevisionID': revision_id,
            'Timestamp': timestamp,
            'Contributor': contributor_name,
            'CitationCount': citation_count,
            'Quality': quality
        }
    return None

# Parse the XML dump and extract data
data = []
with mwxml.Dump.from_file(open(xml_file, 'rb')) as dump:
    for page in tqdm(dump, desc="Processing pages", unit=" pages"):
        if page.namespace == 0:  # Only consider articles in the main namespace
            page_data = process_page(page)
            if page_data:
                data.append(page_data)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv(csv_file, index=False)

print(f"Data extracted and saved to {csv_file}")


KeyboardInterrupt: 

In [7]:
data

b'roadway debut and received a Tony Award nomination for \'\'What the Constitution Means to Me\'\', which originated in Summerworks 2017.&lt;ref name=&quot;:1&quot; /&gt;\n\n==References==\n&lt;references/&gt;\n\n==External links==\n* [http://www.clubbedthumb.org Official website]\n\n[[Category:Theatre companies in New York City]]\n[[Category:Arts organizations established in 1996]]\n[[Category:1996 establishments in New York City]]</text>\n      <sha1>91mp6glaptoqxuaquvcsacz7lnj6wqz</sha1>\n    </revision>\n  </page>\n  <page>\n    <title>\xc3\x86\xc3\xb0ey</title>\n    <ns>0</ns>\n    <id>24077515</id>\n    <revision>\n      <id>1184832413</id>\n      <parentid>1182261378</parentid>\n      <timestamp>2023-11-12T22:32:46Z</timestamp>\n      <contributor>\n        <username>Gilgamesh~enwiki</username>\n        <id>47947</id>\n      </contributor>\n      <origin>1184832413</origin>\n      <model>wikitext</model>\n      <format>text/x-wiki</format>\n      <text bytes="4492" sha1="2rv2cgi

In [14]:
import bz2
import mwxml
import pandas as pd
from tqdm import tqdm
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

# Paths
bz2_file = '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream.xml.bz2'
xml_file = '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream.xml'
csv_file = '/content/drive/MyDrive/학교/Dissertation/enwiki_quality_ratings.csv'

# Decompress the bz2 file
with bz2.open(bz2_file, 'rb') as f_in, open(xml_file, 'wb') as f_out:
    for data in iter(lambda: f_in.read(100 * 1024), b''):
        f_out.write(data)

# Function to count syllables in a word
def count_syllables(word):
    word = word.lower()
    vowels = "aeiouy"
    if word[0] in vowels:
        count = 1
    else:
        count = 0
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count = 1
    return count

# Function to calculate section sizes
def calculate_section_sizes(text):
    section_sizes = []
    sections = re.split(r'==[^=].*==', text)
    for section in sections:
        section_size = len(section.split())
        section_sizes.append(section_size)
    if section_sizes:
        longest_section = max(section_sizes)
        shortest_section = min(section_sizes)
        mean_section_size = sum(section_sizes) / len(section_sizes)
    else:
        longest_section = shortest_section = mean_section_size = 0
    return longest_section, shortest_section, mean_section_size

# Function to determine article quality
def determine_quality(text):
    quality_templates = {
        'FA': r'\{\{Featured[ _]article',
        'GA': r'\{\{Good[ _]article',
        'B': r'\{\{B[ _]class',
        'C': r'\{\{C[ _]class',
        'Start': r'\{\{Start[ _]class',
        'Stub': r'\{\{Stub[ _]class'
    }
    for quality, pattern in quality_templates.items():
        if re.search(pattern, text, re.IGNORECASE):
            return quality
    return 'Unknown'

# Function to process each page and extract data
def process_page(page):
    try:
        title = page.title
        ns = page.namespace
        page_id = page.id

        # Extract the latest revision info
        if page.revisions:
            latest_revision = page.revisions[0]
            revision_id = latest_revision.id
            timestamp = latest_revision.timestamp
            contributor = latest_revision.contributor
            contributor_name = contributor.username if contributor.username else contributor.ip
            comment = latest_revision.comment
            text = latest_revision.text or ""

            # Count citations by finding all <ref> tags
            citation_count = len(re.findall(r'<ref[^>]*>', text))
            character_count = len(text)

            # Word count
            words = word_tokenize(text)
            word_count = len(words)

            # Sentence count
            sentences = sent_tokenize(text)
            sentence_count = len(sentences)

            # Syllable count
            syllable_count = sum(count_syllables(word) for word in words)

            # Section and subsection counts (simple heuristic based on headers)
            section_count = len(re.findall(r'==[^=].*==', text))
            subsection_count = len(re.findall(r'===.*===', text))

            # Paragraph count
            paragraphs = re.split(r'\n\n+', text)
            paragraph_count = len(paragraphs)

            # Mean paragraph size
            mean_paragraph_size = sum(len(paragraph.split()) for paragraph in paragraphs) / paragraph_count if paragraph_count > 0 else 0

            # Section size metrics
            longest_section, shortest_section, mean_section_size = calculate_section_sizes(text)
            longest_shortest_ratio = longest_section / shortest_section if shortest_section > 0 else float('inf')

            # Determine article quality
            quality = determine_quality(text)

            return {
                'Title': title,
                'Namespace': ns,
                'PageID': page_id,
                'RevisionID': revision_id,
                'Timestamp': timestamp,
                'Contributor': contributor_name,
                'Comment': comment,
                'CitationCount': citation_count,
                'CharacterCount': character_count,
                'WordCount': word_count,
                'SentenceCount': sentence_count,
                'SyllableCount': syllable_count,
                'SectionCount': section_count,
                'SubsectionCount': subsection_count,
                'ParagraphCount': paragraph_count,
                'MeanParagraphSize': mean_paragraph_size,
                'LongestSectionSize': longest_section,
                'ShortestSectionSize': shortest_section,
                'MeanSectionSize': mean_section_size,
                'LongestShortestRatio': longest_shortest_ratio,
                'Quality': quality
            }
    except Exception as e:
        print(f"Error processing page {page.id}: {e}")
    return None

# Parse the XML dump and extract data
data = []
quality_counts = {q: 0 for q in ['FA', 'GA', 'B', 'C', 'Start', 'Stub', 'Unknown']}
max_per_quality = 10

with mwxml.Dump.from_file(open(xml_file, 'rb')) as dump:
    for page in tqdm(dump, desc="Processing pages", unit=" pages"):
        if page.namespace == 0:  # Only consider articles in the main namespace
            page_data = process_page(page)
            if page_data:
                quality = page_data['Quality']
                if quality_counts[quality] < max_per_quality:
                    data.append(page_data)
                    quality_counts[quality] += 1
                if all(count >= max_per_quality for count in quality_counts.values()):
                    break

# Ensure all data entries are dictionaries
data = [d for d in data if isinstance(d, dict)]

# Convert to DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv(csv_file, index=False)

print(f"Data extracted and saved to {csv_file}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyboardInterrupt: 

In [16]:
data

b'of the Douay&amp;ndash;Rheims often remains the Bible of choice of more traditional English-speaking Catholics.&lt;ref&gt;{{Cite web|url=https://www.marianland.com/bibledouayrheims/main.htm|title=Douay-Rheims Bible by Baronius Press|website=www.marianland.com|access-date=2019-02-07}}&lt;/ref&gt;\n\n==Origin==\n[[File:Cartulaire douai 3 coll\xc3\xa9ges.JPG|thumb|right|Colleges at [[University of Douai]]]]\nFollowing the [[English Reformation]], some Catholics went in exile to the [[Europe|European mainland]]. The centre of English Catholicism was the [[English College, Douai|English College]] at [[Douai]] ([[University of Douai]], France) founded in 1568 by [[William Allen (cardinal)|William Allen]], formerly of [[Queen\'s College, Oxford]], and Canon of York, and subsequently [[Cardinal (Catholicism)|cardinal]], for the purpose of training priests to convert the English again to Catholicism. And it was here where the Catholic translation of the Bible into English was produced.\n\nA r

In [19]:
data2 = [d for d in data if isinstance(d, dict)]

In [21]:
df = pd.DataFrame(data2)
df

In [None]:
import bz2
import mwxml
import pandas as pd
from tqdm import tqdm
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

# Paths
bz2_file = 'enwiki-20240601-pages-articles-multistream.xml.bz2'
xml_file = 'enwiki-20240601-pages-articles-multistream.xml'
csv_file = 'enwiki_article_metrics_with_difficult_words.csv'

# Decompress the bz2 file
with bz2.open(bz2_file, 'rb') as f_in, open(xml_file, 'wb') as f_out:
    for data in iter(lambda: f_in.read(100 * 1024), b''):
        f_out.write(data)

# Function to count syllables in a word
def count_syllables(word):
    word = word.lower()
    vowels = "aeiouy"
    if word[0] in vowels:
        count = 1
    else:
        count = 0
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count = 1
    return count

# Function to identify and count difficult words
def count_difficult_words(words):
    difficult_word_count = sum(1 for word in words if count_syllables(word) > 2)  # Adjust criteria as needed
    return difficult_word_count

# Function to process each page and extract data
def process_page(page):
    try:
        title = page.title
        ns = page.namespace
        page_id = page.id

        # Extract the latest revision info
        if page.revisions:
            latest_revision = page.revisions[0]
            revision_id = latest_revision.id
            timestamp = latest_revision.timestamp
            contributor = latest_revision.contributor
            contributor_name = contributor.username if contributor.username else contributor.ip
            comment = latest_revision.comment
            text = latest_revision.text or ""

            # Count citations by finding all <ref> tags
            citation_count = len(re.findall(r'<ref[^>]*>', text))
            character_count = len(text)

            # Word count
            words = word_tokenize(text)
            word_count = len(words)

            # Count difficult words
            difficult_word_count = count_difficult_words(words)

            # Sentence count
            sentences = sent_tokenize(text)
            sentence_count = len(sentences)

            # Syllable count
            syllable_count = sum(count_syllables(word) for word in words)

            # Section and subsection counts (simple heuristic based on headers)
            section_count = len(re.findall(r'==[^=].*==', text))
            subsection_count = len(re.findall(r'===.*===', text))

            # Paragraph count
            paragraphs = re.split(r'\n\n+', text)
            paragraph_count = len(paragraphs)

            # Mean paragraph size
            mean_paragraph_size = sum(len(paragraph.split()) for paragraph in paragraphs) / paragraph_count if paragraph_count > 0 else 0

            # Section size metrics
            longest_section, shortest_section, mean_section_size = calculate_section_sizes(text)
            longest_shortest_ratio = longest_section / shortest_section if shortest_section > 0 else float('inf')

            # Determine article quality
            quality = determine_quality(text)

            return {
                'Title': title,
                'Namespace': ns,
                'PageID': page_id,
                'RevisionID': revision_id,
                'Timestamp': timestamp,
                'Contributor': contributor_name,
                'Comment': comment,
                'CitationCount': citation_count,
                'CharacterCount': character_count,
                'WordCount': word_count,
                'DifficultWordCount': difficult_word_count,
                'SentenceCount': sentence_count,
                'SyllableCount': syllable_count,
                'SectionCount': section_count,
                'SubsectionCount': subsection_count,
                'ParagraphCount': paragraph_count,
                'MeanParagraphSize': mean_paragraph_size,
                'LongestSectionSize': longest_section,
                'ShortestSectionSize': shortest_section,
                'MeanSectionSize': mean_section_size,
                'LongestShortestRatio': longest_shortest_ratio,
                'Quality': quality
            }
    except Exception as e:
        print(f"Error processing page {page.id}: {e}")
    return None

# Parse the XML dump and extract data
data = []

with mwxml.Dump.from_file(open(xml_file, 'rb')) as dump:
    for page in tqdm(dump, desc="Processing pages", unit=" pages"):
        if page.namespace == 0:  # Only consider articles in the main namespace
            page_data = process_page(page)
            if page_data:
                data.append(page_data)

# Ensure all data entries are dictionaries
data = [d for d in data if isinstance(d, dict)]

# Convert to DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv(csv_file, index=False)

print(f"Data extracted and saved to {csv_file}")
