In [None]:
# Mount Google Drive at /content/drive; the dump and CSV paths used later in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install mwxml

Collecting mwxml
  Downloading mwxml-0.3.3-py2.py3-none-any.whl (32 kB)
Collecting mwcli>=0.0.2 (from mwxml)
  Downloading mwcli-0.0.3-py2.py3-none-any.whl (8.4 kB)
Collecting mwtypes>=0.3.0 (from mwxml)
  Downloading mwtypes-0.3.2-py2.py3-none-any.whl (21 kB)
Collecting para>=0.0.1 (from mwxml)
  Downloading para-0.0.8-py3-none-any.whl (6.5 kB)
Collecting docopt (from mwcli>=0.0.2->mwxml)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsonable>=0.3.0 (from mwtypes>=0.3.0->mwxml)
  Downloading jsonable-0.3.1-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=ab6898b147fbda2099c7d6fb68611f84d83e4a0477bdb85c320a1853097edbc2
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Inst

In [None]:
import bz2
import ipaddress
import os
import re
import shutil
from collections import Counter
from datetime import datetime, timedelta

import mwxml
import nltk
import pandas as pd
from nltk.corpus import cmudict
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
#from mwxml import Timestamp

# Tokenizer models for sent_tokenize / word_tokenize.
nltk.download('punkt')
nltk.download('cmudict')
# pos_tag() needs a tagger model as well; without it the first call fails with
# a LookupError.
# NOTE(review): newer NLTK releases name this resource
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [None]:
# Locations on the mounted Drive: compressed dump, extracted XML, output CSV.
bz2_file, xml_file, csv_file = (
    '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream.xml.bz2',
    '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream.xml',
    '/content/drive/MyDrive/학교/Dissertation/enwiki_quality_ratings.csv',
)

In [None]:
# Decompress the bz2 file
# The extraction is skipped when a non-empty XML file is already present, so
# re-running the notebook does not repeat it. The output is written under a
# temporary name and renamed only once the copy has finished, so an interrupted
# run cannot leave a truncated or empty file under the final name.
# Delete xml_file manually to force a fresh extraction.
if os.path.exists(xml_file) and os.path.getsize(xml_file) > 0:
    print(f'Reusing existing {xml_file}; delete it to force re-extraction.')
else:
    partial_xml_file = xml_file + '.part'
    with bz2.open(bz2_file, 'rb') as f_in, open(partial_xml_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out, length=1024 * 1024)
    os.replace(partial_xml_file, xml_file)

In [None]:
# Function to determine article quality
def determine_quality(text):
    """Return the quality label of the first template marker found in ``text``.

    Markers are tried in the order FA, GA, B, C, Start, Stub and matched
    case-insensitively. ``'Unknown'`` is returned when none of them occurs.

    NOTE(review): these markers are searched in whatever text the caller
    passes in -- confirm that the class templates actually appear there.
    """
    markers = (
        ('FA', r'\{\{Featured[ _]article'),
        ('GA', r'\{\{Good[ _]article'),
        ('B', r'\{\{B[ _]class'),
        ('C', r'\{\{C[ _]class'),
        ('Start', r'\{\{Start[ _]class'),
        ('Stub', r'\{\{Stub[ _]class'),
    )
    return next(
        (label for label, marker in markers
         if re.search(marker, text, re.IGNORECASE)),
        'Unknown',
    )

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every run of consecutive vowels (``aeiouy``) counts as one syllable, a
    trailing ``e`` removes one, and any non-empty token yields at least 1.

    Args:
        word: The token to measure; case is ignored.

    Returns:
        The estimated syllable count, or 0 for an empty string (which
        previously raised ``IndexError``).
    """
    if not word:
        return 0
    word = word.lower()
    vowels = "aeiouy"
    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of each vowel run starts a new syllable.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel
    if word.endswith("e"):
        count -= 1
    # Tokens without any vowel group (or just a lone "e") still count as one.
    return max(count, 1)

# Function to identify complex words
def is_complex_word(word):
    """Return True when ``word`` has more than two syllables per ``count_syllables``."""
    return count_syllables(word) > 2

# Function to calculate section sizes
def calculate_section_sizes(text):
    """Return ``(longest, shortest, mean)`` section sizes of ``text`` in words.

    The text is split at every heading line (``==Title==``, ``===Title===``,
    and deeper). The part before the first heading counts as a section too.
    Sizes are whitespace-separated word counts.

    The heading pattern is anchored to whole lines and consumes all of the
    ``=`` markers. The earlier unanchored pattern matched ``==Title===``
    inside a level-3 heading, leaving a stray ``=`` that was counted as a word
    of the preceding section.

    Args:
        text: The wikitext to measure.

    Returns:
        A tuple ``(longest_section, shortest_section, mean_section_size)``;
        empty text yields ``(0, 0, 0.0)``.
    """
    heading = re.compile(r'^={2,}[^=\n].*?={2,}[ \t\r]*$', re.MULTILINE)
    # split() always returns at least one piece, so the list is never empty.
    section_sizes = [len(section.split()) for section in heading.split(text)]
    longest_section = max(section_sizes)
    shortest_section = min(section_sizes)
    mean_section_size = sum(section_sizes) / len(section_sizes)
    return longest_section, shortest_section, mean_section_size

# Function to count external links
def count_external_links(text):
    """Return how many ``http://`` / ``https://`` URLs occur in ``text``."""
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return sum(1 for _ in url_pattern.finditer(text))

# Function to count internal links
def count_internal_links(text):
    """Return how many ``[[target]]`` / ``[[target|label]]`` links occur in ``text``."""
    wikilink = re.compile(r'\[\[([^\]|]+)(?:\|([^\]]+))?\]\]')
    return sum(1 for _ in wikilink.finditer(text))

# Function to count images
def count_images(text):
    """Return the number of ``[[File:...]]`` plus ``[[Image:...]]`` embeds in ``text``."""
    # The two patterns are searched independently and their counts are added.
    embed_patterns = (r'\[\[File:[^\]]*\]\]', r'\[\[Image:[^\]]*\]\]')
    return sum(len(re.findall(pattern, text)) for pattern in embed_patterns)

# Function to count questions
def count_questions(text):
    """Return the number of ``?`` characters in ``text``."""
    return text.count('?')

# Function to count exclamations
def count_exclamations(text):
    """Return the number of ``!`` characters in ``text``."""
    return text.count('!')

# Function to count sentences starting with a pronoun
def count_sentences_starting_with_pronoun(sentences):
    """Count the sentences whose first token is tagged as a pronoun.

    Each sentence is tokenized with ``word_tokenize`` and only its first token
    is passed to ``pos_tag`` (on its own, without the rest of the sentence).
    Tags PRP, PRP$, WP and WP$ count as pronouns. Sentences that produce no
    tokens are ignored.
    """
    pronoun_tags = frozenset(('PRP', 'PRP$', 'WP', 'WP$'))
    total = 0
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        if not tokens:
            continue
        _, tag = pos_tag(tokens[:1])[0]
        if tag in pronoun_tags:
            total += 1
    return total

# Function to calculate revert count
def calculate_revert_count(page):
    """Count revisions whose ``parent_id`` differs from the preceding revision's ``id``.

    ``page.revisions`` is read as an indexable sequence; the first revision has
    no predecessor and is never counted.

    NOTE(review): a parent-id mismatch is used here as the signal for a
    revert -- confirm this proxy matches the intended definition.
    """
    revisions = page.revisions
    return sum(
        1
        for position in range(1, len(revisions))
        if revisions[position].parent_id != revisions[position - 1].id
    )

In [None]:
# Function to process each page and extract data
def _contributor_name(revision):
    """Return the revision author's username, else their IP, else None."""
    contributor = revision.contributor
    if contributor is None:
        return None
    return contributor.username if contributor.username else contributor.ip


def _is_ip_address(name):
    """Return True when ``name`` parses as an IPv4 or IPv6 address."""
    try:
        ipaddress.ip_address(name)
    except ValueError:
        return False
    return True


def process_page(page):
    """Extract text, structure, style, review-history and quality features for a page.

    Args:
        page: An object exposing ``title``, ``namespace``, ``id`` and a
            ``revisions`` sequence. Each revision is read for ``id``,
            ``timestamp``, ``contributor`` (``username`` / ``ip``),
            ``comment``, ``text`` and ``parent_id``.
            NOTE(review): mwxml's documented API iterates a page to obtain its
            revisions and exposes the author as ``revision.user`` -- confirm
            these attribute names against the installed mwxml version.
            NOTE(review): the timestamp arithmetic below assumes datetime-like
            timestamps (``datetime - timestamp`` must yield a ``timedelta``) --
            confirm for the timestamp type mwxml returns.

    Returns:
        A flat dict of features (one CSV row), or ``None`` when the page has
        no revisions.
    """
    title = page.title
    ns = page.namespace
    page_id = page.id

    revisions = page.revisions
    if not revisions:
        return None

    # Select the newest/oldest revision by timestamp instead of by list
    # position, so the result does not depend on the order of the revisions.
    latest_revision = max(revisions, key=lambda rev: rev.timestamp)
    creation_date = min(rev.timestamp for rev in revisions)
    revision_id = latest_revision.id
    timestamp = latest_revision.timestamp
    contributor_name = _contributor_name(latest_revision)
    comment = latest_revision.comment

    # text
    text = latest_revision.text or ""
    character_count = len(text)
    words = word_tokenize(text)
    word_count = len(words)
    sentences = sent_tokenize(text)
    sentence_count = len(sentences)
    syllable_count = sum(count_syllables(word) for word in words)
    complex_word_count = sum(1 for word in words if is_complex_word(word))

    # structure
    # Heading patterns are anchored to whole lines. Level-2 headings count as
    # sections and level-3+ as subsections; the unanchored section pattern used
    # before also matched inside every deeper heading.
    section_count = len(re.findall(r'^==(?!=)[^\n]*?(?<!=)==[ \t]*$', text, re.MULTILINE))
    subsection_count = len(re.findall(r'^={3,}[^=\n][^\n]*?={3,}[ \t]*$', text, re.MULTILINE))
    paragraphs = re.split(r'\n\n+', text)
    paragraph_count = len(paragraphs)
    mean_paragraph_size = sum(len(paragraph.split()) for paragraph in paragraphs) / paragraph_count if paragraph_count > 0 else 0
    longest_section, shortest_section, mean_section_size = calculate_section_sizes(text)
    longest_shortest_ratio = longest_section / shortest_section if shortest_section > 0 else float('inf')
    # \b keeps <references/> from being counted as a citation.
    citation_count = len(re.findall(r'<ref\b[^>]*>', text))
    external_link_count = count_external_links(text)
    internal_link_count = count_internal_links(text)
    links_per_text_length = (external_link_count + internal_link_count) / character_count if character_count > 0 else 0
    image_count = count_images(text)
    images_per_text_length = image_count / character_count if character_count > 0 else 0

    # style
    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    mean_sentence_size = sum(sentence_lengths) / sentence_count if sentence_count > 0 else 0
    largest_sentence_size = max(sentence_lengths) if sentence_lengths else 0
    shortest_sentence_size = min(sentence_lengths) if sentence_lengths else 0
    question_count = count_questions(text)
    question_ratio = question_count / sentence_count if sentence_count > 0 else 0
    exclamation_count = count_exclamations(text)
    exclamation_ratio = exclamation_count / sentence_count if sentence_count > 0 else 0
    sentences_starting_with_pronoun_count = count_sentences_starting_with_pronoun(sentences)

    # review
    current_date = datetime.utcnow()
    article_age_days = (current_date - creation_date).days
    # Sort first so every interval is measured from an earlier to a later edit.
    revision_dates = sorted(rev.timestamp for rev in revisions)
    revision_intervals = [
        (later - earlier).days
        for earlier, later in zip(revision_dates, revision_dates[1:])
    ]
    mean_revision_age = sum(revision_intervals) / len(revision_intervals) if revision_intervals else 0
    review_count = len(revisions)
    reviews_per_day = review_count / article_age_days if article_age_days > 0 else 0
    # Revisions without an attributable author are left out of the user stats.
    contributor_counts = Counter(
        name for name in map(_contributor_name, revisions) if name is not None
    )
    user_count = len(contributor_counts)
    reviews_per_user = review_count / user_count if user_count > 0 else 0
    # A contributor is anonymous when identified by an IP address (v4 or v6);
    # everyone else is registered, so the two groups no longer overlap.
    anonymous_user_count = sum(1 for user in contributor_counts if _is_ip_address(user))
    registered_user_count = user_count - anonymous_user_count
    occasional_user_count = sum(1 for count in contributor_counts.values() if count <= 2)
    diversity = user_count / review_count if review_count > 0 else 0
    discussion_count = sum(1 for rev in revisions if rev.comment and 'talk' in rev.comment.lower())
    revert_count = calculate_revert_count(page)

    # quality
    quality = determine_quality(text)

    # NOTE: the keys 'question ratio' and 'sentences_starting_with_pronount'
    # are kept exactly as they were so the CSV column names stay unchanged.
    return {
        'title': title,
        'namespace': ns,
        'page_id': page_id,
        'revision_id': revision_id,
        'timestamp': timestamp,
        'contributor': contributor_name,
        'comment': comment,

        'character_count': character_count,
        'word_count': word_count,
        'sentence_count': sentence_count,
        'syllable_count': syllable_count,
        'complex_word_count': complex_word_count,

        'section_count': section_count,
        'subsection_count': subsection_count,
        'paragraph_count': paragraph_count,
        'mean_section_size': mean_section_size,
        'mean_paragraph_size': mean_paragraph_size,
        'longest_section_size': longest_section,
        'shortest_section_size': shortest_section,
        'longest_shortest_ratio': longest_shortest_ratio,
        'citation_count': citation_count,
        'external_link_count': external_link_count,
        'internal_link_count': internal_link_count,
        'links_per_text_length': links_per_text_length,
        'image_count': image_count,
        'images_per_text_length': images_per_text_length,

        'mean_sentence_size': mean_sentence_size,
        'largest_sentence_size': largest_sentence_size,
        'shortest_sentence_size': shortest_sentence_size,
        'question_count': question_count,
        'question ratio': question_ratio,
        'exclamation_count': exclamation_count,
        'exclamation_ratio': exclamation_ratio,
        'sentences_starting_with_pronount': sentences_starting_with_pronoun_count,

        'article_age_days': article_age_days,
        'mean_revision_age': mean_revision_age,
        'review_count': review_count,
        'reviews_per_day': reviews_per_day,
        'user_count': user_count,
        'reviews_per_user': reviews_per_user,
        'registered_user_count': registered_user_count,
        'anonymous_user_count': anonymous_user_count,
        'occasional_user_count': occasional_user_count,
        'diversity': diversity,
        'discussion_count': discussion_count,
        'revert_count': revert_count,

        'quality': quality
    }

In [None]:
# Parse the XML dump and extract data
data = []

# The file is opened in its own ``with`` block so the handle is closed even
# when parsing fails. The dump object is only iterated, not used as a context
# manager.
# If this cell fails with "ParseError: no element found: line 1, column 0",
# the XML file is empty -- re-run the decompression step.
with open(xml_file, 'rb') as xml_handle:
    dump = mwxml.Dump.from_file(xml_handle)
    for page in tqdm(dump, desc="Processing pages", unit=" pages"):
        if page.namespace == 0:  # Only consider articles in the main namespace
            page_data = process_page(page)
            if page_data:
                data.append(page_data)

# Ensure all data entries are dictionaries
data2 = [d for d in data if isinstance(d, dict)]

ParseError: no element found: line 1, column 0: b''... (<string>)

In [None]:
# Build one DataFrame row per processed page and write it to the CSV on Drive
# (without the index column).
df = pd.DataFrame(data=data2)
df.to_csv(path_or_buf=csv_file, index=False)

In [None]:
# Show the resulting DataFrame as the cell output.
df