In [None]:
# Mount Google Drive at /content/drive so the dump files and CSV outputs
# referenced by the paths below are reachable (google.colab is Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Text datasets

In [None]:
import bz2
import re
import pandas as pd
import xml.etree.ElementTree as ET

# Path to the first bz2-compressed enwiki dump shard on the mounted Drive
# (page ids 1-41242, per the "p1p41242" suffix in the file name)
file_path = '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream1.xml-p1p41242.bz2'

# Function to extract information from each page
def extract_page_info(page):
    """Pull the id, title and wikitext out of one MediaWiki ``<page>`` element.

    Parameters
    ----------
    page : xml.etree.ElementTree.Element
        A ``<page>`` element in the ``export-0.11`` namespace.

    Returns
    -------
    tuple
        ``(page_id, title, text)``. ``page_id`` and ``title`` are the raw
        element texts (``page_id`` is a string, not an int). ``text`` is always
        a ``str``: it is ``''`` when the revision or its ``<text>`` element is
        missing or empty.

    Raises
    ------
    AttributeError
        If the page has no direct ``<id>`` or ``<title>`` child.
    """
    ns = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}
    # Direct children only, so this is the page id and not the revision id.
    page_id = page.find('mw:id', ns).text
    title = page.find('mw:title', ns).text
    # findtext returns '' for an element without content (where .text would be
    # None) and the supplied default when the element does not exist, so the
    # regex matching downstream never receives None.
    text = page.findtext('mw:revision/mw:text', '', ns)
    return page_id, title, text

# Function to extract quality from the article text
def extract_quality(text):
    """Classify wikitext by the quality template it contains.

    Parameters
    ----------
    text : str or None
        Raw wikitext of a page. ``None`` and ``''`` are treated as having no
        template.

    Returns
    -------
    str
        ``'GA'`` if a ``{{Good article}}`` template is present, otherwise
        ``'FA'`` if a ``{{Featured article}}`` template is present, otherwise
        ``'NA'``. Matching is case-insensitive and tolerates whitespace inside
        the braces. The good-article check runs first, so it wins when both
        templates occur.

    Notes
    -----
    ``pandas.read_csv`` parses the literal string ``'NA'`` as a missing value
    by default, so this label does not survive a CSV round trip unless the
    reader disables default NA handling.
    """
    # Guard against missing text: re.search raises TypeError on None.
    if not text:
        return 'NA'
    if re.search(r'{{\s*Good article\s*}}', text, re.IGNORECASE):
        return 'GA'
    if re.search(r'{{\s*Featured article\s*}}', text, re.IGNORECASE):
        return 'FA'
    return 'NA'

# Column accumulators: one entry per <page> element, in document order
page_ids, titles, texts, qualities = [], [], [], []

# Fully-qualified tag of a page element in the export-0.11 schema
PAGE_TAG = '{http://www.mediawiki.org/xml/export-0.11/}page'

# Stream the compressed dump and handle each page as soon as it is complete
with bz2.open(file_path, 'rt', encoding='utf-8') as dump:
    for _evt, node in ET.iterparse(dump, events=('end',)):
        if node.tag != PAGE_TAG:
            continue
        pid, page_title, body = extract_page_info(node)
        label = extract_quality(body)
        page_ids.append(pid)
        titles.append(page_title)
        texts.append(body)
        qualities.append(label)
        node.clear()  # release the page's children once its fields are copied out

# Assemble the collected columns into the shard-1 DataFrame
df1 = pd.DataFrame(dict(page_id=page_ids, title=titles, text=texts, quality=qualities))


In [None]:
# Preview the shard-1 frame (pandas shows only the first and last rows)
df1

Unnamed: 0,page_id,title,text,quality
0,10,AccessibleComputing,#REDIRECT [[Computer accessibility]]\n\n{{rcat...,
1,12,Anarchism,{{short description|Political philosophy and m...,GA
2,13,AfghanistanHistory,#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,
3,14,AfghanistanGeography,#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,
4,15,AfghanistanPeople,#REDIRECT [[Demographics of Afghanistan]]\n\n{...,
...,...,...,...,...
27369,41237,Hierarchical routing,{{Short description|Network routing based on h...,
27370,41239,High-performance equipment,{{short description|Telecommunications equipme...,
27371,41240,Hop,{{Wiktionary|hop|hops|họp|hóp|hớp}}\nA '''hop'...,
27372,41241,Hop count,#REDIRECT[[Hop (networking)]]\n\n{{Redirect ca...,


In [None]:
# Count pages per quality label (GA / FA / NA) in shard 1
df1['quality'].value_counts()

quality
NA    26158
GA      792
FA      424
Name: count, dtype: int64

In [None]:
# Save shard 1 into the "Data Collection" folder, which is where the combining
# section reads text1.csv from. index=False keeps the row index out of the file;
# otherwise it comes back as an extra 'Unnamed: 0' column on reload.
df1.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection/text1.csv', index=False)

In [None]:
# Path to the second bz2-compressed enwiki dump shard on the mounted Drive
# (page ids 41243-151573, per the "p41243p151573" suffix in the file name)
file_path = '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream2.xml-p41243p151573.bz2'

# Function to extract information from each page
def extract_page_info(page):
    """Pull the id, title and wikitext out of one MediaWiki ``<page>`` element.

    NOTE(review): a function with this name is already defined in an earlier
    cell; this cell re-declares it and the later definition wins.

    Parameters
    ----------
    page : xml.etree.ElementTree.Element
        A ``<page>`` element in the ``export-0.11`` namespace.

    Returns
    -------
    tuple
        ``(page_id, title, text)``. ``page_id`` and ``title`` are the raw
        element texts (``page_id`` is a string, not an int). ``text`` is always
        a ``str``: it is ``''`` when the revision or its ``<text>`` element is
        missing or empty.

    Raises
    ------
    AttributeError
        If the page has no direct ``<id>`` or ``<title>`` child.
    """
    ns = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}
    # Direct children only, so this is the page id and not the revision id.
    page_id = page.find('mw:id', ns).text
    title = page.find('mw:title', ns).text
    # findtext returns '' for an element without content (where .text would be
    # None) and the supplied default when the element does not exist, so the
    # regex matching downstream never receives None.
    text = page.findtext('mw:revision/mw:text', '', ns)
    return page_id, title, text

# Function to extract quality from the article text
def extract_quality(text):
    """Classify wikitext by the quality template it contains.

    NOTE(review): a function with this name is already defined in an earlier
    cell; this cell re-declares it and the later definition wins.

    Parameters
    ----------
    text : str or None
        Raw wikitext of a page. ``None`` and ``''`` are treated as having no
        template.

    Returns
    -------
    str
        ``'GA'`` if a ``{{Good article}}`` template is present, otherwise
        ``'FA'`` if a ``{{Featured article}}`` template is present, otherwise
        ``'NA'``. Matching is case-insensitive and tolerates whitespace inside
        the braces. The good-article check runs first, so it wins when both
        templates occur.

    Notes
    -----
    ``pandas.read_csv`` parses the literal string ``'NA'`` as a missing value
    by default, so this label does not survive a CSV round trip unless the
    reader disables default NA handling.
    """
    # Guard against missing text: re.search raises TypeError on None.
    if not text:
        return 'NA'
    if re.search(r'{{\s*Good article\s*}}', text, re.IGNORECASE):
        return 'GA'
    if re.search(r'{{\s*Featured article\s*}}', text, re.IGNORECASE):
        return 'FA'
    return 'NA'

# Column accumulators: one entry per <page> element, in document order
page_ids, titles, texts, qualities = [], [], [], []

# Fully-qualified tag of a page element in the export-0.11 schema
PAGE_TAG = '{http://www.mediawiki.org/xml/export-0.11/}page'

# Stream the compressed dump and handle each page as soon as it is complete
with bz2.open(file_path, 'rt', encoding='utf-8') as dump:
    for _evt, node in ET.iterparse(dump, events=('end',)):
        if node.tag != PAGE_TAG:
            continue
        pid, page_title, body = extract_page_info(node)
        label = extract_quality(body)
        page_ids.append(pid)
        titles.append(page_title)
        texts.append(body)
        qualities.append(label)
        node.clear()  # release the page's children once its fields are copied out

# Assemble the collected columns into the shard-2 DataFrame
df2 = pd.DataFrame(dict(page_id=page_ids, title=titles, text=texts, quality=qualities))


In [None]:
# Preview the shard-2 frame (pandas shows only the first and last rows)
df2

Unnamed: 0,page_id,title,text,quality
0,41243,Hotline,{{Short description|Automatically directed poi...,
1,41244,Hybrid (biology),{{short description|Offspring of cross-species...,GA
2,41245,Hybrid balance,{{nosources|date=April 2019}}\nIn [[telecommun...,
3,41246,Hybrid transformer,{{Short description|Type of electrical transfo...,
4,41247,Hybrid routing,#redirect [[Routing in the PSTN]],
...,...,...,...,...
83497,151569,Toeplitz matrix,{{Short description|Matrix with shifting rows}...,
83498,151570,John Bird (actor),{{Short description|English actor (1936–2022)}...,
83499,151571,Charles Waterton,{{short description|English naturalist and exp...,
83500,151572,John Fortune,{{Short description|English actor and writer (...,


In [None]:
# Count pages per quality label (GA / FA / NA) in shard 2
df2['quality'].value_counts()

quality
NA    82034
GA     1063
FA      405
Name: count, dtype: int64

In [None]:
# Save shard 2 into the "Data Collection" folder, which is where the combining
# section reads text2.csv from. index=False keeps the row index out of the file;
# otherwise it comes back as an extra 'Unnamed: 0' column on reload.
df2.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection/text2.csv', index=False)

In [None]:
# Path to the third bz2-compressed enwiki dump shard on the mounted Drive
# (page ids 151574-311329, per the "p151574p311329" suffix in the file name)
file_path = '/content/drive/MyDrive/학교/Dissertation/enwiki-20240601-pages-articles-multistream3.xml-p151574p311329.bz2'

# Function to extract information from each page
def extract_page_info(page):
    """Pull the id, title and wikitext out of one MediaWiki ``<page>`` element.

    NOTE(review): a function with this name is already defined in earlier
    cells; this cell re-declares it and the later definition wins.

    Parameters
    ----------
    page : xml.etree.ElementTree.Element
        A ``<page>`` element in the ``export-0.11`` namespace.

    Returns
    -------
    tuple
        ``(page_id, title, text)``. ``page_id`` and ``title`` are the raw
        element texts (``page_id`` is a string, not an int). ``text`` is always
        a ``str``: it is ``''`` when the revision or its ``<text>`` element is
        missing or empty.

    Raises
    ------
    AttributeError
        If the page has no direct ``<id>`` or ``<title>`` child.
    """
    ns = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}
    # Direct children only, so this is the page id and not the revision id.
    page_id = page.find('mw:id', ns).text
    title = page.find('mw:title', ns).text
    # findtext returns '' for an element without content (where .text would be
    # None) and the supplied default when the element does not exist, so the
    # regex matching downstream never receives None.
    text = page.findtext('mw:revision/mw:text', '', ns)
    return page_id, title, text

# Function to extract quality from the article text
def extract_quality(text):
    """Classify wikitext by the quality template it contains.

    NOTE(review): a function with this name is already defined in earlier
    cells; this cell re-declares it and the later definition wins.

    Parameters
    ----------
    text : str or None
        Raw wikitext of a page. ``None`` and ``''`` are treated as having no
        template.

    Returns
    -------
    str
        ``'GA'`` if a ``{{Good article}}`` template is present, otherwise
        ``'FA'`` if a ``{{Featured article}}`` template is present, otherwise
        ``'NA'``. Matching is case-insensitive and tolerates whitespace inside
        the braces. The good-article check runs first, so it wins when both
        templates occur.

    Notes
    -----
    ``pandas.read_csv`` parses the literal string ``'NA'`` as a missing value
    by default, so this label does not survive a CSV round trip unless the
    reader disables default NA handling.
    """
    # Guard against missing text: re.search raises TypeError on None.
    if not text:
        return 'NA'
    if re.search(r'{{\s*Good article\s*}}', text, re.IGNORECASE):
        return 'GA'
    if re.search(r'{{\s*Featured article\s*}}', text, re.IGNORECASE):
        return 'FA'
    return 'NA'

# Column accumulators: one entry per <page> element, in document order
page_ids, titles, texts, qualities = [], [], [], []

# Fully-qualified tag of a page element in the export-0.11 schema
PAGE_TAG = '{http://www.mediawiki.org/xml/export-0.11/}page'

# Stream the compressed dump and handle each page as soon as it is complete
with bz2.open(file_path, 'rt', encoding='utf-8') as dump:
    for _evt, node in ET.iterparse(dump, events=('end',)):
        if node.tag != PAGE_TAG:
            continue
        pid, page_title, body = extract_page_info(node)
        label = extract_quality(body)
        page_ids.append(pid)
        titles.append(page_title)
        texts.append(body)
        qualities.append(label)
        node.clear()  # release the page's children once its fields are copied out

# Assemble the collected columns into the shard-3 DataFrame
df3 = pd.DataFrame(dict(page_id=page_ids, title=titles, text=texts, quality=qualities))


In [None]:
# Preview the shard-3 frame (pandas shows only the first and last rows)
df3

Unnamed: 0,page_id,title,text,quality
0,151574,West Bretton,{{Short description|Village and civil parish i...,
1,151575,Cathay Pacific,{{Short description|Flag carrier and largest a...,
2,151576,"Bretton Hall, West Yorkshire",{{For|the college|Bretton Hall College of Educ...,
3,151577,Causality (physics),{{Short description|Physics of the cause–effec...,
4,151578,Wetherby,{{Short description|Town and civil parish in W...,
...,...,...,...,...
89653,311318,Reginald Hill,{{Short description|British crime writer}}\n{{...,
89654,311320,1911 revolution,#REDIRECT [[1911 Revolution]],
89655,311326,Kangxi,#REDIRECT [[Kangxi Emperor]]\n\n{{pp-vandalism...,
89656,311328,Warlordism,#REDIRECT [[Warlord]],


In [None]:
# Count pages per quality label (GA / FA / NA) in shard 3
df3['quality'].value_counts()

quality
NA    87884
GA     1291
FA      483
Name: count, dtype: int64

In [None]:
# Save shard 3 into the "Data Collection" folder, which is where the combining
# section reads text3.csv from. index=False keeps the row index out of the file;
# otherwise it comes back as an extra 'Unnamed: 0' column on reload.
df3.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection/text3.csv', index=False)

### Combining dataframes

In [None]:
import pandas as pd

In [None]:
def _read_shard(path):
    """Load one per-shard CSV without losing literal strings to NA parsing.

    By default ``pd.read_csv`` converts tokens such as 'NA', 'NaN' and 'null'
    to missing values. That would blank out the 'NA' quality label and any
    title or text equal to one of those tokens. Only truly empty fields are
    treated as missing here. A leftover 'Unnamed: 0' column (the row index
    saved by ``to_csv``) is dropped when present.
    """
    frame = pd.read_csv(path, keep_default_na=False, na_values=[''])
    return frame.drop(columns=['Unnamed: 0'], errors='ignore')


df1 = _read_shard('/content/drive/MyDrive/학교/Dissertation/Data Collection/text1.csv')
df2 = _read_shard('/content/drive/MyDrive/학교/Dissertation/Data Collection/text2.csv')
df3 = _read_shard('/content/drive/MyDrive/학교/Dissertation/Data Collection/text3.csv')

In [None]:
# Inspect the reloaded shard-3 frame to check which columns came back from CSV
df3

Unnamed: 0.1,Unnamed: 0,page_id,title,text,quality
0,0,151574,West Bretton,{{Short description|Village and civil parish i...,
1,1,151575,Cathay Pacific,{{Short description|Flag carrier and largest a...,
2,2,151576,"Bretton Hall, West Yorkshire",{{For|the college|Bretton Hall College of Educ...,
3,3,151577,Causality (physics),{{Short description|Physics of the cause–effec...,
4,4,151578,Wetherby,{{Short description|Town and civil parish in W...,
...,...,...,...,...,...
89653,89653,311318,Reginald Hill,{{Short description|British crime writer}}\n{{...,
89654,89654,311320,1911 revolution,#REDIRECT [[1911 Revolution]],
89655,89655,311326,Kangxi,#REDIRECT [[Kangxi Emperor]]\n\n{{pp-vandalism...,
89656,89656,311328,Warlordism,#REDIRECT [[Warlord]],


In [None]:
# Stack the three shards into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each shard keeps its own labels, so the
# combined frame has duplicate row labels and label-based lookups become ambiguous.
text = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
# Restore the 'NA' label for rows whose quality was read back as missing.
# Assign the result instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form is deprecated in pandas 2.x and does
# not modify `text` under Copy-on-Write.
text['quality'] = text['quality'].fillna('NA')

In [None]:
# Preview the combined frame (pandas shows only the first and last rows)
text

Unnamed: 0,page_id,title,text,quality
0,10,AccessibleComputing,#REDIRECT [[Computer accessibility]]\n\n{{rcat...,
1,12,Anarchism,{{short description|Political philosophy and m...,GA
2,13,AfghanistanHistory,#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,
3,14,AfghanistanGeography,#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,
4,15,AfghanistanPeople,#REDIRECT [[Demographics of Afghanistan]]\n\n{...,
...,...,...,...,...
89653,311318,Reginald Hill,{{Short description|British crime writer}}\n{{...,
89654,311320,1911 revolution,#REDIRECT [[1911 Revolution]],
89655,311326,Kangxi,#REDIRECT [[Kangxi Emperor]]\n\n{{pp-vandalism...,
89656,311328,Warlordism,#REDIRECT [[Warlord]],


In [None]:
# Count pages per quality label (GA / FA / NA) across all three shards
text['quality'].value_counts()

quality
NA    196076
GA      3146
FA      1312
Name: count, dtype: int64

In [None]:
# Write the combined dataset to the "Data Collection" folder.
# NOTE(review): to_csv also writes the row index as an unnamed first column;
# pass index=False if downstream readers do not need it - confirm against the
# consumers of text.csv before changing the file layout.
text.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection/text.csv')