__Files Created__

dump (page_id, title, text, quality)
- text1.csv
- text2.csv
- text3.csv

titles
- titles.csv
- titles_r.csv

In [None]:
# Mount Google Drive at /content/drive; every input and output path in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Dump data processing

Before collecting data through the Wikipedia API, a list of article titles is created from the dump files to reduce the time spent requesting data

In [None]:
import bz2
import re
import pandas as pd
import xml.etree.ElementTree as ET

__The first dump__

In [None]:
# Path to the compressed file: going by its name, this is the first multistream
# chunk of the 2024-06-01 English Wikipedia pages-articles dump (page ids 1-41242).
file_path = '/content/drive/MyDrive/학교/Dissertation/Data Collection2/enwiki-20240601-pages-articles-multistream1.xml-p1p41242.bz2'

# Function to extract information from each page
def extract_page_info(page):
    """Pull the id, title and wikitext out of one MediaWiki ``<page>`` element.

    Args:
        page: An ``xml.etree.ElementTree.Element`` for a ``<page>`` node in the
            ``export-0.11`` namespace.

    Returns:
        A ``(page_id, title, text)`` tuple. ``page_id`` and ``title`` are the
        text of the page's direct ``<id>`` and ``<title>`` children. ``text``
        is the content of ``<revision>/<text>``, or ``''`` when the revision
        or text node is absent or empty.
    """
    ns = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}
    page_id = page.find('mw:id', ns).text
    title = page.find('mw:title', ns).text
    revision = page.find('mw:revision', ns)
    text_node = revision.find('mw:text', ns) if revision is not None else None
    # Element.text is None for an empty element (<text />); fall back to '' so
    # the result is always a string, as it already was for a missing node.
    text = (text_node.text or '') if text_node is not None else ''
    return page_id, title, text

# Function to extract quality from the article text (not used for the research)
def extract_quality(text):
    """Label wikitext as 'GA', 'FA' or 'NA' based on its quality template.

    Args:
        text: The article wikitext. ``None`` or an empty string is accepted
            and treated as having no quality template.

    Returns:
        'GA' if the text contains ``{{Good article}}``, otherwise 'FA' if it
        contains ``{{Featured article}}``, otherwise 'NA'. Matching ignores
        case and allows whitespace just inside the braces. GA is checked
        first, so a text carrying both templates is labelled 'GA'.

    Note:
        The literal string 'NA' is parsed as a missing value by
        ``pandas.read_csv`` with default settings.
    """
    # Nothing to search: avoids a TypeError from re when text is None.
    if not text:
        return 'NA'

    # Define regex patterns for GA and FA
    ga_pattern = re.compile(r'{{\s*Good article\s*}}', re.IGNORECASE)
    fa_pattern = re.compile(r'{{\s*Featured article\s*}}', re.IGNORECASE)

    if ga_pattern.search(text):
        return 'GA'
    if fa_pattern.search(text):
        return 'FA'
    return 'NA'

# Per-page fields copied out of the dump, one list per output column
page_ids, titles, texts, qualities = [], [], [], []

# Fully qualified tag name of a <page> element in the export-0.11 schema
page_tag = '{http://www.mediawiki.org/xml/export-0.11/}page'

# Stream the bz2-compressed XML and handle each element as soon as it closes
with bz2.open(file_path, 'rt', encoding='utf-8') as dump:
    for _, node in ET.iterparse(dump, events=('end',)):
        if node.tag != page_tag:
            continue
        pid, name, body = extract_page_info(node)
        page_ids.append(pid)
        titles.append(name)
        texts.append(body)
        qualities.append(extract_quality(body))
        node.clear()  # release the page's children once its fields are copied out

# Assemble the columns into a DataFrame
df1 = pd.DataFrame(
    dict(page_id=page_ids, title=titles, text=texts, quality=qualities)
)

In [None]:
# Preview the parsed first dump (pandas shows only the first and last rows)
df1

Unnamed: 0,page_id,title,text,quality
0,10,AccessibleComputing,#REDIRECT [[Computer accessibility]]\n\n{{rcat...,
1,12,Anarchism,{{short description|Political philosophy and m...,GA
2,13,AfghanistanHistory,#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,
3,14,AfghanistanGeography,#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,
4,15,AfghanistanPeople,#REDIRECT [[Demographics of Afghanistan]]\n\n{...,
...,...,...,...,...
27369,41237,Hierarchical routing,{{Short description|Network routing based on h...,
27370,41239,High-performance equipment,{{short description|Telecommunications equipme...,
27371,41240,Hop,{{Wiktionary|hop|hops|họp|hóp|hớp}}\nA '''hop'...,
27372,41241,Hop count,#REDIRECT[[Hop (networking)]]\n\n{{Redirect ca...,


In [None]:
# Save the full parsed first dump (page_id, title, text, quality) without the row index.
# NOTE: 'quality' holds the literal string 'NA' for unrated pages; pandas.read_csv
# parses 'NA' as missing by default, so read back with keep_default_na=False to keep it.
df1.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection2/text1.csv', index=False)

__The second dump__

In [None]:
# Path to the compressed file (second multistream chunk)
file_path = '/content/drive/MyDrive/학교/Dissertation/Data Collection2/enwiki-20240601-pages-articles-multistream2.xml-p41243p151573.bz2'

# Per-page fields copied out of the dump, one list per output column
page_ids, titles, texts, qualities = [], [], [], []

# Fully qualified tag name of a <page> element in the export-0.11 schema
page_tag = '{http://www.mediawiki.org/xml/export-0.11/}page'

# Stream the bz2-compressed XML and handle each element as soon as it closes
with bz2.open(file_path, 'rt', encoding='utf-8') as dump:
    for _, node in ET.iterparse(dump, events=('end',)):
        if node.tag != page_tag:
            continue
        pid, name, body = extract_page_info(node)
        page_ids.append(pid)
        titles.append(name)
        texts.append(body)
        qualities.append(extract_quality(body))
        node.clear()  # release the page's children once its fields are copied out

# Assemble the columns into a DataFrame
df2 = pd.DataFrame(
    dict(page_id=page_ids, title=titles, text=texts, quality=qualities)
)

In [None]:
# Preview the parsed second dump (pandas shows only the first and last rows)
df2

Unnamed: 0,page_id,title,text,quality
0,41243,Hotline,{{Short description|Automatically directed poi...,
1,41244,Hybrid (biology),{{short description|Offspring of cross-species...,GA
2,41245,Hybrid balance,{{nosources|date=April 2019}}\nIn [[telecommun...,
3,41246,Hybrid transformer,{{Short description|Type of electrical transfo...,
4,41247,Hybrid routing,#redirect [[Routing in the PSTN]],
...,...,...,...,...
83497,151569,Toeplitz matrix,{{Short description|Matrix with shifting rows}...,
83498,151570,John Bird (actor),{{Short description|English actor (1936–2022)}...,
83499,151571,Charles Waterton,{{short description|English naturalist and exp...,
83500,151572,John Fortune,{{Short description|English actor and writer (...,


In [None]:
# Save the full parsed second dump (page_id, title, text, quality) without the row index.
# NOTE: 'quality' holds the literal string 'NA' for unrated pages; pandas.read_csv
# parses 'NA' as missing by default, so read back with keep_default_na=False to keep it.
df2.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection2/text2.csv', index=False)

__The third dump__

In [None]:
# Path to the compressed file (third multistream chunk)
file_path = '/content/drive/MyDrive/학교/Dissertation/Data Collection2/enwiki-20240601-pages-articles-multistream3.xml-p151574p311329.bz2'

# Per-page fields copied out of the dump, one list per output column
page_ids, titles, texts, qualities = [], [], [], []

# Fully qualified tag name of a <page> element in the export-0.11 schema
page_tag = '{http://www.mediawiki.org/xml/export-0.11/}page'

# Stream the bz2-compressed XML and handle each element as soon as it closes
with bz2.open(file_path, 'rt', encoding='utf-8') as dump:
    for _, node in ET.iterparse(dump, events=('end',)):
        if node.tag != page_tag:
            continue
        pid, name, body = extract_page_info(node)
        page_ids.append(pid)
        titles.append(name)
        texts.append(body)
        qualities.append(extract_quality(body))
        node.clear()  # release the page's children once its fields are copied out

# Assemble the columns into a DataFrame
df3 = pd.DataFrame(
    dict(page_id=page_ids, title=titles, text=texts, quality=qualities)
)

In [None]:
# Preview the parsed third dump (pandas shows only the first and last rows)
df3

Unnamed: 0,page_id,title,text,quality
0,151574,West Bretton,{{Short description|Village and civil parish i...,
1,151575,Cathay Pacific,{{Short description|Flag carrier and largest a...,
2,151576,"Bretton Hall, West Yorkshire",{{For|the college|Bretton Hall College of Educ...,
3,151577,Causality (physics),{{Short description|Physics of the cause–effec...,
4,151578,Wetherby,{{Short description|Town and civil parish in W...,
...,...,...,...,...
89653,311318,Reginald Hill,{{Short description|British crime writer}}\n{{...,
89654,311320,1911 revolution,#REDIRECT [[1911 Revolution]],
89655,311326,Kangxi,#REDIRECT [[Kangxi Emperor]]\n\n{{pp-vandalism...,
89656,311328,Warlordism,#REDIRECT [[Warlord]],


In [None]:
# Save the full parsed third dump (page_id, title, text, quality) without the row index.
# NOTE: 'quality' holds the literal string 'NA' for unrated pages; pandas.read_csv
# parses 'NA' as missing by default, so read back with keep_default_na=False to keep it.
df3.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection2/text3.csv', index=False)

### Extracting titles
- Unnecessary pages, such as redirect pages, Wiktionary pages, and pages related to Wikipedia itself, are removed
- Rows whose text starts with '#REDIRECT', '#redirect', '{{wiktionary', '{{Wiktionary', '{{wikitionary', or '{{Wikitionary' are removed
- Rows whose title starts with 'Wikipedia' are removed



In [None]:
# Remove unnecessary articles: rows whose text begins with a redirect marker or
# a Wiktionary template. Prefixes are matched case-sensitively at position 0.
# NOTE(review): other capitalisations such as '#Redirect' are not matched here;
# confirm whether those pages should be dropped as well.
unwanted_prefixes = (
    '#REDIRECT', '#redirect',
    '{{wiktionary', '{{Wiktionary', '{{wikitionary', '{{Wikitionary',
)
# na=False counts rows with missing text as non-matching, so the mask stays boolean.
is_unwanted = df1['text'].str.startswith(unwanted_prefixes, na=False)
# Build a filtered frame with a fresh 0..n-1 index instead of dropping in place.
df1 = df1[~is_unwanted].reset_index(drop=True)
df1

Unnamed: 0,page_id,title,text,quality
0,12,Anarchism,{{short description|Political philosophy and m...,GA
1,39,Albedo,{{Short description|Ratio of how much light is...,
2,290,A,{{Short description|First letter of the Latin ...,
3,303,Alabama,{{Short description|U.S. state}}\n{{about|the ...,
4,305,Achilles,{{short description|Greek mythological hero}}\...,
...,...,...,...,...
20371,41233,H channel,{{Short description|High-speed communication c...,
20372,41234,Heterodyne,{{Short description|Signal processing techniqu...,
20373,41236,Heuristic routing,'''Heuristic routing''' is a system used to de...,
20374,41237,Hierarchical routing,{{Short description|Network routing based on h...,


In [None]:
# Keep only the title and quality columns of the filtered first dump
title1 = df1.loc[:, ['title', 'quality']]
title1

Unnamed: 0,title,quality
0,Anarchism,GA
1,Albedo,
2,A,
3,Alabama,
4,Achilles,
...,...,...
20371,H channel,
20372,Heterodyne,
20373,Heuristic routing,
20374,Hierarchical routing,


In [None]:
# Remove unnecessary articles: rows whose text begins with a redirect marker or
# a Wiktionary template. Prefixes are matched case-sensitively at position 0.
# NOTE(review): other capitalisations such as '#Redirect' are not matched here;
# confirm whether those pages should be dropped as well.
unwanted_prefixes = (
    '#REDIRECT', '#redirect',
    '{{wiktionary', '{{Wiktionary', '{{wikitionary', '{{Wikitionary',
)
# na=False counts rows with missing text as non-matching, so the mask stays boolean.
is_unwanted = df2['text'].str.startswith(unwanted_prefixes, na=False)
# Build a filtered frame with a fresh 0..n-1 index instead of dropping in place.
df2 = df2[~is_unwanted].reset_index(drop=True)
df2

Unnamed: 0,page_id,title,text,quality
0,41243,Hotline,{{Short description|Automatically directed poi...,
1,41244,Hybrid (biology),{{short description|Offspring of cross-species...,GA
2,41245,Hybrid balance,{{nosources|date=April 2019}}\nIn [[telecommun...,
3,41246,Hybrid transformer,{{Short description|Type of electrical transfo...,
4,41248,Hydroxyl ion absorption,{{Short description|Transmission loss in optic...,
...,...,...,...,...
65620,151569,Toeplitz matrix,{{Short description|Matrix with shifting rows}...,
65621,151570,John Bird (actor),{{Short description|English actor (1936–2022)}...,
65622,151571,Charles Waterton,{{short description|English naturalist and exp...,
65623,151572,John Fortune,{{Short description|English actor and writer (...,


In [None]:
# Keep only the title and quality columns of the filtered second dump
title2 = df2.loc[:, ['title', 'quality']]
title2

Unnamed: 0,title,quality
0,Hotline,
1,Hybrid (biology),GA
2,Hybrid balance,
3,Hybrid transformer,
4,Hydroxyl ion absorption,
...,...,...
65620,Toeplitz matrix,
65621,John Bird (actor),
65622,Charles Waterton,
65623,John Fortune,


In [None]:
# Remove unnecessary articles: rows whose text begins with a redirect marker or
# a Wiktionary template. Prefixes are matched case-sensitively at position 0.
# NOTE(review): other capitalisations such as '#Redirect' are not matched here;
# confirm whether those pages should be dropped as well.
unwanted_prefixes = (
    '#REDIRECT', '#redirect',
    '{{wiktionary', '{{Wiktionary', '{{wikitionary', '{{Wikitionary',
)
# na=False counts rows with missing text as non-matching, so the mask stays boolean.
is_unwanted = df3['text'].str.startswith(unwanted_prefixes, na=False)
# Build a filtered frame with a fresh 0..n-1 index instead of dropping in place.
df3 = df3[~is_unwanted].reset_index(drop=True)
df3

Unnamed: 0,page_id,title,text,quality
0,151574,West Bretton,{{Short description|Village and civil parish i...,
1,151575,Cathay Pacific,{{Short description|Flag carrier and largest a...,
2,151576,"Bretton Hall, West Yorkshire",{{For|the college|Bretton Hall College of Educ...,
3,151577,Causality (physics),{{Short description|Physics of the cause–effec...,
4,151578,Wetherby,{{Short description|Town and civil parish in W...,
...,...,...,...,...
52980,311305,Intelligentsia,{{short description|Status class of university...,
52981,311306,Boch Center,{{Infobox venue\n| name = Th...,
52982,311312,Wang Theatre,"{{Short description|Theater in Boston, Massach...",
52983,311315,Shubert Theatre,'''Shubert Theatre''' or '''Shubert Theater'''...,


In [None]:
# Keep only the title and quality columns of the filtered third dump
title3 = df3.loc[:, ['title', 'quality']]
title3

Unnamed: 0,title,quality
0,West Bretton,
1,Cathay Pacific,
2,"Bretton Hall, West Yorkshire",
3,Causality (physics),
4,Wetherby,
...,...,...
52980,Intelligentsia,
52981,Boch Center,
52982,Wang Theatre,
52983,Shubert Theatre,


----------

In [None]:
# Stack the three per-dump title tables into one frame with a fresh 0..n-1 index.
# This rebinds `titles`, which the parsing cells above used as a plain list.
titles = pd.concat([title1, title2, title3], ignore_index=True)
titles

Unnamed: 0,title,quality
0,Anarchism,GA
1,Albedo,
2,A,
3,Alabama,
4,Achilles,
...,...,...
138981,Intelligentsia,
138982,Boch Center,
138983,Wang Theatre,
138984,Shubert Theatre,


In [None]:
# Cast every title to str; presumably so the .str.startswith calls that follow
# see a string in every row (a missing title becomes its string representation).
titles['title'] = titles['title'].astype(str)

---------

In [None]:
# Inspect the rows whose title begins with 'Wikipedia' before removing them
titles[titles['title'].str.startswith('Wikipedia')]

Unnamed: 0,title,quality
1699,Wikipedia:Building Wikipedia membership,
1700,Wikipedia:Building Wikipedia membership/Encycl...,
1701,Wikipedia:Building Wikipedia membership/Sample...,
1702,Wikipedia:Building Wikipedia membership/Anothe...,
1703,Wikipedia:Building Wikipedia membership/Ideas ...,
...,...,...
137532,Wikipedia:Community case RK,
137751,Wikipedia:Politics,
137777,Wikipedia:Political dispute,
138219,Wikipedia:WikiProject Aircraft,


In [None]:
# Remove unnecessary articles: keep only rows whose title does not begin with
# 'Wikipedia', then renumber the index from 0.
keep = ~titles['title'].str.startswith('Wikipedia')
titles = titles[keep].reset_index(drop=True)
titles

Unnamed: 0,title,quality
0,Anarchism,GA
1,Albedo,
2,A,
3,Alabama,
4,Achilles,
...,...,...
138412,Intelligentsia,
138413,Boch Center,
138414,Wang Theatre,
138415,Shubert Theatre,


In [None]:
# Save the combined, filtered title list (title, quality) without the row index.
# NOTE: a literal 'NA' in 'quality' is parsed as missing by pandas.read_csv defaults;
# read back with keep_default_na=False if the label must be preserved.
titles.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection2/titles.csv', index=False)

### Extracting titles for quality class, FA and GA

- When article quality is collected through the Wikipedia API, most FA and GA articles come back with NULL values, even though their text includes the FA or GA tag
- To get an idea of which articles might be FA or GA (this may differ from the dump, as articles get updated), a list of the titles whose articles appear to be FA or GA in the dump is created

In [None]:
# Count titles per quality label. value_counts() leaves out missing values by default.
# NOTE(review): the recorded counts sum to 55,408 while `titles` has 138,416 rows,
# which suggests many rows have a missing quality; use value_counts(dropna=False) to confirm.
titles['quality'].value_counts()

quality
NA    50950
GA     3146
FA     1312
Name: count, dtype: int64

In [None]:
# Subset of titles whose dump text carried a Good Article or Featured Article tag
titles_r = titles[titles['quality'].isin(['GA', 'FA'])]
titles_r

Unnamed: 0,title,quality
0,Anarchism,GA
5,Abraham Lincoln,GA
6,Aristotle,GA
14,Ayn Rand,GA
34,Amphibian,FA
...,...,...
138304,Bat'leth,GA
138306,Starflight,GA
138364,Margaret Fuller,FA
138372,St Pancras railway station,GA


In [None]:
# Save the GA/FA title list.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is not
# passed here, so the (non-contiguous) row positions from `titles` are written as an
# unnamed first column. Confirm this is intended before relying on the file layout.
titles_r.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection2/titles_r.csv')