The original code:

In [None]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def strip_tag_name(t):
    t = elem.tag
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = os.path.join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathArticlesRedirect = os.path.join(PATH_WIKI_XML, FILENAME_REDIRECT)
pathTemplateRedirect = os.path.join(PATH_WIKI_XML, FILENAME_TEMPLATE)

totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0
title = None
start_time = time.time()

with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'redirect'])
    redirectWriter.writerow(['id', 'title', 'redirect'])
    templateWriter.writerow(['id', 'title'])
    
    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            if tname == 'page':
                title = ''
                id = -1
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True
            elif tname == 'title':
                    title = elem.text

            elif tname == 'id' and not inrevision and elem.text!=None:
                id = int(elem.text)
            elif tname == 'redirect':
                redirect = elem.get('title', '')
            elif tname == 'ns' and elem.text!=None:
                ns = int(elem.text)
        elif tname == 'page':
            totalCount += 1

            if ns == 10:
                templateCount += 1
                templateWriter.writerow([id, title])
            elif len(redirect) > 0:
                articleCount += 1
                articlesWriter.writerow([id, title, redirect])
            else:
                redirectCount += 1
                redirectWriter.writerow([id, title, redirect])

            if totalCount > 1 and (totalCount % 100000) == 0:
                print("{:,}".format(totalCount))

        elem.clear()
        
time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

passed the original code to chatGPT and below is the comment:

Your code snippet appears to be for processing a Wikipedia dump file and extracting information about articles, redirects, and templates. I've reviewed the code and found it to be well-written. However, I have a couple of suggestions to make the code more efficient and readable.

It's a good practice to use functions to break down the code into smaller, more manageable pieces. You can create functions for specific tasks like processing pages or writing to CSV files.

When parsing the XML, you might want to consider using a more efficient library like lxml instead of xml.etree.ElementTree. The lxml library can significantly speed up XML processing.

Here's a modified version of your code with the suggested improvements:

In [2]:
import os
import csv
import time
import codecs
from lxml import etree

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def strip_tag_name(t):
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

def process_page(title, id, ns, redirect, articlesWriter, redirectWriter, templateWriter):
    if ns == 10:
        templateWriter.writerow([id, title])
    elif len(redirect) > 0:
        articlesWriter.writerow([id, title, redirect])
    else:
        redirectWriter.writerow([id, title, redirect])

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = os.path.join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathArticlesRedirect = os.path.join(PATH_WIKI_XML, FILENAME_REDIRECT)
pathTemplateRedirect = os.path.join(PATH_WIKI_XML, FILENAME_TEMPLATE)

totalCount = 0
start_time = time.time()

with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'redirect'])
    redirectWriter.writerow(['id', 'title', 'redirect'])
    templateWriter.writerow(['id', 'title'])
    
    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            if tname == 'page':
                title = ''
                id = -1
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                inrevision = True
            elif tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
                id = int(elem.text)
            elif tname == 'redirect':
                redirect = elem.get('title', '')
            elif tname == 'ns':
                ns = int(elem.text)
            elif event == 'end' and tname == 'page':
                totalCount += 1
                process_page(title, id, ns, redirect, articlesWriter, redirectWriter, templateWriter)

        if totalCount > 1 and (totalCount % 100000) == 0:
            print("{:,}".format(totalCount))

        elem.clear()
    
    time_took = time.time() - start_time
    print(f"Total runtime: {hms_string(time_took)}")

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

This updated version includes a new `process_page` function that handles the processing and writing of data for each page. 

The `lxml` library has been imported to parse the XML file more efficiently. 

Other than these changes, the overall structure and functionality of the code remain the same.

Then I passed the XML sample format to chatGPT and below is the comment:

Thank you for providing the sample XML dump file from Wikipedia. It has the following structure:

1. `<mediawiki>`: The root element of the XML file, containing information about the Wikipedia dump.
2. `<siteinfo>`: Contains information about the Wikipedia site, such as the sitename, dbname, base URL, generator, case, and namespaces.
3. `<page>`: Represents a Wikipedia article or page.
    - `<title>`: The title of the article or page.
    - `<ns>`: The namespace of the article. For example, "0" for the main namespace.
    - `<id>`: The unique identifier of the article.
    - `<redirect>`: (Optional) If the page is a redirect, this element contains the title of the target page.
    - `<revision>`: Contains information about the latest revision of the article.
        - `<id>`: The unique identifier of the revision.
       - `<parentid>`: The unique identifier of the parent revision.
       - `<timestamp>`: The time when the revision was made.
       - `<contributor>`: Contains information about the user who made the revision.
            - `<username>`: The username of the contributor.
            - `<id>`: The unique identifier of the contributor.
        - `<comment>`: The comment left by the contributor for the revision.
        - `<model>`: The content model of the article.
        - `<format>`: The content format of the article.
        - `<text>`: The actual content of the article in WikiText format.
        - `<sha1>`: The SHA1 hash of the article content.

The exml file sample format:

In [None]:
# <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
#   <siteinfo>
#     <sitename>Wikipedia</sitename>
#     <dbname>enwiki</dbname>
#     <base>https://en.wikipedia.org/wiki/Main_Page</base>
#     <generator>MediaWiki 1.29.0-wmf.12</generator>
#     <case>first-letter</case>
#     <namespaces>
#       ...
#     </namespaces>
#   </siteinfo>
#   <page>
#     <title>AccessibleComputing</title>
#     <ns>0</ns>
#     <id>10</id>
#     <redirect title="Computer accessibility" />
#     <revision>
#       <id>631144794</id>
#       <parentid>381202555</parentid>
#       <timestamp>2014-10-26T04:50:23Z</timestamp>
#       <contributor>
#         <username>Paine Ellsworth</username>
#         <id>9092818</id>
#       </contributor>
#       <comment>add [[WP:RCAT|rcat]]s</comment>
#       <model>wikitext</model>
#       <format>text/x-wiki</format>
#       <text xml:space="preserve">#REDIRECT [[Computer accessibility]]

# \{\{Redr|move|from CamelCase|up\}\}</text>
#       <sha1>4ro7vvppa5kmm0o1egfjztzcwd0vabw</sha1>
#     </revision>
#   </page>
#   <page>
#     <title>Anarchism</title>
#     <ns>0</ns>
#     <id>12</id>
#     <revision>
#       <id>766348469</id>
#       <parentid>766047928</parentid>
#       <timestamp>2017-02-19T18:08:07Z</timestamp>
#       <contributor>
#         <username>GreenC bot</username>
#         <id>27823944</id>
#       </contributor>
#       <minor />
#       <comment>Reformat 1 archive link. [[User:Green Cardamom/WaybackMedic_2.1|Wayback Medic 2.1]]</comment>
#       <model>wikitext</model>
#       <format>text/x-wiki</format>

# <text xml:space="preserve">
# ...
# </text>
# </mediawiki>

chatGPT comment:

The Python code you provided earlier parses this XML structure and extracts the relevant information, such as the page ID, title, and redirect target (if applicable). It then writes this information to different CSV files for articles, redirects, and templates.

If you have any questions or need further assistance with the code, please feel free to ask.

The Python code you provided earlier parses this XML structure and extracts the relevant information, such as the page ID, title, and redirect target (if applicable). It then writes this information to different CSV files for articles, redirects, and templates.


Hoss:
the code need to create a csv file with columns including:

- Article title
- Synonyms: (titles which refer to the article as the redirect)
- Subjects: a Python list containing all of the subject tags that are referred to in the article
- Text: First introductory paragraph of the article

chatGPT:
I've modified the code to create a CSV file with the desired columns. Please note that extracting the first introductory paragraph from the text might not be perfect,

but this code will attempt to do so using the assumption that the first paragraph ends with the first period followed by two newline characters.


In [6]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_OUTPUT = 'output.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def strip_tag_name(t):
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

def get_first_paragraph(text):
    first_paragraph = re.search(r'(?s)(?<=\n\n).*?(?=\n\n)', text)
    return first_paragraph.group(0) if first_paragraph else ""

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

redirects = {}
articles = {}

start_time = time.time()

for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
    tname = strip_tag_name(elem.tag)

    if tname == 'page':
        title = elem.findtext('{*}title')
        # ns = int(elem.findtext('{*}ns'))
        ns_text = elem.findtext('{*}ns')
        ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
        # id = int(elem.findtext('{*}id'))
        id_text = elem.findtext('{*}id')
        id = int(id_text) if id_text is not None and id_text.isdigit() else None
        redirect = elem.find('{*}redirect')
        text_elem = elem.find('{*}revision/{*}text')

        if text_elem is not None:
            text = text_elem.text
            first_paragraph = get_first_paragraph(text)
        else:
            first_paragraph = ""

        if ns == 0:
            if redirect is not None:
                redirect_title = redirect.get('title', '')
                redirects[title] = redirect_title
            else:
                subjects = re.findall(r'\[\[Category:(.*?)\]\]', text)
                articles[title] = {"subjects": subjects, "intro": first_paragraph}

    elem.clear()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Text'])

    for title, data in articles.items():
        synonyms = [k for k, v in redirects.items() if v == title]
        outputWriter.writerow([title, synonyms, data["subjects"], data["intro"]])

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

Total runtime: 0:13:16.11


This code will create a CSV file named output.csv with the columns 'Article title', 'Synonyms', 'Subjects', and 'Text'. 

The 'Synonyms' column will contain the titles that redirect to the article, the 'Subjects' column will contain the list of 

subject tags referred to in the article, and the 'Text' column will contain the first introductory paragraph of the article.

ChatGPT:
I apologize for the oversight. It appears that there might be an issue with parsing the XML data. We can add a check to ensure that we have valid data before attempting the conversion. Please update the following line:

    ```python
    ns = int(elem.findtext('{*}ns'))
    ```

to

    ```python
    ns_text = elem.findtext('{*}ns')
    ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
    ```

This should resolve the error by ensuring that we only attempt to convert the namespace value if it is not None and contains digits.

the solution for next error:

    ValueError                                Traceback (most recent call last)
    Cell In[5], line 45
        43 ns_text = elem.findtext('{*}ns')
        44 ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
    ---> 45 id = int(elem.findtext('{*}id'))
        46 redirect = elem.find('{*}redirect')
        47 text_elem = elem.find('{*}revision/{*}text')

    ValueError: invalid literal for int() with base 10: ''

Apologies for the oversight. To resolve this error, let's update the code to handle cases where the 'id' element is missing or empty:

```python
id_text = elem.findtext('{*}id')
id = int(id_text) if id_text is not None and id_text.isdigit() else None
```

Please replace the line 45 with the above code snippet, and try running the code again.

Fixing the next error:

the code finished run, but the output.csv file is empty, nothing is saved into it, can you fix that?

My apologies for the issue. It seems like the code block that writes to the CSV file is missing. I have modified the code below, adding the necessary code to write the output to the CSV file:



Please replace your code with this updated version and run it again. This should create an 'output.csv' file containing the extracted data. 

Keep in mind that you may need to adjust the 'Subjects' extraction logic to suit your specific requirements, as I left it as an empty list placeholder for now.

In [7]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_OUTPUT = 'output.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Text'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            title = elem.findtext('{*}title')
            ns_text = elem.findtext('{*}ns')
            ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
            id_text = elem.findtext('{*}id')
            id = int(id_text) if id_text is not None and id_text.isdigit() else None
            redirect = elem.find('{*}redirect')
            text_elem = elem.find('{*}revision/{*}text')

            if ns == 0 and id is not None and text_elem is not None:
                synonyms = [redirect.get('title')] if redirect is not None else []
                subjects = []  # Add logic to extract subjects from the text
                text = text_elem.text.split('\n')[0]  # Extract the first line of the article text

                outputWriter.writerow([title, synonyms, subjects, text])

            elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")


Total runtime: 0:14:56.13


In [4]:
import pandas as pd
df_output=pd.read_csv('data/output-1.csv')


In [10]:
df_output.head(50)

Unnamed: 0,Article title,Synonyms,Subjects,Text
0,AccessibleComputing,['Computer accessibility'],[],#REDIRECT [[Computer accessibility]]
1,Anarchism,[],[],{{short description|Political philosophy and m...
2,AfghanistanHistory,['History of Afghanistan'],[],#REDIRECT [[History of Afghanistan]]
3,AfghanistanGeography,['Geography of Afghanistan'],[],#REDIRECT [[Geography of Afghanistan]]
4,AfghanistanPeople,['Demographics of Afghanistan'],[],#REDIRECT [[Demographics of Afghanistan]]
5,AfghanistanCommunications,['Communications in Afghanistan'],[],#REDIRECT [[Communications in Afghanistan]]
6,AfghanistanTransportations,['Transport in Afghanistan'],[],#REDIRECT [[Transport in Afghanistan]]
7,AfghanistanMilitary,['Afghan Armed Forces'],[],#REDIRECT [[Afghan Armed Forces]]
8,AfghanistanTransnationalIssues,['Foreign relations of Afghanistan'],[],#REDIRECT [[Foreign relations of Afghanistan]]
9,AssistiveTechnology,['Assistive technology'],[],#REDIRECT [[Assistive_technology]]


In [6]:
df_output.iloc[1]

Article title                                            Anarchism
Synonyms                                                        []
Subjects                                                        []
Text             {{short description|Political philosophy and m...
Name: 1, dtype: object

In [11]:
df_output.iloc[17].Text

'{{Short description|Ratio of how much light is reflected back from a body}}'

Hoss: let's improve code to extract more content

ChatGPT: 

To extract more content from the Wikipedia dump, let's modify the code to include the following improvements:

* Extract the whole introduction instead of just the first line.
* Extract all the links from the introduction and store them as a list.
* Extract subjects using the 'Category' links in the text.
* Here's the updated code:

In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_OUTPUT = 'output.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def extract_intro(text):
    intro = ''
    for line in text.split('\n'):
        if line.startswith('=='):
            break
        intro += line + '\n'
    return intro.strip()

def extract_links(text):
    return re.findall(r'\[\[([^\]]+)\]\]', text)

def extract_categories(links):
    return [link.split(':', 1)[1] for link in links if link.startswith('Category:')]

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Text', 'Links'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            title = elem.findtext('{*}title')
            ns_text = elem.findtext('{*}ns')
            ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
            id_text = elem.findtext('{*}id')
            id = int(id_text) if id_text is not None and id_text.isdigit() else None
            redirect = elem.find('{*}redirect')
            text_elem = elem.find('{*}revision/{*}text')

            if ns == 0 and id is not None and text_elem is not None:
                synonyms = [redirect.get('title')] if redirect is not None else []
                text = extract_intro(text_elem.text)
                links = extract_links(text)
                subjects = extract_categories(links)

                outputWriter.writerow([title, synonyms, subjects, text, links])

            elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")


Total runtime: 0:19:31.50


This updated code includes functions to extract the whole introduction, all links from the introduction, and subjects using the 'Category' links in the text. The 'output.csv' file will now contain the additional extracted content.

In [4]:
import pandas as pd
df_output2=pd.read_csv('data/output.csv')

In [5]:
df_output2.head(5)

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links
0,AccessibleComputing,['Computer accessibility'],[],#REDIRECT [[Computer accessibility]]\n\n{{rcat...,['Computer accessibility']
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m..."
2,AfghanistanHistory,['History of Afghanistan'],[],#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,['History of Afghanistan']
3,AfghanistanGeography,['Geography of Afghanistan'],[],#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,['Geography of Afghanistan']
4,AfghanistanPeople,['Demographics of Afghanistan'],[],#REDIRECT [[Demographics of Afghanistan]]\n\n{...,['Demographics of Afghanistan']


In [7]:
df_output2.iloc[0].Text

'#REDIRECT [[Computer accessibility]]\n\n{{rcat shell|\n{{R from move}}\n{{R from CamelCase}}\n{{R unprintworthy}}\n}}'

In [9]:
print(df_output2.iloc[1].Text)

{{short description|Political philosophy and movement}}
{{other uses}}
{{redirect2|Anarchist|Anarchists|other uses|Anarchist (disambiguation)}}
{{distinguish|Anarchy}}
{{good article}}
{{pp-semi-indef}}
{{use British English|date=August 2021}}
{{use dmy dates|date=August 2021}}
{{anarchism sidebar}}
{{basic forms of government}}

'''Anarchism''' is a [[political philosophy]] and [[Political movement|movement]] that is skeptical of all justifications for [[authority]] and seeks to abolish the [[institutions]] it claims maintain unnecessary [[coercion]] and [[Social hierarchy|hierarchy]], typically including, though not necessarily limited to, [[Government|governments]], [[State (polity)|nation states]],{{sfn|Suissa|2019b|ps=: "...as many anarchists have stressed, it is not government as such that they find objectionable, but the hierarchical forms of government associated with the nation state."}} and [[capitalism]]. Anarchism advocates for the replacement of the state with [[Stateless 

In [11]:
df_output2.iloc[1].Links

"['political philosophy', 'Political movement|movement', 'authority', 'institutions', 'coercion', 'Social hierarchy|hierarchy', 'Government|governments', 'State (polity)|nation states', 'capitalism', 'Stateless society|stateless societies', 'Free association (communism and anarchism)|free associations', 'left-wing', 'Far-left politics|farthest left', 'political spectrum', 'Communalism (Bookchin)|communalism', 'libertarian Marxism', 'libertarian', 'libertarian socialism', 'socialist movement', 'Humans', 'society|societies', 'realm', 'empire', 'scepticism', 'Age of Enlightenment|Enlightenment', 'emancipation', 'anarchist schools of thought', 'Paris Commune', 'Russian Civil War', 'Spanish Civil War', 'classical era of anarchism', 'anti-capitalist', 'anti-war', 'anti-globalisation', 'diversity of tactics', 'Praxis (process)|praxis']"

In [14]:
df_output2.iloc[1].Synonyms

'[]'

In [15]:
df_output2_fileterd=df_output2[df_output2.Synonyms=='[]']

In [17]:
df_output2.shape

(17112587, 5)

In [16]:
df_output2_fileterd.shape

(6637947, 5)

In [18]:
df_output2_fileterd.head(50)

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m..."
17,Albedo,[],[],{{Short description|Ratio of how much light is...,['File:Albedo-e hg.svg|thumb|upright=1.3|The p...
58,A,[],[],{{Short description|First letter of the Latin ...,"['Latin alpha|ɑ', 'A#Related characters|See be..."
65,Alabama,[],[],{{Short description|U.S. state}}\n{{about|the ...,"['Northern flicker|Yellowhammer', 'Dixie', 'Au..."
67,Achilles,[],[],{{short description|Greek mythological hero}}\...,['File:Achilles fighting against Memnon Leiden...
69,Abraham Lincoln,[],[],{{Short description|President of the United St...,['Alexander Gardner (photographer)|Alexander G...
70,Aristotle,[],[],{{Short description|Ancient Greek philosopher ...,"['Lysippos', 'Stagira (ancient city)|Stagira',..."
71,An American in Paris,[],[],{{short description|Symphonic tone poem by Geo...,"['George Gershwin', 'Orchestral jazz', 'Rhapso..."
72,Academy Award for Best Production Design,[],[],{{Short description|Academy Award which recogn...,['Academy of Motion Picture Arts and Sciences'...
73,Academy Awards,[],[],{{short description|Annual awards for cinemati...,['Academy of Motion Picture Arts and Sciences'...


I want you to adjust the 'Subjects' extraction logic to extract the  skills category or emerging skills or new skilss that mentioned in the text, at the moment it left blank as a empty list placeholder for now.

I understand that you want to extract specific categories related to skills from the text. To achieve this, you can modify the extract_categories function. First, you need to create a list of keywords related to the skills you want to extract (e.g., "Skills", "Emerging skills", "New skills"). Then, you can check if any of these keywords are present in the categories extracted from the text. Here's an updated version of the extract_categories function to accomplish this:

In [None]:
def extract_categories(links):
    categories = [link.split(':', 1)[1] for link in links if link.startswith('Category:')]
    skill_keywords = ["Skills", "Emerging skills", "New skills"]
    skill_categories = [cat for cat in categories if any(keyword in cat for keyword in skill_keywords)]
    return skill_categories


This modified function will now only return the categories that contain any of the keywords specified in the skill_keywords list. Please replace the existing extract_categories function in your code with this updated version. This should help you extract skill-related categories from the Wikipedia XML.

i want to keep the current category and add above as extra column

I see, you want to keep the current 'Subjects' column and add a new column for the skill-related categories. You can achieve this by modifying the output CSV structure and writing the skill categories to a new column. Here's the modified code:

1-Update the CSV header to include the new column:

In [None]:
outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Skill Categories', 'Text', 'Links'])


2-Modify the loop where you write the data to the CSV file to include the new column:

In [None]:
for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
    if elem.tag.endswith('page'):
        title = elem.findtext('{*}title')
        ns_text = elem.findtext('{*}ns')
        ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
        id_text = elem.findtext('{*}id')
        id = int(id_text) if id_text is not None and id_text.isdigit() else None
        redirect = elem.find('{*}redirect')
        text_elem = elem.find('{*}revision/{*}text')

        if ns == 0 and id is not None and text_elem is not None:
            synonyms = [redirect.get('title')] if redirect is not None else []
            text = extract_intro(text_elem.text)
            links = extract_links(text)
            all_categories = extract_categories(links)
            skill_categories = extract_skill_categories(links)

            outputWriter.writerow([title, synonyms, all_categories, skill_categories, text, links])

        elem.clear()


3-Add the extract_skill_categories function to your code:

In [None]:
def extract_skill_categories(links):
    categories = [link.split(':', 1)[1] for link in links if link.startswith('Category:')]
    skill_keywords = ["Skills", "Emerging skills", "New skills"]
    skill_categories = [cat for cat in categories if any(keyword in cat for keyword in skill_keywords)]
    return skill_categories


Now, your output CSV file will have an additional column for skill-related categories. The 'Subjects' column will still contain all categories, and the new 'Skill Categories' column will contain only skill-related categories.

----

#### the updated code so far:

In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_OUTPUT = 'output-2.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def extract_intro(text):
    intro = ''
    for line in text.split('\n'):
        if line.startswith('=='):
            break
        intro += line + '\n'
    return intro.strip()

def extract_links(text):
    return re.findall(r'\[\[([^\]]+)\]\]', text)

def extract_categories(links):
    return [link.split(':', 1)[1] for link in links if link.startswith('Category:')]

def extract_skill_categories(links):
    categories = [link.split(':', 1)[1] for link in links if link.startswith('Category:')]
    skill_keywords = ["Skills", "Emerging skills", "New skills"]
    skill_categories = [cat for cat in categories if any(keyword in cat for keyword in skill_keywords)]
    return skill_categories


pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Text', 'Links'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            title = elem.findtext('{*}title')
            ns_text = elem.findtext('{*}ns')
            ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
            id_text = elem.findtext('{*}id')
            id = int(id_text) if id_text is not None and id_text.isdigit() else None
            redirect = elem.find('{*}redirect')
            text_elem = elem.find('{*}revision/{*}text')

            if ns == 0 and id is not None and text_elem is not None:
                synonyms = [redirect.get('title')] if redirect is not None else []
                text = extract_intro(text_elem.text)
                links = extract_links(text)
                all_categories = extract_categories(links)
                skill_categories = extract_skill_categories(links)

                outputWriter.writerow([title, synonyms, all_categories, skill_categories, text, links])

            elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")


Total runtime: 0:19:44.57


In [3]:
import pandas as pd
df_output3=pd.read_csv('data/output-2.csv')

In [4]:
df_output3.shape

(17112587, 5)

In [5]:
df_output3.head(5)

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links
AccessibleComputing,['Computer accessibility'],[],[],#REDIRECT [[Computer accessibility]]\n\n{{rcat...,['Computer accessibility']
Anarchism,[],[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m..."
AfghanistanHistory,['History of Afghanistan'],[],[],#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,['History of Afghanistan']
AfghanistanGeography,['Geography of Afghanistan'],[],[],#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,['Geography of Afghanistan']
AfghanistanPeople,['Demographics of Afghanistan'],[],[],#REDIRECT [[Demographics of Afghanistan]]\n\n{...,['Demographics of Afghanistan']


In [6]:
df_output3[df_output3.Subjects!='[]']

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links
Ministry of Skills Development (Ontario),['Ministry of Colleges and Universities'],['Ontario government departments and agencies|...,['Ontario government departments and agencies|...,#REDIRECT [[Ministry of Colleges and Universit...,"['Ministry of Colleges and Universities', 'Cat..."
ConstructionSkills,['CITB'],['Sector Skills Councils'],['Sector Skills Councils'],#REDIRECT [[CITB]]\n\n[[Category:Sector Skills...,"['CITB', 'Category:Sector Skills Councils']"
Voluntary trades,['Compulsory trade'],"['Canadian labour law', 'Skills']",['Skills'],#REDIRECT [[Compulsory trade]] {{R from merge}...,"['Compulsory trade', 'Category:Canadian labour..."
"Minister for Regional NSW, Skills and Small Business (New South Wales)",['Minister for Regional New South Wales'],['Defunct government positions of New South Wa...,['Defunct government positions of New South Wa...,#REDIRECT [[Minister for Regional New South Wa...,"['Minister for Regional New South Wales', 'Cat..."
Minister for Skills and Tertiary Education (New South Wales),['Minister for Skills and Training (New South ...,['Defunct government positions of New South Wa...,['Defunct government positions of New South Wa...,#REDIRECT [[Minister for Skills and Training (...,['Minister for Skills and Training (New South ...
Minister for Skills (New South Wales),['Minister for Skills and Training (New South ...,['Defunct government positions of New South Wa...,['Defunct government positions of New South Wa...,#REDIRECT [[Minister for Skills and Training (...,['Minister for Skills and Training (New South ...


In [7]:
df_output3_fileterd=df_output3[df_output3.Synonyms=='[]']

In [8]:
df_output3_fileterd.shape

(16663899, 5)

In [9]:
df_output3_fileterd.head(5)

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links
AccessibleComputing,['Computer accessibility'],[],[],#REDIRECT [[Computer accessibility]]\n\n{{rcat...,['Computer accessibility']
Anarchism,[],[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m..."
AfghanistanHistory,['History of Afghanistan'],[],[],#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,['History of Afghanistan']
AfghanistanGeography,['Geography of Afghanistan'],[],[],#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,['Geography of Afghanistan']
AfghanistanPeople,['Demographics of Afghanistan'],[],[],#REDIRECT [[Demographics of Afghanistan]]\n\n{...,['Demographics of Afghanistan']


the ending of each page is like below

In [None]:
# [[Category:Anarchism| ]]
# [[Category:Anti-capitalism]]
# [[Category:Anti-fascism]]
# [[Category:Economic ideologies]]
# [[Category:Far-left politics]]
# [[Category:Left-wing politics]]
# [[Category:Libertarian socialism]]
# [[Category:Libertarianism]]
# [[Category:Political culture]]
# [[Category:Political ideologies]]
# [[Category:Political movements]]
# [[Category:Social theories]]
# [[Category:Socialism]]</text>
#       <sha1>htrz2d8b3y9uex85ob2tpxbelnf3hwz</sha1>
#     </revision>
#   </page>

Thank you for providing the ending of a page. I will modify the code to extract the categories, and add an additional column for the new categories you mentioned (skills category, emerging skills, or new skills). We can use a regular expression to search for these categories in the extracted categories.

Here's the updated code:

In [10]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_OUTPUT = 'output-3.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def extract_intro(text):
    intro = ''
    for line in text.split('\n'):
        if line.startswith('=='):
            break
        intro += line + '\n'
    return intro.strip()

def extract_links(text):
    return re.findall(r'\[\[([^\]]+)\]\]', text)

def extract_categories(links):
    return [link.split(':', 1)[1] for link in links if link.startswith('Category:')]

def extract_new_categories(categories):
    new_categories = []
    for category in categories:
        if re.search(r'(skill|emerging|new)', category, re.IGNORECASE):
            new_categories.append(category)
    return new_categories

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Text', 'Links', 'New Categories'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            title = elem.findtext('{*}title')
            ns_text = elem.findtext('{*}ns')
            ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
            id_text = elem.findtext('{*}id')
            id = int(id_text) if id_text is not None and id_text.isdigit() else None
            redirect = elem.find('{*}redirect')
            text_elem = elem.find('{*}revision/{*}text')

            if ns == 0 and id is not None and text_elem is not None:
                synonyms = [redirect.get('title')] if redirect is not None else []
                text = extract_intro(text_elem.text)
                links = extract_links(text)
                subjects = extract_categories(links)
                new_categories = extract_new_categories(subjects)

                outputWriter.writerow([title, synonyms, subjects, text, links, new_categories])

            elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")


Total runtime: 0:26:08.07


In [11]:
import pandas as pd
df_output4=pd.read_csv('data/output-3.csv')

In [12]:
df_output4.shape

(17112587, 6)

In [13]:
df_output4.head(5)

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links,New Categories
0,AccessibleComputing,['Computer accessibility'],[],#REDIRECT [[Computer accessibility]]\n\n{{rcat...,['Computer accessibility'],[]
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[]
2,AfghanistanHistory,['History of Afghanistan'],[],#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,['History of Afghanistan'],[]
3,AfghanistanGeography,['Geography of Afghanistan'],[],#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,['Geography of Afghanistan'],[]
4,AfghanistanPeople,['Demographics of Afghanistan'],[],#REDIRECT [[Demographics of Afghanistan]]\n\n{...,['Demographics of Afghanistan'],[]


In [16]:
df_output4.columns

Index(['Article title', 'Synonyms', 'Subjects', 'Text', 'Links',
       'New Categories'],
      dtype='object')

In [17]:
df_output4[df_output4['New Categories']!='[]']

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links,New Categories
7274,Fethry Duck,['Duck family (Disney)'],"['Animal superheroes', 'Anthropomorphic animal...",#REDIRECT [[Duck family (Disney)#Fethry Duck]]...,"['Duck family (Disney)#Fethry Duck', 'Category...",['New Age in popular culture']
8175,Gideon McDuck,['Clan McDuck'],"['Disney comics characters', 'Fictional charac...",#REDIRECT [[Clan McDuck#Gideon McDuck]]\n\n{{R...,"['Clan McDuck#Gideon McDuck', 'Category:Disney...",['Fictional newspaper editors']
9418,"International Tropical Timber Agreement, 1983",['International Tropical Timber Agreement'],"['Environmental treaties', 'Treaties concluded...",#REDIRECT [[International Tropical Timber Agre...,"['International Tropical Timber Agreement', 'C...","['Treaties of New Zealand', 'Treaties of Papua..."
9419,"International Tropical Timber Agreement, 1994",['International Tropical Timber Agreement'],"['Environmental treaties', 'Treaties concluded...",#REDIRECT [[International Tropical Timber Agre...,"['International Tropical Timber Agreement', 'C...","['Treaties of New Zealand', 'Treaties of Papua..."
13744,History of New Caledonia,['New Caledonia'],['National histories|New Caledonia'],#REDIRECT [[New Caledonia#History]]\n\n{{R to ...,"['New Caledonia#History', 'Category:National h...",['National histories|New Caledonia']
...,...,...,...,...,...,...
17111324,Mark Woodward (mountaineer),['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,"['New Zealand summiters of Mount Everest', 'Ne..."
17111325,Mike Roberts (mountaineer),['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,"['New Zealand summiters of Mount Everest', 'Ne..."
17111327,Dean Staples,['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,"['New Zealand summiters of Mount Everest', 'Ne..."
17111328,Mark Whetu,['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,"['New Zealand summiters of Mount Everest', 'Ne..."


In [18]:
df_output4[df_output4.Subjects!='[]']

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links,New Categories
11,Autism,['Autism spectrum'],['1910s neologisms'],#REDIRECT [[Autism spectrum]]\n{{rcat shell|\n...,"['Autism spectrum', 'Category:1910s neologisms']",[]
333,Abdul Alhazred,['Necronomicon'],"['Characters in short stories', 'Fictional alc...",#Redirect [[Necronomicon#Fictional history]]\n...,"['Necronomicon#Fictional history', 'Category:C...",[]
506,Argot,['Cant (language)'],"['Cant languages', 'Folklore', 'Linguistics te...",#REDIRECT [[Cant (language)#Argot]]\n{{Redirec...,"['Cant (language)#Argot', 'Category:Cant langu...",[]
514,Economy of Antarctica,['Antarctica'],"['Economy of Antarctica', 'Economies by contin...",#REDIRECT [[Antarctica#Economic_activity_and_t...,"['Antarctica#Economic_activity_and_tourism', '...",[]
691,Agasias,[],['Greek masculine given names'],'''Agasias''' was the name of several differen...,"['Agasias of Arcadia', 'Borghese Gladiator', '...",[]
...,...,...,...,...,...,...
17112034,Amidst the Chaos: Live from the Hollywood Bowl,['Sara Bareilles discography'],"['Sara Bareilles live albums', '2021 live albu...",#REDIRECT [[Sara_Bareilles_discography#Live_al...,"['Sara_Bareilles_discography#Live_albums', 'Ca...",[]
17112124,Khvicha,[],['Georgian masculine given names'],'''Khvicha''' ({{lang-ka|ხვიჩა}}) is a masculi...,"['Georgian given name', 'Khvicha Kvaratskhelia...",[]
17112194,Moor Allerton Golf Club,['Moor Allerton'],['Golf clubs and courses in Yorkshire'],#REDIRECT [[Moor Allerton#Moor Allerton Golf C...,"['Moor Allerton#Moor Allerton Golf Club', 'Cat...",[]
17112221,Kaylea Titford,['Killing of Kaylea Titford'],"['People with hydrocephalus', 'People with spi...",#REDIRECT [[Killing of Kaylea Titford]]\n\n\n[...,"['Killing of Kaylea Titford', 'Category:People...",[]


In [19]:
df_output4_fileterd=df_output4[df_output4.Synonyms=='[]']

In [20]:
df_output4_fileterd.shape

(6637947, 6)

In [21]:
df_output4_fileterd.head(5)

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links,New Categories
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[]
17,Albedo,[],[],{{Short description|Ratio of how much light is...,['File:Albedo-e hg.svg|thumb|upright=1.3|The p...,[]
58,A,[],[],{{Short description|First letter of the Latin ...,"['Latin alpha|ɑ', 'A#Related characters|See be...",[]
65,Alabama,[],[],{{Short description|U.S. state}}\n{{about|the ...,"['Northern flicker|Yellowhammer', 'Dixie', 'Au...",[]
67,Achilles,[],[],{{short description|Greek mythological hero}}\...,['File:Achilles fighting against Memnon Leiden...,[]


This updated code includes an additional function extract_new_categories that extracts the new categories you're interested in. It also adds an extra column to the CSV output called "New Categories" to store the extracted new categories.

i also wanted you to extra column and extract the category and put it in that column, keep the def extract_skill_categories( as per your previous suggestion)

In [22]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_OUTPUT = 'output-4.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def extract_intro(text):
    intro = ''
    for line in text.split('\n'):
        if line.startswith('=='):
            break
        intro += line + '\n'
    return intro.strip()

def extract_links(text):
    return re.findall(r'\[\[([^\]]+)\]\]', text)

def extract_categories(links):
    return [link.split(':', 1)[1] for link in links if link.startswith('Category:')]

def extract_skill_categories(categories):
    skill_categories = []
    for category in categories:
        if re.search(r'(skill|emerging|new)', category, re.IGNORECASE):
            skill_categories.append(category)
    return skill_categories

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Text', 'Links', 'Categories', 'Skill Categories'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            title = elem.findtext('{*}title')
            ns_text = elem.findtext('{*}ns')
            ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
            id_text = elem.findtext('{*}id')
            id = int(id_text) if id_text is not None and id_text.isdigit() else None
            redirect = elem.find('{*}redirect')
            text_elem = elem.find('{*}revision/{*}text')

            if ns == 0 and id is not None and text_elem is not None:
                synonyms = [redirect.get('title')] if redirect is not None else []
                text = extract_intro(text_elem.text)
                links = extract_links(text)
                categories = extract_categories(links)
                skill_categories = extract_skill_categories(categories)

                outputWriter.writerow([title, synonyms, subjects, text, links, categories, skill_categories])

            elem.clear()

time_took = time.time()


This updated code keeps the existing categories in the "Categories" column and adds the skill-related categories in a separate column called "Skill Categories".

In [23]:
import pandas as pd
df_output5=pd.read_csv('data/output-4.csv')


In [24]:
df_output5.shape


(17112587, 7)

In [25]:
df_output5.head(5)


Unnamed: 0,Article title,Synonyms,Subjects,Text,Links,Categories,Skill Categories
0,AccessibleComputing,['Computer accessibility'],[],#REDIRECT [[Computer accessibility]]\n\n{{rcat...,['Computer accessibility'],[],[]
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[],[]
2,AfghanistanHistory,['History of Afghanistan'],[],#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,['History of Afghanistan'],[],[]
3,AfghanistanGeography,['Geography of Afghanistan'],[],#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,['Geography of Afghanistan'],[],[]
4,AfghanistanPeople,['Demographics of Afghanistan'],[],#REDIRECT [[Demographics of Afghanistan]]\n\n{...,['Demographics of Afghanistan'],[],[]


In [28]:
df_output5.columns

Index(['Article title', 'Synonyms', 'Subjects', 'Text', 'Links', 'Categories',
       'Skill Categories'],
      dtype='object')

In [26]:
df_output5[df_output5.Subjects!='[]']


Unnamed: 0,Article title,Synonyms,Subjects,Text,Links,Categories,Skill Categories


In [29]:
df_output5[df_output5['Skill Categories']!='[]']

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links,Categories,Skill Categories
7274,Fethry Duck,['Duck family (Disney)'],[],#REDIRECT [[Duck family (Disney)#Fethry Duck]]...,"['Duck family (Disney)#Fethry Duck', 'Category...","['Animal superheroes', 'Anthropomorphic animal...",['New Age in popular culture']
8175,Gideon McDuck,['Clan McDuck'],[],#REDIRECT [[Clan McDuck#Gideon McDuck]]\n\n{{R...,"['Clan McDuck#Gideon McDuck', 'Category:Disney...","['Disney comics characters', 'Fictional charac...",['Fictional newspaper editors']
9418,"International Tropical Timber Agreement, 1983",['International Tropical Timber Agreement'],[],#REDIRECT [[International Tropical Timber Agre...,"['International Tropical Timber Agreement', 'C...","['Environmental treaties', 'Treaties concluded...","['Treaties of New Zealand', 'Treaties of Papua..."
9419,"International Tropical Timber Agreement, 1994",['International Tropical Timber Agreement'],[],#REDIRECT [[International Tropical Timber Agre...,"['International Tropical Timber Agreement', 'C...","['Environmental treaties', 'Treaties concluded...","['Treaties of New Zealand', 'Treaties of Papua..."
13744,History of New Caledonia,['New Caledonia'],[],#REDIRECT [[New Caledonia#History]]\n\n{{R to ...,"['New Caledonia#History', 'Category:National h...",['National histories|New Caledonia'],['National histories|New Caledonia']
...,...,...,...,...,...,...,...
17111324,Mark Woodward (mountaineer),['List of Mount Everest summiters by frequency'],[],#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,"['New Zealand summiters of Mount Everest', 'Ne...","['New Zealand summiters of Mount Everest', 'Ne..."
17111325,Mike Roberts (mountaineer),['List of Mount Everest summiters by frequency'],[],#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,"['New Zealand summiters of Mount Everest', 'Ne...","['New Zealand summiters of Mount Everest', 'Ne..."
17111327,Dean Staples,['List of Mount Everest summiters by frequency'],[],#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,"['New Zealand summiters of Mount Everest', 'Ne...","['New Zealand summiters of Mount Everest', 'Ne..."
17111328,Mark Whetu,['List of Mount Everest summiters by frequency'],[],#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,"['New Zealand summiters of Mount Everest', 'Ne...","['New Zealand summiters of Mount Everest', 'Ne..."


In [27]:
df_output5_fileterd=df_output5[df_output5.Synonyms=='[]']


In [30]:
df_output5_fileterd.shape


(6637947, 7)

In [31]:
df_output5_fileterd.head(5)

Unnamed: 0,Article title,Synonyms,Subjects,Text,Links,Categories,Skill Categories
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[],[]
17,Albedo,[],[],{{Short description|Ratio of how much light is...,['File:Albedo-e hg.svg|thumb|upright=1.3|The p...,[],[]
58,A,[],[],{{Short description|First letter of the Latin ...,"['Latin alpha|ɑ', 'A#Related characters|See be...",[],[]
65,Alabama,[],[],{{Short description|U.S. state}}\n{{about|the ...,"['Northern flicker|Yellowhammer', 'Dixie', 'Au...",[],[]
67,Achilles,[],[],{{short description|Greek mythological hero}}\...,['File:Achilles fighting against Memnon Leiden...,[],[]


still one item left that I need you to extract it, please wait for my instruction

the example of XML file for one title is as below:

In [None]:
#   <page>
#     <title>Anarchism</title>
#     <ns>0</ns>
#     <id>12</id>
#     <revision>
#       <id>1146698696</id>
#       <parentid>1146666587</parentid>
#       <timestamp>2023-03-26T12:58:48Z</timestamp>
#       <contributor>
#         <username>C.J. Griffin</username>
#         <id>3341545</id>
#       </contributor>
#       <comment>Undid revision 1146666587 by [[Special:Contributions/MallonsTalons|MallonsTalons]] ([[User talk:MallonsTalons|talk]]) Unnecessary given this is discussed in the third sentence of the same paragraph</comment>
#       <model>wikitext</model>
#       <format>text/x-wiki</format>
#       <text bytes="108975" xml:space="preserve">{{short description|Political philosophy and movement}}
# {{other uses}}
# {{redirect2|Anarchist|Anarchists|other uses|Anarchist (disambiguation)}}
# {{distinguish|Anarchy}}
# {{good article}}
# {{pp-semi-indef}}
# {{use British English|date=August 2021}}
# {{use dmy dates|date=August 2021}}
# {{anarchism sidebar}}
# {{basic forms of government}}

# '''Anarchism''' is a [[political philosophy]] and [[Political movement|movement]] that is skeptical of all justifications for [[authority]] and seeks to abolish the [[institutions]] it claims 
# == External links ==
# &lt;!-- Attention! The external link portion of this article regularly grows far beyond manageable size. PLEASE only list an outside link if it applies to anarchism in general, is somewhat noteworthy, and has consensus on the talkpage. Links to sites which cover anarchist submovements will be routinely moved to subarticles to keep this article free of clutter. --&gt;
# {{prone to spam|date=November 2014}}
# &lt;!-- {{no more links}}. Please be cautious adding more external links. Wikipedia is not a collection of links and should not be used for advertising. Excessive or inappropriate links will be removed. See [[Wikipedia:External links]] and [[Wikipedia:Spam]] for details. If there are already suitable links, propose additions or replacements on the article's talk page, or submit your link to the relevant category at Curlie (curlie.org) – and link there using {{curlie}}. --&gt;
# * {{Curlie|Society/Politics/Anarchism|Anarchism}}
# * [http://dwardmac.pitzer.edu/ Anarchy Archives]. [[Anarchy Archives]] is an online research center on the history and theory of anarchism.

# {{sister bar|wikt=yes|commons=yes|q=yes|s=yes|b=yes|d=yes|v=yes|n=yes}}
# {{anarchism}}
# {{anarchies}}
# {{libertarian socialism}}
# {{libertarianism}}
# {{philosophy topics}}
# {{political culture}}
# {{political ideologies}}
# {{social and political philosophy}}
# {{authority control}}

# [[Category:Anarchism| ]]
# [[Category:Anti-capitalism]]
# [[Category:Anti-fascism]]
# [[Category:Economic ideologies]]
# [[Category:Far-left politics]]
# [[Category:Left-wing politics]]
# [[Category:Libertarian socialism]]
# [[Category:Libertarianism]]
# [[Category:Political culture]]
# [[Category:Political ideologies]]
# [[Category:Political movements]]
# [[Category:Social theories]]
# [[Category:Socialism]]</text>
#       <sha1>htrz2d8b3y9uex85ob2tpxbelnf3hwz</sha1>
#     </revision>
#   </page>


Thank you for providing the example XML file. Based on this example, I will add another column to extract and store the 'External links' section in the output CSV file.

Here is the updated code with the added functionality:

In [None]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re


def extract_intro(text):
    # Your existing code
    ...


def extract_links(text):
    # Your existing code
    ...


def extract_categories(links):
    # Your existing code
    ...


def extract_skill_categories(links):
    # Your existing code
    ...


def extract_external_links(text):
    external_links_section = re.search(r'==\s*External links\s*==(.*)', text, re.DOTALL)
    if external_links_section:
        external_links = re.findall(r'\* (http[^\s]+)', external_links_section.group(1))
        return external_links
    return []


pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Skills', 'Text', 'Links', 'External Links'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            # Your existing code
            
            if ns == 0 and id is not None and text_elem is not None:
                # Your existing code

                external_links = extract_external_links(text_elem.text)
                outputWriter.writerow([title, synonyms, subjects, skills, text, links, external_links])

            elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")


This code now extracts the 'External links' section and stores it in a new 'External Links' column in the output CSV file.

In [2]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_OUTPUT = 'output-4.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def extract_intro(text):
    intro = ''
    for line in text.split('\n'):
        if line.startswith('=='):
            break
        intro += line + '\n'
    return intro.strip()

def extract_links(text):
    return re.findall(r'\[\[([^\]]+)\]\]', text)

def extract_categories(links):
    return [link.split(':', 1)[1] for link in links if link.startswith('Category:')]

def extract_skill_categories(categories):
    skill_categories = []
    for category in categories:
        if re.search(r'(skill|emerging|new)', category, re.IGNORECASE):
            skill_categories.append(category)
    return skill_categories

def extract_external_links(text):
    external_links_section = re.search(r'==\s*External links\s*==(.*)', text, re.DOTALL)
    if external_links_section:
        external_links = re.findall(r'\* (http[^\s]+)', external_links_section.group(1))
        return external_links
    return []

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article_title', 'Synonyms', 'Skills', 'Text', 'Links', 'External_Links'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            title = elem.findtext('{*}title')
            ns_text = elem.findtext('{*}ns')
            ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
            id_text = elem.findtext('{*}id')
            id = int(id_text) if id_text is not None and id_text.isdigit() else None
            redirect = elem.find('{*}redirect')
            text_elem = elem.find('{*}revision/{*}text')

            if ns == 0 and id is not None and text_elem is not None:
                synonyms = [redirect.get('title')] if redirect is not None else []
                text = extract_intro(text_elem.text)
                links = extract_links(text)
                categories = extract_categories(links)
                skills = extract_skill_categories(categories)
                external_links = extract_external_links(text_elem.text)
                outputWriter.writerow([title, synonyms, skills, text, links, external_links])

            elem.clear()

time_took = time.time()


In [3]:
import pandas as pd
df_output6=pd.read_csv('data/output-4.csv')

In [4]:
df_output6.shape

(17112587, 6)

In [5]:
df_output6.head(5)

Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links
0,AccessibleComputing,['Computer accessibility'],[],#REDIRECT [[Computer accessibility]]\n\n{{rcat...,['Computer accessibility'],[]
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[]
2,AfghanistanHistory,['History of Afghanistan'],[],#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,['History of Afghanistan'],[]
3,AfghanistanGeography,['Geography of Afghanistan'],[],#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,['Geography of Afghanistan'],[]
4,AfghanistanPeople,['Demographics of Afghanistan'],[],#REDIRECT [[Demographics of Afghanistan]]\n\n{...,['Demographics of Afghanistan'],[]


In [6]:
df_output6.columns

Index(['Article_title', 'Synonyms', 'Skills', 'Text', 'Links',
       'External_Links'],
      dtype='object')

In [7]:
df_output6[df_output6.Skills!='[]']

Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links
7274,Fethry Duck,['Duck family (Disney)'],['New Age in popular culture'],#REDIRECT [[Duck family (Disney)#Fethry Duck]]...,"['Duck family (Disney)#Fethry Duck', 'Category...",[]
8175,Gideon McDuck,['Clan McDuck'],['Fictional newspaper editors'],#REDIRECT [[Clan McDuck#Gideon McDuck]]\n\n{{R...,"['Clan McDuck#Gideon McDuck', 'Category:Disney...",[]
9418,"International Tropical Timber Agreement, 1983",['International Tropical Timber Agreement'],"['Treaties of New Zealand', 'Treaties of Papua...",#REDIRECT [[International Tropical Timber Agre...,"['International Tropical Timber Agreement', 'C...",[]
9419,"International Tropical Timber Agreement, 1994",['International Tropical Timber Agreement'],"['Treaties of New Zealand', 'Treaties of Papua...",#REDIRECT [[International Tropical Timber Agre...,"['International Tropical Timber Agreement', 'C...",[]
13744,History of New Caledonia,['New Caledonia'],['National histories|New Caledonia'],#REDIRECT [[New Caledonia#History]]\n\n{{R to ...,"['New Caledonia#History', 'Category:National h...",[]
...,...,...,...,...,...,...
17111324,Mark Woodward (mountaineer),['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,[]
17111325,Mike Roberts (mountaineer),['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,[]
17111327,Dean Staples,['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,[]
17111328,Mark Whetu,['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,[]


In [8]:
df_output6[df_output6.External_Links!='[]']

Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links
413,Economy of American Samoa,[],[],{{Multiple issues|\n{{update|date=May 2020}}\n...,"['United States dollar|US dollar', 'List of co...",['http://www.classbrain.com/art_cr/publish/ame...
449,Astrometry,[],[],{{short description|Branch of astronomy involv...,['Image:Interferometric astrometry.jpg|thumb|r...,['http://www.space.com/scienceastronomy/quaoar...
804,Amathus,[],[],{{for multi|the ancient city in modern-day Jor...,"['wikt:Ἀμαθοῦς|Ἀμαθοῦς', 'Amathus sarcophagus'...",['http://www.mcw.gov.cy/mcw/DA/DA.nsf/0/D20ED5...
1412,Arbroath Abbey,[],[],{{Use dmy dates|date=April 2022}}\n{{Infobox c...,"['Arbroath', 'William I of Scotland|King Willi...",['https://web.archive.org/web/20140220180351/h...
1670,Algemeen Nijmeegs Studentenblad,[],[],{{Unreferenced|date=March 2015}}\n\nThe '''''A...,"['Student newspaper|student magazine', 'Radbou...",['http://www.ans-online.nl/']
...,...,...,...,...,...,...
17103119,Szilvia Bognár,[],[],{{Infobox person/Wikidata|fetchwikidata=ALL}}\...,['Zoltán Kodály'],['http://bognarszilvia.hu/']
17104175,Bicycle Days (2023 film),[],[],{{Use dmy dates|date=March 2023}}\n{{Infobox f...,['Hindi'],"['https://www.imdb.com/title/tt22436930/', 'ht..."
17105123,Independent Police Conduct Commission,[],[],{{Short description|Malaysian government agenc...,"['Government of Malaysia', 'Ministry of Home A...",['https://www.eaic.gov.my/en/']
17105890,Damian Mymryk,[],[],{{short description|Russian painter}}\n\n{{Inf...,"['Ternopil Oblast', 'Ukrainian SSR', 'Lviv', '...",['http://www.ukrvytynanka.com.ua/UA/lvivska-ob...


In [9]:
df_output6_fileterd=df_output6[df_output6.Synonyms=='[]']

In [10]:
df_output6_fileterd.shape

(6637947, 6)

In [11]:
df_output6_fileterd.head(5)

Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[]
17,Albedo,[],[],{{Short description|Ratio of how much light is...,['File:Albedo-e hg.svg|thumb|upright=1.3|The p...,[]
58,A,[],[],{{Short description|First letter of the Latin ...,"['Latin alpha|ɑ', 'A#Related characters|See be...",[]
65,Alabama,[],[],{{Short description|U.S. state}}\n{{about|the ...,"['Northern flicker|Yellowhammer', 'Dixie', 'Au...",[]
67,Achilles,[],[],{{short description|Greek mythological hero}}\...,['File:Achilles fighting against Memnon Leiden...,[]


I want you to extract the Category, for example for above example the Categories that this title is belong to is [Anarchism,Anti-capitalism,Anti-fascism,Economic ideologies,Far-left politics,Left-wing politics,Libertarian socialism,Libertarianism,Political culture,Political ideologies,Political movements,Social theories,Socialism]

I understand you want to extract the categories listed at the end of the text. Here's the modified code that adds an additional column called "Categories" to store these values:

In [None]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re


def extract_intro(text):
    # Your existing code
    ...


def extract_links(text):
    # Your existing code
    ...


def extract_categories(links):
    # Your existing code
    ...


def extract_skill_categories(links):
    # Your existing code
    ...


def extract_external_links(text):
    # Your existing code
    ...


def extract_article_categories(text):
    category_links = re.findall(r'\[\[Category:([^\]]+)\]\]', text)
    return category_links


pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article title', 'Synonyms', 'Subjects', 'Skills', 'Text', 'Links', 'External Links', 'Categories'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            # Your existing code
            
            if ns == 0 and id is not None and text_elem is not None:
                # Your existing code

                external_links = extract_external_links(text_elem.text)
                article_categories = extract_article_categories(text_elem.text)
                outputWriter.writerow([title, synonyms, subjects, skills, text, links, external_links, article_categories])

            elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")


This code now extracts the categories from the article and stores them in a new "Categories" column in the output CSV file.

In [12]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_OUTPUT = 'output-5.csv'
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def extract_intro(text):
    intro = ''
    for line in text.split('\n'):
        if line.startswith('=='):
            break
        intro += line + '\n'
    return intro.strip()

def extract_links(text):
    return re.findall(r'\[\[([^\]]+)\]\]', text)

def extract_categories(links):
    return [link.split(':', 1)[1] for link in links if link.startswith('Category:')]

def extract_skill_categories(categories):
    skill_categories = []
    for category in categories:
        if re.search(r'(skill|emerging|new)', category, re.IGNORECASE):
            skill_categories.append(category)
    return skill_categories

def extract_external_links(text):
    external_links_section = re.search(r'==\s*External links\s*==(.*)', text, re.DOTALL)
    if external_links_section:
        external_links = re.findall(r'\* (http[^\s]+)', external_links_section.group(1))
        return external_links
    return []

def extract_article_categories(text):
    category_links = re.findall(r'\[\[Category:([^\]]+)\]\]', text)
    return category_links

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathOutput = os.path.join(PATH_WIKI_XML, FILENAME_OUTPUT)

start_time = time.time()

with codecs.open(pathOutput, "w", ENCODING) as outputFH:
    outputWriter = csv.writer(outputFH, quoting=csv.QUOTE_MINIMAL)
    outputWriter.writerow(['Article_title', 'Synonyms', 'Skills', 'Text', 'Links', 'External_Links', 'Categories'])

    for event, elem in etree.iterparse(pathWikiXML, events=('end',)):
        if elem.tag.endswith('page'):
            title = elem.findtext('{*}title')
            ns_text = elem.findtext('{*}ns')
            ns = int(ns_text) if ns_text is not None and ns_text.isdigit() else None
            id_text = elem.findtext('{*}id')
            id = int(id_text) if id_text is not None and id_text.isdigit() else None
            redirect = elem.find('{*}redirect')
            text_elem = elem.find('{*}revision/{*}text')

            if ns == 0 and id is not None and text_elem is not None:
                synonyms = [redirect.get('title')] if redirect is not None else []
                text = extract_intro(text_elem.text)
                links = extract_links(text)
                categories = extract_categories(links)
                skills = extract_skill_categories(categories)
                external_links = extract_external_links(text_elem.text)
                article_categories = extract_article_categories(text_elem.text)
                outputWriter.writerow([title, synonyms, skills, text, links, external_links, article_categories])

            elem.clear()

time_took = time.time()
print(f"Total runtime: {hms_string(time_took)}")


Total runtime: 467170:37:02.93


In [13]:
import pandas as pd
df_output7=pd.read_csv('data/output-5.csv')



In [14]:
df_output7.shape



(17112587, 7)

In [15]:
df_output7.head(5)



Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links,Categories
0,AccessibleComputing,['Computer accessibility'],[],#REDIRECT [[Computer accessibility]]\n\n{{rcat...,['Computer accessibility'],[],[]
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[],"['Anarchism| ', 'Anti-capitalism', 'Anti-fasci..."
2,AfghanistanHistory,['History of Afghanistan'],[],#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,['History of Afghanistan'],[],[]
3,AfghanistanGeography,['Geography of Afghanistan'],[],#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,['Geography of Afghanistan'],[],[]
4,AfghanistanPeople,['Demographics of Afghanistan'],[],#REDIRECT [[Demographics of Afghanistan]]\n\n{...,['Demographics of Afghanistan'],[],[]


In [16]:
df_output7.columns


Index(['Article_title', 'Synonyms', 'Skills', 'Text', 'Links',
       'External_Links', 'Categories'],
      dtype='object')

In [17]:
df_output7[df_output7.Categories!='[]']



Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links,Categories
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[],"['Anarchism| ', 'Anti-capitalism', 'Anti-fasci..."
11,Autism,['Autism spectrum'],[],#REDIRECT [[Autism spectrum]]\n{{rcat shell|\n...,"['Autism spectrum', 'Category:1910s neologisms']",[],['1910s neologisms']
17,Albedo,[],[],{{Short description|Ratio of how much light is...,['File:Albedo-e hg.svg|thumb|upright=1.3|The p...,[],"['Land surface effects on climate', 'Climate c..."
58,A,[],[],{{Short description|First letter of the Latin ...,"['Latin alpha|ɑ', 'A#Related characters|See be...",[],"['ISO basic Latin letters', 'Vowel letters']"
65,Alabama,[],[],{{Short description|U.S. state}}\n{{about|the ...,"['Northern flicker|Yellowhammer', 'Dixie', 'Au...",[],"['Alabama| ', '1819 establishments in the Unit..."
...,...,...,...,...,...,...,...
17112564,Ivory Coast women's national rugby union team,[],[],{{Infobox national rugby team\n| Name ...,"['Ivorian Rugby Federation', 'Ivory Coast', ""w...",[],"[""African national women's rugby union teams"",..."
17112572,Aminu Abdussalam Gwarzo,[],[],{{Short description|Nigerian politician (born ...,"['File:Comrade.jpg|thumb|', 'Abba Kabir Yusuf'...",[],"['1960 births', '21st-century Nigerian politic..."
17112573,Chahardi,[],[],"{{Short description|Village in Maharashtra, In...","['List of sovereign states|Country', 'States a...",[],['Villages in Jalgaon district']
17112580,Jews Against White Nationalism,[],[],{{Use mdy dates|date=April 2023}}\n{{Infobox o...,['Nonprofit organization|Nonprofit'],[],"['2019 establishments in the United States', '..."


In [18]:
df_output7[df_output7['Skills']!='[]']


Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links,Categories
7274,Fethry Duck,['Duck family (Disney)'],['New Age in popular culture'],#REDIRECT [[Duck family (Disney)#Fethry Duck]]...,"['Duck family (Disney)#Fethry Duck', 'Category...",[],"['Animal superheroes', 'Anthropomorphic animal..."
8175,Gideon McDuck,['Clan McDuck'],['Fictional newspaper editors'],#REDIRECT [[Clan McDuck#Gideon McDuck]]\n\n{{R...,"['Clan McDuck#Gideon McDuck', 'Category:Disney...",[],"['Disney comics characters', 'Fictional charac..."
9418,"International Tropical Timber Agreement, 1983",['International Tropical Timber Agreement'],"['Treaties of New Zealand', 'Treaties of Papua...",#REDIRECT [[International Tropical Timber Agre...,"['International Tropical Timber Agreement', 'C...",[],"['Environmental treaties', 'Treaties concluded..."
9419,"International Tropical Timber Agreement, 1994",['International Tropical Timber Agreement'],"['Treaties of New Zealand', 'Treaties of Papua...",#REDIRECT [[International Tropical Timber Agre...,"['International Tropical Timber Agreement', 'C...",[],"['Environmental treaties', 'Treaties concluded..."
13744,History of New Caledonia,['New Caledonia'],['National histories|New Caledonia'],#REDIRECT [[New Caledonia#History]]\n\n{{R to ...,"['New Caledonia#History', 'Category:National h...",[],['National histories|New Caledonia']
...,...,...,...,...,...,...,...
17111324,Mark Woodward (mountaineer),['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,[],"['New Zealand summiters of Mount Everest', 'Ne..."
17111325,Mike Roberts (mountaineer),['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,[],"['New Zealand summiters of Mount Everest', 'Ne..."
17111327,Dean Staples,['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,[],"['New Zealand summiters of Mount Everest', 'Ne..."
17111328,Mark Whetu,['List of Mount Everest summiters by frequency'],"['New Zealand summiters of Mount Everest', 'Ne...",#REDIRECT [[List of Mount Everest summiters by...,['List of Mount Everest summiters by frequency...,[],"['New Zealand summiters of Mount Everest', 'Ne..."


In [19]:
df_output7_fileterd=df_output7[df_output7.Synonyms=='[]']



In [20]:
df_output7_fileterd.shape



(6637947, 7)

In [21]:
df_output7_fileterd.head(5)

Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links,Categories
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[],"['Anarchism| ', 'Anti-capitalism', 'Anti-fasci..."
17,Albedo,[],[],{{Short description|Ratio of how much light is...,['File:Albedo-e hg.svg|thumb|upright=1.3|The p...,[],"['Land surface effects on climate', 'Climate c..."
58,A,[],[],{{Short description|First letter of the Latin ...,"['Latin alpha|ɑ', 'A#Related characters|See be...",[],"['ISO basic Latin letters', 'Vowel letters']"
65,Alabama,[],[],{{Short description|U.S. state}}\n{{about|the ...,"['Northern flicker|Yellowhammer', 'Dixie', 'Au...",[],"['Alabama| ', '1819 establishments in the Unit..."
67,Achilles,[],[],{{short description|Greek mythological hero}}\...,['File:Achilles fighting against Memnon Leiden...,[],"['Greek mythological heroes', 'Kings of the My..."


In [22]:
df_output7_fileterd.iloc[0].Categories


"['Anarchism| ', 'Anti-capitalism', 'Anti-fascism', 'Economic ideologies', 'Far-left politics', 'Left-wing politics', 'Libertarian socialism', 'Libertarianism', 'Political culture', 'Political ideologies', 'Political movements', 'Social theories', 'Socialism']"

In [23]:
df_output7_fileterd.to_csv('data/wiki_output_redirect_filtered_18_04_23_v2.csv',index=False)

In [3]:
import pandas as pd
df_output7_fileterd=pd.read_csv('data/wiki_output_redirect_filtered_18_04_23_v2.csv')

In [25]:
category_of_interest=['Technology']
df_output7_fileterd[df_output7_fileterd.Categories.str.contains('|'.join(category_of_interest))]

Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links,Categories
252,Apple Inc.,[],[],{{Short description|American multinational tec...,"['Apple Park', 'Cupertino, California', 'Silic...",[],"['Apple Inc.| ', '1976 establishments in Calif..."
258,Andy Warhol,[],[],"{{Short description|American artist, film dire...","['Pittsburgh', 'Printmaking', 'Carnegie Mellon...",[],"['Andy Warhol| ', '1928 births', '1987 deaths'..."
291,Arne Kaijser,[],[],{{Short description|Swedish academic (born 195...,"['Professor', 'KTH Royal Institute of Technolo...",[],"['1950 births', 'Living people', '20th-century..."
322,Albert Speer,[],[],{{short description|Architect and Minister of ...,['Reich Minister of Armaments and War Producti...,[],"['Albert Speer| ', '1905 births', '1981 deaths..."
467,Applet,[],[],{{short description|Small software application...,"['computing', 'application (computing)|applica...",[],"['Technology neologisms', 'Component-based sof..."
...,...,...,...,...,...,...,...
17098176,Joseph J. Berry,[],[],{{Short description|American scientist}}\n{{Or...,"['National Renewable Energy Laboratory', 'Nati...",[],"['Year of birth missing (living people)', 'Liv..."
17098221,Hadas Shachnai,[],[],{{Short description|Israeli computer scientist...,"['combinatorial optimization', 'knapsack probl...",[],"['Year of birth missing (living people)', 'Liv..."
17101760,Deborah Leckband,[],[],{{Short description|American chemist and acade...,"['University of Illinois at Urbana-Champaign',...",[],"['1959 births', 'Living people', 'American wom..."
17106918,Jeyhan Kartaltepe,[],[],{{Short description|American astronomer and ac...,"['National Optical Astronomy Observatory', 'Ro...",[],"['Living people', 'People from San Antonio', '..."


In [26]:
df_output7_fileterd.iloc[0]

"['Anarchism| ', 'Anti-capitalism', 'Anti-fascism', 'Economic ideologies', 'Far-left politics', 'Left-wing politics', 'Libertarian socialism', 'Libertarianism', 'Political culture', 'Political ideologies', 'Political movements', 'Social theories', 'Socialism']"

In [28]:
df_output7_fileterd

Unnamed: 0,Article_title,Synonyms,Skills,Text,Links,External_Links,Categories
1,Anarchism,[],[],{{short description|Political philosophy and m...,"['political philosophy', 'Political movement|m...",[],"Anarchism| , Anti-capitalism, Anti-fascism, Ec..."
17,Albedo,[],[],{{Short description|Ratio of how much light is...,['File:Albedo-e hg.svg|thumb|upright=1.3|The p...,[],"Land surface effects on climate, Climate chang..."
58,A,[],[],{{Short description|First letter of the Latin ...,"['Latin alpha|ɑ', 'A#Related characters|See be...",[],"ISO basic Latin letters, Vowel letters"
65,Alabama,[],[],{{Short description|U.S. state}}\n{{about|the ...,"['Northern flicker|Yellowhammer', 'Dixie', 'Au...",[],"Alabama| , 1819 establishments in the United S..."
67,Achilles,[],[],{{short description|Greek mythological hero}}\...,['File:Achilles fighting against Memnon Leiden...,[],"Greek mythological heroes, Kings of the Myrmid..."
...,...,...,...,...,...,...,...
17112564,Ivory Coast women's national rugby union team,[],[],{{Infobox national rugby team\n| Name ...,"['Ivorian Rugby Federation', 'Ivory Coast', ""w...",[],"""African national womens rugby union teams"", R..."
17112572,Aminu Abdussalam Gwarzo,[],[],{{Short description|Nigerian politician (born ...,"['File:Comrade.jpg|thumb|', 'Abba Kabir Yusuf'...",[],"1960 births, 21st-century Nigerian politicians..."
17112573,Chahardi,[],[],"{{Short description|Village in Maharashtra, In...","['List of sovereign states|Country', 'States a...",[],Villages in Jalgaon district
17112580,Jews Against White Nationalism,[],[],{{Use mdy dates|date=April 2023}}\n{{Infobox o...,['Nonprofit organization|Nonprofit'],[],"2019 establishments in the United States, Jewi..."


In [None]:
# df=df_output7_fileterd.copy()

# # remove the square brackets and single quotes
# df.loc[:, 'Categories'] = df_output7_fileterd['Categories'].str.strip('[]').str.replace("'", "")

# # remove rows with NaN values
# df = df.dropna(subset=['Categories'])

# # split the categories into separate strings
# categories = df['Categories'].str.split(',', expand=True).values.flatten()

# # remove leading and trailing whitespace from the category names
# categories = [category.strip() for category in categories if isinstance(category, str)]

# # get unique category names
# unique_categories = list(set(categories))

# # display the unique category names
# print(unique_categories)

In [5]:
len(unique_categories)

3183769

In [6]:
type(unique_categories)

list

In [8]:
unique_categories[1:100]

['Science fiction film characters|Cody',
 '2007 crimes in Azerbaijan|Baku',
 'Australian tenors',
 '2015 in sports in Kentucky|Road to the Kentucky Oaks',
 'People from Günzburg (district)',
 'Davis Cup Asia/Oceania Zone|',
 'Fungi described in 1880',
 'FC Lokomotiv Moscow matches|Soviet Cup Final 1990',
 'People educated at St Marylebone School',
 'Kodak EasyShare cameras|Z712 IS',
 'Museums in Tver Oblast',
 'Terrorist incidents in Asia in 1959',
 '17th century in Berkshire',
 'Concepts',
 'Culture in Warwickshire',
 'Parks in Pike County',
 'Mass media in Örebro',
 'Hurricanes in Texas|Barry (1983)',
 'Malaysian companies established in 1998',
 'United States senators from Delaware',
 'Lists of Commonwealth Games medallists|Archery',
 'Wichita tribe',
 'Philippines at the Asian Games|2006',
 'Military units and formations of the United States Navy|',
 'Defunct newspapers published in Serbia',
 'Elections in Franche-Comté|',
 'Clivina|erythropyga',
 'People from Steenwijkerland',
 'S

In [9]:
# open a file in write mode
with open('data/category_list.txt', 'w') as file:
    # write the contents of the list to the file
    for item in unique_categories:
        file.write('%s\n' % item)