In [1]:
#!pip install beautifulsoup
#!wget https://dumps.wikimedia.org/enwiki/20190420/enwiki-20190420-pages-articles-multistream1.xml-p10p30302.bz2
#!bunzip2 enwiki-20190420-pages-articles-multistream1.xml-p10p30302.bz2    

In [2]:
# Count lines, words, characters in XML data file
!wc enwiki-20190420-pages-articles-multistream1.xml-p10p30302

 4586335 74563548 642819893 enwiki-20190420-pages-articles-multistream1.xml-p10p30302


In [1]:
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def get_pages_from_xml_file(filename, start_tag='<page>', end_tag='</page>'):
    """Yields each page from the specified XML data file."""
    page = None
    with open(filename) as f:
        for line in f:
            if start_tag in line:
                page = []
                page.append(line)
            elif end_tag in line:
                page.append(line)
                page_xml = ''.join(page)
                yield page_xml
                page = None
            else:
                if page is not None:
                    page.append(line)

In [3]:
filename = 'enwiki-20190420-pages-articles-multistream1.xml-p10p30302'
pages = list(get_pages_from_xml_file(filename))

In [4]:
def get_title_from_page_xml(page):
    """Returns the title of the given page."""
    soup = BeautifulSoup(page)
    return soup.select_one('title').text

In [5]:
len(pages)  # Show the number of pages

19822

In [6]:
titles = [get_title_from_page_xml(page) for page in pages[:200]]

In [7]:
titles[:5] + ['...'] + titles[-5:]  # Show the first and last 5 titles

['AccessibleComputing',
 'Anarchism',
 'AfghanistanHistory',
 'AfghanistanGeography',
 'AfghanistanPeople',
 '...',
 'Albania/Transnational Issues',
 'Albania/People',
 'Albania/Foreign relations',
 'Agnostida',
 'Abortion']

In [35]:
print(pages[0])  # Print the first page

  <page>
    <title>AccessibleComputing</title>
    <ns>0</ns>
    <id>10</id>
    <redirect title="Computer accessibility" />
    <revision>
      <id>854851586</id>
      <parentid>834079434</parentid>
      <timestamp>2018-08-14T06:47:24Z</timestamp>
      <contributor>
        <username>Godsy</username>
        <id>23257138</id>
      </contributor>
      <comment>remove from category for seeking instructions on rcats</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">#REDIRECT [[Computer accessibility]]

{{R from move}}
{{R from CamelCase}}
{{R unprintworthy}}</text>
      <sha1>42l0cvblwtb4nnupxm6wo000d27t6kf</sha1>
    </revision>
  </page>



In [9]:
def get_text_from_page_xml(page):
    """Returns the text of the given page."""
    soup = BeautifulSoup(page)
    return soup.select_one('text').text

In [26]:
page = pages[0]
soup = BeautifulSoup(page)
text = get_text_from_page_xml(page)
print(text), type(text)

#REDIRECT [[Computer accessibility]]

{{R from move}}
{{R from CamelCase}}
{{R unprintworthy}}


(None, str)

In [24]:
data = []
for page in pages[:1000]:
    row = {
        'title': get_title_from_page_xml(page),
        'text': get_text_from_page_xml(page),
    }
    data.append(row)

In [25]:
df = pd.DataFrame(data)
df['text_length'] = df['text'].apply(lambda x: len(x))
df = df.sort_values('text_length', ascending=False)
df.head()

Unnamed: 0,text,title,text_length
254,{{redirect|Apple (company)|other companies of ...,Apple Inc.,309208
439,{{Redirect|AI|other uses|AI (disambiguation)|a...,Artificial intelligence,252857
185,{{About|the country}}\n{{pp-sock|small=yes}}\n...,Albania,243520
203,{{Use American English|date=February 2019}}\n{...,American Revolutionary War,239938
395,{{For|total or partial opposition to [[Judaism...,Antisemitism,233417


In [29]:
# Checking out my longest page
print(df['text'][254])

{{redirect|Apple (company)|other companies of similar name|Apple (disambiguation) #Brands and enterprises}}
{{pp-semi-indef}}
{{short description|Technology company; developer of consumer electronics and multimedia platforms}}
{{Use American English|date=April 2015}}
{{Use mdy dates|date=February 2019}}
{{coord|37.33182|-122.03118|region:US-CA|display=title}}
{{Infobox company
| name = Apple Inc.
| logo = Apple logo black.svg
| logo_size = 80px
| image = Aerial view of Apple Park dllu.jpg
| image_size = 260px
| image_caption = [[Apple Park]] in [[Cupertino, California]], April 2018
| former_name = {{Unbulleted list|Apple Computer Company|(1976–1977)|Apple Computer, Inc.|(1977–2007)}}
| type = [[Public company|Public]]
| traded_as = {{plainlist|
* {{NASDAQ|AAPL}}
* [[NASDAQ-100|NASDAQ-100 component]]
* [[Dow Jones Industrial Average|DJIA component]]
* [[S&P 100|S&P 100 component]]
* [[S&P 500 Index|S&P 500 component]]
}}
| ISIN = US0378331005
| industry = {{plainlist|
* [[Computer hardw

### Challenge

#### 1. Create a Pandas dataframe containing the title and text of each page.

* Implement the `get_text_from_page_xml` function above.
* Re-create the dataframe with the text field filled in.

#### 2. Identify the five pages that have the _longest_ text.

* Find the length of each page's `<text>...</text>` element and add it to your dataframe.
* Sort the data frame by text length, descending.
* What are the titles of the five longest articles?

## Notes:
* I was able to get it to run by leaving off the 'lxml' modifier from BeautifulSoup.
* I cut it down to less pages for the purpose of testing to make sure things work. 