In [None]:
import pandas as pd
import numpy as np
import wikipediaapi
import re

In [90]:
import wikipediaapi

wiki = wikipediaapi.Wikipedia(
    user_agent='Tyler/DSAN5400_project',
    language='en'
)

page = wiki.page('List of last words')

if page.exists():

    for section in page.sections:
        print(f"Section: {section.title}")
        print(f"Level: {section.level}")  
        
        for subsection in section.sections:
            print(f"  Subsection: {subsection.title}")

Section: Chronological list of last words
Level: 1
  Subsection: Pre-5th century
  Subsection: 5th to 14th centuries
  Subsection: 15th century
  Subsection: 16th century
  Subsection: 17th century
  Subsection: 18th century
  Subsection: 19th century
  Subsection: 20th century
  Subsection: 21st century
Section: Ironic last words
Level: 1
Section: Independently notable last words
Level: 1
Section: See also
Level: 1
Section: Notes
Level: 1
Section: References
Level: 1
Section: Further reading
Level: 1
Section: External links
Level: 1


# 21st Century Last Words Scraping

In [91]:
page = wiki.page('List of last words (21st century)')

if page.exists():

    for section in page.sections:
        print(f"Section: {section.title}")
        print(f"Level: {section.level}")  
        print("-" * 50)
        
        for subsection in section.sections:
            print(f"  Subsection: {subsection.title}")



Section: 2001–2009
Level: 1
--------------------------------------------------
Section: 2010–2019
Level: 1
--------------------------------------------------
Section: 2020–2025
Level: 1
--------------------------------------------------
Section: Notes
Level: 1
--------------------------------------------------
Section: References
Level: 1
--------------------------------------------------
Section: External links
Level: 1
--------------------------------------------------


In [92]:
text = ""
page = wiki.page('List of last words (21st century)')
if page.exists():

    for section in page.sections[:3]:  
        text += section.text
        

In [131]:
def parse_text(text):
    entries = []

    def parse_entry(entry_text):
        lines = [l.strip() for l in entry_text.split('\n') if l.strip()]

        if len(lines) < 2:
            return None
        
        rest = ' '.join(lines[1:]).lstrip('—-–').strip()
        date = re.search(r'\(([^)]*(?:\d{4}|c\.\s*\d+|(?:\d+th|c\.\s*\d+)\s*century)(?:\s*(?:BC|AD))?[^)]*)\)', rest)
        name = rest.split(',')[0]

        if date:
            before_date = rest[:date.start()].strip()
            title = before_date.split(',', 1)[1].strip() if ',' in before_date else ""
            context = rest[date.end():].strip(', .')
        else:
            title = rest.split(',', 1)[1].strip() if ',' in rest else ""
            context = ""
        
        return {
            'name': name,
            'title': title,
            'quote': lines[0].strip('"''"'),
            'date': date.group(1) if date else "",
            'context': context
        }

    blocks = re.split(r'\n(?=[""""])', text)

    for block in blocks:
        if block.strip():
            parsed = parse_entry(block)
            if parsed:
                entries.append(parsed)

    return pd.DataFrame(entries)

df = parse_text(text)

In [94]:
df.shape

(224, 5)

In [95]:
df.to_csv('data/raw_data/last_words_21st_century.csv', index=False)

# 20th

In [96]:
page = wiki.page('List of last words (20th century)')

if page.exists():

    for section in page.sections:
        print(f"Section: {section.title}")
        print(f"Level: {section.level}")  
        print("-" * 50)
        
        for subsection in section.sections:
            print(f"  Subsection: {subsection.title}")

Section: 1901–1909
Level: 1
--------------------------------------------------
Section: 1910–1919
Level: 1
--------------------------------------------------
Section: 1920–1929
Level: 1
--------------------------------------------------
Section: 1930–1939
Level: 1
--------------------------------------------------
Section: 1940–1949
Level: 1
--------------------------------------------------
Section: 1950–1959
Level: 1
--------------------------------------------------
Section: 1960–1969
Level: 1
--------------------------------------------------
Section: 1970–1979
Level: 1
--------------------------------------------------
Section: 1980–1989
Level: 1
--------------------------------------------------
Section: 1990–2000
Level: 1
--------------------------------------------------
Section: Notes
Level: 1
--------------------------------------------------
Section: References
Level: 1
--------------------------------------------------
Section: External links
Level: 1
----------------------

In [97]:
text_20 = ""
page = wiki.page('List of last words (20th century)')
if page.exists():

    for section in page.sections[:10]:  
        text_20 += section.text
        

In [98]:
print(len(text_20))

146577


In [99]:


df = parse_text(text_20)

In [100]:
df.shape

(728, 5)

In [101]:
df.to_csv('data/raw_data/last_words_20th_century.csv', index=False)

# 19th

In [102]:
page = wiki.page('List of last words (19th century)')

if page.exists():

    for section in page.sections:
        print(f"Section: {section.title}")
        print(f"Level: {section.level}")  
        print("-" * 50)
        
        for subsection in section.sections:
            print(f"  Subsection: {subsection.title}")

Section: 1801–1809
Level: 1
--------------------------------------------------
Section: 1810–1819
Level: 1
--------------------------------------------------
Section: 1820–1829
Level: 1
--------------------------------------------------
Section: 1830–1839
Level: 1
--------------------------------------------------
Section: 1840–1849
Level: 1
--------------------------------------------------
Section: 1850–1859
Level: 1
--------------------------------------------------
Section: 1860–1869
Level: 1
--------------------------------------------------
Section: 1870–1879
Level: 1
--------------------------------------------------
Section: 1880–1889
Level: 1
--------------------------------------------------
Section: 1890–1900
Level: 1
--------------------------------------------------
Section: Notes
Level: 1
--------------------------------------------------
Section: References
Level: 1
--------------------------------------------------
Section: External links
Level: 1
----------------------

In [103]:
text_19 = ""
page = wiki.page('List of last words (19th century)')
if page.exists():

    for section in page.sections[:10]:  
        text_19 += section.text

df = parse_text(text_19)

df.to_csv('data/raw_data/last_words_19th_century.csv', index=False)


# 18th

In [104]:
page = wiki.page('List of last words (18th century)')

if page.exists():

    for section in page.sections:
        print(f"Section: {section.title}")
        print(f"Level: {section.level}")  
        print("-" * 50)
        
        for subsection in section.sections:
            print(f"  Subsection: {subsection.title}")

Section: 1701–1709
Level: 1
--------------------------------------------------
Section: 1710–1719
Level: 1
--------------------------------------------------
Section: 1720–1729
Level: 1
--------------------------------------------------
Section: 1730–1739
Level: 1
--------------------------------------------------
Section: 1740–1749
Level: 1
--------------------------------------------------
Section: 1750–1759
Level: 1
--------------------------------------------------
Section: 1760–1769
Level: 1
--------------------------------------------------
Section: 1770–1779
Level: 1
--------------------------------------------------
Section: 1780–1789
Level: 1
--------------------------------------------------
Section: 1790–1800
Level: 1
--------------------------------------------------
Section: Notes
Level: 1
--------------------------------------------------
Section: References
Level: 1
--------------------------------------------------
Section: External links
Level: 1
----------------------

In [105]:
text_18 = ""
page = wiki.page('List of last words (18th century)')
if page.exists():

    for section in page.sections[:10]:  
        text_18 += section.text

df = parse_text(text_18)

df.to_csv('data/raw_data/last_words_18th_century.csv', index=False)


# Other

In [132]:

page = wiki.page('List of last words')

if page.exists():
    chronological_section = page.sections[0]
    text_pre5_to_17 = ""
    for subsection in chronological_section.sections[:5]:
        text_pre5_to_17 += subsection.text
    
    text_ironic = page.sections[1].text
    text_notable = page.sections[2].text

In [133]:
df_pre5_to_17 = parse_text(text_pre5_to_17)
df_ironic = parse_text(text_ironic)
df_notable = parse_text(text_notable)

In [136]:
df_notable.tail()

Unnamed: 0,name,title,quote,date,context
2,Jesus,founder of Christianity,"It is finished."" (τετέλεσται.)",c. 33 AD,right before his death by crucifixion
3,Joseph Trumpeldor,Jewish Zionist activist,"Never mind, it is good to die for our country....",1 March 1920,after being mortally wounded at the Battle of ...
4,Stjepan Filipović,Yugoslav communist,"Death to fascism! Freedom to the people!"" (Smr...",22 May 1942,seconds before execution by hanging
5,Todd Beamer,American passenger on United Airlines Flight 93,Are you guys ready? Let's roll.,11 September 2001,signaling the start of the revolt against the ...
6,Eric Garner,American former horticulturist,I can't breathe.,17 July 2014,after being put in a chokehold by an arresting...


In [137]:
df_pre5_to_17.to_csv('data/raw_data/last_words_pre5_to_17_century.csv', index=False)
df_ironic.to_csv('data/raw_data/last_words_ironic.csv', index=False)
df_notable.to_csv('data/raw_data/last_words_notable.csv', index=False)