In [13]:
import nltk
import re
import pprint

## Accessing Text from the Web and from Disk

In [14]:
from urllib.request import urlopen

url = "https://www.gutenberg.org/cache/epub/32705/pg32705.txt"

raw = urlopen(url).read().decode('utf-8')
type(raw)

str

In [15]:
len(raw)

110015

In [16]:
raw[:75]

'\ufeffThe Project Gutenberg eBook of Deadly City\r\n    \r\nThis ebook is for the us'

In [17]:
tokens = nltk.word_tokenize(raw)
type(tokens)

list

In [18]:
len(tokens)

23472

In [19]:
tokens[:10]

['\ufeffThe',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'Deadly',
 'City',
 'This',
 'ebook',
 'is']

In [20]:
text = nltk.Text(tokens)
type(text)

nltk.text.Text

In [21]:
text[1020:1060]

['where',
 'he',
 'could',
 'look',
 'up',
 'and',
 'down',
 'a',
 'new',
 'street',
 '.',
 'There',
 'were',
 'no',
 'cars',
 '--',
 'no',
 'people',
 '.',
 'Not',
 'even',
 'a',
 'cat',
 '.',
 'A',
 'sign',
 'overhanging',
 'the',
 'sidewalk',
 'said',
 ':',
 'Restaurant',
 '.',
 'He',
 'went',
 'in',
 'under',
 'the',
 'sign',
 'and']

In [22]:
text.collocations()

Project Gutenberg™; Jim Wilson; Project Gutenberg; United States;
Literary Archive; Frank Brooks; Gutenberg™ electronic; electronic
works; Gutenberg Literary; Leroy Davis; Archive Foundation; Frank
said; young man; set forth; 've got; electronic work; n't know;
Madison Street; Gutenberg™ License; Wilson said


In [23]:
raw.find("DEADLY CITY")

873

In [24]:
raw.rfind("*** END OF THE PROJECT GUTENBERG EBOOK DEADLY CITY ***")

91186

In [25]:
raw = raw[873:91186]
raw.find("DEADLY CITY")

0

## Dealing with HTML

In [26]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
html[:60]

b'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [27]:
from bs4 import BeautifulSoup

raw = BeautifulSoup(html).get_text() # takes an HTML string and returns raw text
tokens = nltk.word_tokenize(raw)
tokens[:10]

['BBC', 'NEWS', '|', 'Health', '|', 'Blondes', "'to", 'die', 'out', 'in']

In [28]:
tokens = tokens[96:399]
text = nltk.Text(tokens)
text.concordance('gene')

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


### Processing RSS Feed

In [29]:
import feedparser

llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']

'Language Log'

In [30]:
len(llog.entries)

13

In [31]:
post = llog.entries[2]
post.title

'Pronunciation guides fail spectacularly'

In [32]:
content = post.content[0].value
content[:70]

'<p>ICYMI:</p>\n<p></p>\n<p><span id="more-64715"></span></p>\n<p>Jonathan'

In [35]:
raw = BeautifulSoup(content).get_text()

nltk.word_tokenize(raw)[:10]

['ICYMI',
 ':',
 'Jonathan',
 'Edwards',
 ',',
 '``',
 'Mispronunciations',
 'spoil',
 'graduation',
 'at']

# Exercises

In [5]:
import re

### 1. Define a string s = 'colorless'. Write a Python statement that changes this to “colourless” using only the slice and concatenation operations.

In [7]:
s = "colorless"
s = s[:4] + 'u' + s[-5:]
s

'colourless'

### 2. We can use the slice notation to remove morphological endings on words. For example, 'dogs'[:-1] removes the last character of dogs, leaving dog. Use slice notation to remove the affixes from these words (we’ve inserted a hyphen to indi- cate the affix boundary, but omit this from your strings): dish-es, run-ning, nation- ality, un-do, pre-heat

In [30]:
words = ["dishes", "running", "nationality", "undo", "preheat"]

[w[:m.start(0)] for w in words for m in re.finditer(r"(es|ning|ality|do|heat)$", w)]

['dish', 'run', 'nation', 'un', 'pre']

### 7. Write regular expressions to match the following classes of strings:
#### a. A single determiner (assume that a, an, and the are the only determiners)
#### b. An arithmetic expression using integers, addition, and multiplication, such as 2*3+8

In [60]:
# single determiner regex 
re.match('^an?$|^the$', 'a')

<re.Match object; span=(0, 1), match='a'>

In [63]:
# arithmetic expression regex
re.match(r'\d+([\*|\+|/|-]\d+)+', '2*3+8*2')

<re.Match object; span=(0, 7), match='2*3+8*2'>

### 8. Write a utility function that takes a URL as it's argument, and returns the contents of the URL, with all HTML markup removed. Use urllib.urlopen to access the contents of the URL, e.g.:

In [95]:
from urllib.request import urlopen

def strip_html(url):
    """
    Takes a url and returns the contents with html tags removed
    """
    raw_contents = urlopen(url).read().decode('utf-8')
    
    return re.sub(r'<[\s\S]*?>', '', raw_contents)

strip_html('https://docs.python.org/3/tutorial/index.html')

'\n\n\n  \n    \n    \n\n\n\n\n\n\n\n\n\n\n\n\n    The Python Tutorial &#8212; Python 3.12.4 documentation\n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n    \n      \n    \n    \n    \n      \n    \n\n    \n    \n      @media only screen {\n        table.full-width-table {\n            width: 100%;\n        }\n      }\n    \n\n    \n            \n            \n            \n             \n            \n            \n\n  \n\n\n    \n    \n        \n            \n        \n        \n            \n                \n            \n            \n            \n                \n                    \n                \n                \n                \n            \n        \n    \n    \n        \n            \n            \n\n    Theme\n    \n        Auto\n        Light\n        Dark\n    \n\n  \n    Previous topic\n    Changelog\n  \n  \n    Next topic\n    1. Whetting Your Appetite\n  \n  \n    This Page\n    \n      Report a B

### 9. Save some text into a file corpus.txt. Define a function load(f) that reads from the file named in its sole argument, and returns a string containing the text of the file.
#### a. Use nltk.regexp_tokenize() to create a tokenizer that tokenizes the various kinds of punctuation in this text. Use one multiline regular expression inline comments, using the verbose flag (?x).
#### b. Use nltk.regexp_tokenize() to create a tokenizer that tokenizes the following kinds of expressions: monetary amounts; dates; names of people and organizations.

In [97]:
from nltk import regexp_tokenize

def load(f):
    with open(f) as file:
        content = file.read()
    return content

pattern = r"""(?x)

"""

'The Python Tutorial\nPython is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python’s elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms.\n\nThe Python interpreter and the extensive standard library are freely available in source or binary form for all major platforms from the Python web site, https://www.python.org/, and may be freely distributed. The same site also contains distributions of and pointers to many free third party Python modules, programs and tools, and additional documentation.\n\nThe Python interpreter is easily extended with new functions and data types implemented in C or C++ (or other languages callable from C). Python is also suitable as an extension language for customizable applications.\n\nThis tutorial introduces the rea