In [2]:
import nltk
from nltk.probability import *
from nltk.corpus import PlaintextCorpusReader
from nltk import word_tokenize, sent_tokenize
from nltk.util import * 
from nltk.text import *
from nltk import pos_tag, pos_tag_sents
from nltk import RegexpParser


## Downloading files from the Web (2.1, NLTK book)

In [3]:
from urllib.request import urlopen
# Plain-text e-book hosted on gutenberg.org that the rest of the notebook works on.
url = "https://www.gutenberg.org/files/1661/1661-0.txt"
# Treat the response as a context manager so the connection is released
# deterministically once the body has been read.
with urlopen(url) as response:
    raw_1 = response.read()
# The payload has not been decoded yet; show its type.
type(raw_1)

bytes

In [4]:
# Decode the bytes that were already downloaded into raw_1 instead of fetching the
# same URL a second time. "utf-8-sig" drops a leading UTF-8 byte-order mark (BOM).
raw = raw_1.decode("utf-8-sig")
type(raw)

str

In [5]:
# Show the total number of characters, then the first 90 characters, one per line.
print(len(raw), raw[:90], sep="\n")

593730
The Project Gutenberg eBook of The Adventures of Sherlock Holmes,
by Arthur Conan Doyle



## Tokenization

In [6]:
# Tokenize the whole decoded text with NLTK's word tokenizer.
tokens = word_tokenize(raw)

In [7]:
# Check which container type the tokenizer returned.
type(tokens)

list

In [8]:
# Peek at the first nine tokens.
tokens[:9]

['The',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'The',
 'Adventures',
 'of',
 'Sherlock']

In [9]:
# Example sentence that the tokenization, lemmatization and PoS-tagging cells work on.
# NOTE(review): "couisin" looks like a misspelling of "cousin"; it is runtime text, so it is left as is.
mysent="Mr. Brown went to visit his couisin, but he only found her husband, their children, and their dog."

In [10]:
# Tokenize the example sentence, then print the tokens followed by their count.
mysent_tokens = word_tokenize(mysent)
print(f"{mysent_tokens} \nThis sentence contains: {len(mysent_tokens)}  tokens")

['Mr.', 'Brown', 'went', 'to', 'visit', 'his', 'couisin', ',', 'but', 'he', 'only', 'found', 'her', 'husband', ',', 'their', 'children', ',', 'and', 'their', 'dog', '.'] 
This sentence contains: 22  tokens


## How many words?

## Lemmatization
## Lemmatisers present all words as their lemma, that is, their dictionary headword form. E.g. am, are, is, be, being --> be; car, cars --> car; woman, women --> woman; child, children --> child.
## One lemmatiser in NLTK

In [11]:
# Create the WordNet lemmatizer and run every token of the example sentence through it,
# calling lemmatize() with the token as its only argument.
wnl = nltk.WordNetLemmatizer()
list(map(wnl.lemmatize, mysent_tokens))

['Mr.',
 'Brown',
 'went',
 'to',
 'visit',
 'his',
 'couisin',
 ',',
 'but',
 'he',
 'only',
 'found',
 'her',
 'husband',
 ',',
 'their',
 'child',
 ',',
 'and',
 'their',
 'dog',
 '.']

## PoS-tagging: pos_tag(sentence) uses the default tagset (Penn Treebank); other tagsets can be selected with pos_tag(sentence, tagset="name_tagset"), e.g. universal, brown

In [12]:
# Tag the tokenized example sentence; the result is iterated as (word, tag) pairs.
mysent_tagged = pos_tag(mysent_tokens)
# One "word tag" line per token, separated by a single space.
for word, tag in mysent_tagged:
    print(f"{word} {tag}")

Mr. NNP
Brown NNP
went VBD
to TO
visit VB
his PRP$
couisin NN
, ,
but CC
he PRP
only RB
found VBD
her PRP
husband NN
, ,
their PRP$
children NNS
, ,
and CC
their PRP$
dog NN
. .


## Exploring the default tagset 

In [13]:
# Look up the description and examples for one specific tag (here: CC).
nltk.help.upenn_tagset('CC')

CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet


In [14]:
# Called without an argument, the helper prints the description of all tags.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [15]:
# A longer example made of several sentences; it is tokenized and PoS-tagged as one text.
my_text = (
    "Mr. Brown went to visit his mother, and father, but he only found their dog. "
    "His mother and father were gone to the bookshop to buy a book. "
    "It costed 2.50 $. It was expensive."
)
mytext_tokens = word_tokenize(my_text)
mytext_tagged = pos_tag(mytext_tokens)
# First line: the token list; second line: the (token, tag) pairs.
print(mytext_tokens, mytext_tagged, sep="\n")

['Mr.', 'Brown', 'went', 'to', 'visit', 'his', 'mother', ',', 'and', 'father', ',', 'but', 'he', 'only', 'found', 'their', 'dog', '.', 'His', 'mother', 'and', 'father', 'were', 'gone', 'to', 'the', 'bookshop', 'to', 'buy', 'a', 'book', '.', 'It', 'costed', '2.50', '$', '.', 'It', 'was', 'expensive', '.']
[('Mr.', 'NNP'), ('Brown', 'NNP'), ('went', 'VBD'), ('to', 'TO'), ('visit', 'VB'), ('his', 'PRP$'), ('mother', 'NN'), (',', ','), ('and', 'CC'), ('father', 'RB'), (',', ','), ('but', 'CC'), ('he', 'PRP'), ('only', 'RB'), ('found', 'VBD'), ('their', 'PRP$'), ('dog', 'NN'), ('.', '.'), ('His', 'PRP$'), ('mother', 'NN'), ('and', 'CC'), ('father', 'NN'), ('were', 'VBD'), ('gone', 'VBN'), ('to', 'TO'), ('the', 'DT'), ('bookshop', 'NN'), ('to', 'TO'), ('buy', 'VB'), ('a', 'DT'), ('book', 'NN'), ('.', '.'), ('It', 'PRP'), ('costed', 'VBD'), ('2.50', 'CD'), ('$', '$'), ('.', '.'), ('It', 'PRP'), ('was', 'VBD'), ('expensive', 'JJ'), ('.', '.')]


## Exercise 1: Identify sentences in the following text:
TIP: use the sentence tokenizer
"Driven by intellectual creativity and critical thinking since 1479, researchers and students at the University of Copenhagen have expanded horizons and contributed to moving the world forward. With its 5,000 researchers and 37,500 students, the University boasts an international research and study environment and is highly ranked on the leading ranking lists of the world's best universities.

The University offers researchers and students the opportunity to develop their talent and launches ambitious interdisciplinary initiatives to support its strong academic communities. Through research-based teaching – and by involving them in research – students are equipped to address society's challenges and needs."

## Exercise 2: 
## a. Print each word and its corresponding PoS-tag on a line. Word and PoS-tag must be separated by a tab character.
## b. Write a function that does the same as in part a, but writes the words and PoS-tags to a file: "outtags.txt".

## Exercise 3: 
## Extract the tags from the tagged sentences.

## Exercise 4: 
## Read the file  "PicturesfromItaly_Rome.txt" (download from Absalon) and process it (sentence identification, tokenization and PoS-tagging).

## Exercise 5: 
## Print the tags in the file with their frequency in descending order.

## From NLTK: Various types of PoS-taggers 

In [16]:
# Short two-sentence sample that the tagger examples below tokenize and tag.
txt1='I have not read any newspaper this morning. Is there any big news?'

## Tagger 1: all words are tagged NN. Why?

In [17]:
# Note: this rebinds `tokens` to the tokens of the short sample text.
tokens = word_tokenize(txt1)
# Baseline tagger constructed with the single tag 'NN'; see the output for its effect.
default_tagger = nltk.DefaultTagger("NN")
default_tagger.tag(tokens)

[('I', 'NN'),
 ('have', 'NN'),
 ('not', 'NN'),
 ('read', 'NN'),
 ('any', 'NN'),
 ('newspaper', 'NN'),
 ('this', 'NN'),
 ('morning', 'NN'),
 ('.', 'NN'),
 ('Is', 'NN'),
 ('there', 'NN'),
 ('any', 'NN'),
 ('big', 'NN'),
 ('news', 'NN'),
 ('?', 'NN')]

## tagger 2:  using regular expressions

In [18]:
# Ordered (regular expression, tag) rules for the regexp tagger.
# The final '.*' rule matches anything and therefore acts as the default.
patterns = [
    # gerunds
    (r".*ing$", "VBG"),
    # simple past
    (r".*ed$", "VBD"),
    # 3rd singular present
    (r".*es$", "VBZ"),
    # modals
    (r".*ould$", "MD"),
    # possessive nouns
    (r".*\'s$", "NN$"),
    # plural nouns
    (r".*s$", "NNS"),
    # cardinal numbers
    (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),
    # nouns (default)
    (r".*", "NN"),
]
# Build a tagger from the (regex, tag) rules and apply it to the current token list.
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(tokens)

[('I', 'NN'),
 ('have', 'NN'),
 ('not', 'NN'),
 ('read', 'NN'),
 ('any', 'NN'),
 ('newspaper', 'NN'),
 ('this', 'NNS'),
 ('morning', 'VBG'),
 ('.', 'NN'),
 ('Is', 'NNS'),
 ('there', 'NN'),
 ('any', 'NN'),
 ('big', 'NN'),
 ('news', 'NNS'),
 ('?', 'NN')]

### Exercise 6:
Now we want to add a pattern for personal pronouns (PRP).
Add the new regular expression(s) to the patterns list and run the resulting regexp_tagger on tokens.