# Tokenizing

In [2]:
# Tokenization is the process of breaking up a string or a document of text into a list of smaller units (tokens), such as words or sentences

In [2]:
import nltk

In [43]:
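# One-time setup: uncomment and run once to download the NLTK data (tokenizers, corpora, taggers) that the cells below rely on
# nltk.download('all')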

## Tokenizing by words

In [8]:
# Sample sentence used to demonstrate word-level tokenization
my_string = "I am learning Natural Language Processing."

In [9]:
# Split the sentence into word and punctuation tokens
tokens = nltk.word_tokenize(my_string)

In [10]:
# Display the tokens (the final full stop is a token of its own)
tokens

['I', 'am', 'learning', 'Natural', 'Language', 'Processing', '.']

In [11]:
# Number of tokens, counting the punctuation mark
len(tokens)

7

## Tokenizing by sentence

In [12]:
# Two-sentence sample used to demonstrate sentence-level tokenization
phrase = "I am learning Natural Language Processing. I am leaning how to tokenize!"

In [13]:
# Split the text into a list of sentences
tokens_sent = nltk.sent_tokenize(phrase)

In [14]:
# Display the sentences found in the text
tokens_sent

['I am learning Natural Language Processing.', 'I am leaning how to tokenize!']

In [15]:
# Number of sentences found
len(tokens_sent)

2

### We can also tokenize each sentence into words

In [18]:
# Word-tokenize every sentence and print one token list per line
for sentence_tokens in map(nltk.word_tokenize, tokens_sent):
    print(sentence_tokens)

['I', 'am', 'learning', 'Natural', 'Language', 'Processing', '.']
['I', 'am', 'leaning', 'how', 'to', 'tokenize', '!']


# Normalising

Normalising means cleaning our text data to make it more uniform (for example, dropping punctuation and numbers and lower-casing the words).

In [3]:
# Load Moby Dick from NLTK's Gutenberg corpus as a sequence of word tokens
md = nltk.corpus.gutenberg.words("melville-moby_dick.txt")

In [5]:
# Peek at the first 22 tokens
md[:22]

['[',
 'Moby',
 'Dick',
 'by',
 'Herman',
 'Melville',
 '1851',
 ']',
 'ETYMOLOGY',
 '.',
 '(',
 'Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar',
 'School',
 ')']

In [6]:
# Keep the first 22 tokens as a small sample to experiment with
md_22 = md[:22]

In [7]:
# Display the sample
md_22 

['[',
 'Moby',
 'Dick',
 'by',
 'Herman',
 'Melville',
 '1851',
 ']',
 'ETYMOLOGY',
 '.',
 '(',
 'Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar',
 'School',
 ')']

Now we want to retrieve only the words in this list, so we'll use the string method `isalpha()`, which returns `True` if every character in the string is a letter and `False` otherwise (dates, numbers, punctuation marks, etc.).

In [8]:
# Print only the purely alphabetic tokens, skipping everything else
for token in md_22:
    if not token.isalpha():
        continue
    print(token)

Moby
Dick
by
Herman
Melville
ETYMOLOGY
Supplied
by
a
Late
Consumptive
Usher
to
a
Grammar
School


To save the words in lower case:

In [9]:
# Keep the alphabetic tokens only and store them in lower case
norm = [
    token.lower()
    for token in md_22
    if token.isalpha()
]

In [10]:
# Display the normalised sample
norm

['moby',
 'dick',
 'by',
 'herman',
 'melville',
 'etymology',
 'supplied',
 'by',
 'a',
 'late',
 'consumptive',
 'usher',
 'to',
 'a',
 'grammar',
 'school']

# Stemmers

Used to remove affixes, for instance when dealing with plurals such as cat -> cats or city -> cities.
We might or might not be interested in these plurals, so we use stemmers to reduce related word forms to a common stem. This isn't an exact science, though, so we must be aware that stemming can introduce errors.

### Porter stemmer

The Porter stemmer is created with `porter = nltk.PorterStemmer()`.

In [12]:
# Pairs of related word forms used to compare the stemmers and the lemmatizer
my_list = [
    "cat", "cats",
    "lie", "lying",
    "run", "running",
    "city", "cities",
    "month", "monthly",
    "woman", "women",
]

In [13]:
# Create the stemmer in this cell: `porter` is not assigned in any earlier code
# cell, so the loop would otherwise raise a NameError on a fresh kernel.
porter = nltk.PorterStemmer()

# Print the Porter stem of every word in the sample list
for word in my_list:
    print(porter.stem(word))

cat
cat
lie
lie
run
run
citi
citi
month
monthli
woman
women


### Lancaster stemmer 

In [14]:
# Create a Lancaster stemmer to compare against the Porter stemmer above
lancaster = nltk.LancasterStemmer()

In [15]:
# Print the Lancaster stem of every word in the sample list
for stem in map(lancaster.stem, my_list):
    print(stem)

cat
cat
lie
lying
run
run
city
city
mon
month
wom
wom


## Lemmatization with WordNet

In [19]:
# Create a WordNet-based lemmatizer
wnlem = nltk.WordNetLemmatizer()

In [22]:
# Print the lemma of every word in the sample list.
# lemmatize() is called without a part of speech here; the output below leaves
# verb forms such as "lying" and "running" unchanged -- presumably because the
# default part of speech is noun (TODO confirm against the NLTK documentation).
for lemma in (wnlem.lemmatize(entry) for entry in my_list):
    print(lemma)

cat
cat
lie
lying
run
running
city
city
month
monthly
woman
woman


# Part of speech

In [23]:
# Sample sentence used to demonstrate part-of-speech tagging
text = "I walked to the cafe to buy coffee before work"

In [24]:
# Tokenize the sentence; note that this overwrites the `tokens` variable from the tokenizing section
tokens = nltk.word_tokenize(text)

In [27]:
# Tag each token with its part of speech; the result is a list of (token, tag) pairs
nltk.pos_tag(tokens)

[('I', 'PRP'),
 ('walked', 'VBD'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('cafe', 'NN'),
 ('to', 'TO'),
 ('buy', 'VB'),
 ('coffee', 'NN'),
 ('before', 'IN'),
 ('work', 'NN')]

In [30]:
# To get the meaning of the abbreviations
# NOTE: called without arguments this prints the description of every tag in the
# tagset, so the output is long. A tag or regular expression can reportedly be
# passed to narrow the listing -- TODO confirm against the nltk.help documentation.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

Comparing two uses of the same word ("desert" as a noun and as a verb) with the tagger

In [32]:
# Here "desert" follows the verb "have", and the tagger labels it as a noun (NN)
nltk.pos_tag(nltk.word_tokenize("I will have desert."))

[('I', 'PRP'), ('will', 'MD'), ('have', 'VB'), ('desert', 'NN'), ('.', '.')]

In [33]:
# Here "desert" follows the modal "will", and the tagger labels it as a verb (VB)
nltk.pos_tag(nltk.word_tokenize("They will desert us."))

[('They', 'PRP'), ('will', 'MD'), ('desert', 'VB'), ('us', 'PRP'), ('.', '.')]

### The most common nouns in Moby Dick

In [34]:
# Load the Moby Dick tokens again (same call as in the Normalising section); this re-assigns `md`
md = nltk.corpus.gutenberg.words("melville-moby_dick.txt")

In [35]:
# Normalise the text: keep alphabetic tokens only and lower-case them
md_norm = [
    token.lower()
    for token in md
    if token.isalpha()
]

In [36]:
# Tag every normalised token using the coarse 'universal' tagset (NOUN, ADP, ...)
md_tags = nltk.pos_tag(md_norm, tagset='universal')

In [37]:
# Peek at the first five (token, tag) pairs
md_tags[:5]

[('moby', 'NOUN'),
 ('dick', 'NOUN'),
 ('by', 'ADP'),
 ('herman', 'NOUN'),
 ('melville', 'NOUN')]

In [38]:
# Keep the token of every (token, tag) pair whose tag is NOUN
md_nouns = [token for token, tag in md_tags if tag == 'NOUN']

In [39]:
# Peek at the first ten nouns, in order of appearance
md_nouns[:10]

['moby',
 'dick',
 'herman',
 'melville',
 'etymology',
 'consumptive',
 'usher',
 'grammar',
 'school',
 'pale']

In [40]:
# Use a Frequency Distribution to count how often each noun occurs in Moby Dick
nouns_fd = nltk.FreqDist(md_nouns)

In [42]:
# The 10 most frequent nouns as (noun, count) pairs, most frequent first
nouns_fd.most_common(10)

[('i', 1182),
 ('whale', 909),
 ('s', 774),
 ('man', 527),
 ('ship', 498),
 ('sea', 435),
 ('head', 337),
 ('time', 334),
 ('boat', 332),
 ('ahab', 278)]

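These most frequent nouns give us an indication of the most representative words in the book, which will eventually be of interest to us. Note that entries such as 'i' and 's' are not really nouns: they are likely artifacts of lower-casing the text and dropping punctuation before tagging, so the list should be read with that in mind.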