In [1]:
import numpy as np
import nltk

In [2]:
# Fetches the complete NLTK data collection ('all'), i.e. every corpus and model package.
# The cells visible in this notebook only use the brown, reuters and wordnet corpora.
# TODO(review): consider downloading just those packages (plus the multilingual WordNet
# data needed for lang='tha') to keep a fresh run fast - confirm package names for the
# installed NLTK version.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/pongsakorn/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/pongsakorn/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/pongsakorn/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/pongsakorn/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /home/pongsakorn/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /home/pongsakorn/nltk_data...
[nltk_data]    |   Package cess_cat is already up-

True

### Brown Corpus

In [3]:
# load the Brown Corpus
from nltk.corpus import brown

In [6]:
print('Total Categories:', len(brown.categories()))

Total Categories: 15


In [7]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [17]:
# tokenized sentences
brown.sents(categories='mystery')

[['There', 'were', 'thirty-eight', 'patients', 'on', 'the', 'bus', 'the', 'morning', 'I', 'left', 'for', 'Hanover', ',', 'most', 'of', 'them', 'disturbed', 'and', 'hallucinating', '.'], ['An', 'interne', ',', 'a', 'nurse', 'and', 'two', 'attendants', 'were', 'in', 'charge', 'of', 'us', '.'], ...]

In [18]:
len(brown.fileids())

500

In [19]:
# POS tagged sentences
brown.tagged_sents(categories='mystery')

[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]

In [21]:
# Rebuild plain text for the first five 'mystery' sentences: each sentence is a
# list of tokens, so joining with a single space yields one string per sentence.
list(map(' '.join, brown.sents(categories='mystery')[:5]))

['There were thirty-eight patients on the bus the morning I left for Hanover , most of them disturbed and hallucinating .',
 'An interne , a nurse and two attendants were in charge of us .',
 "I felt lonely and depressed as I stared out the bus window at Chicago's grim , dirty West Side .",
 'It seemed incredible , as I listened to the monotonous drone of voices and smelled the fetid odors coming from the patients , that technically I was a ward of the state of Illinois , going to a hospital for the mentally ill .',
 'I suddenly thought of Mary Jane Brennan , the way her pretty eyes could flash with anger , her quiet competence , the gentleness and sweetness that lay just beneath the surface of her defenses .']

In [26]:
# Collect (word, tag) pairs from the 'mystery' category whose tag is exactly one of
# the three noun tags below, then preview the first ten.
noun_tags = {'NN', 'NP', 'NNS'}
nouns = [
    (word, pos)
    for word, pos in brown.tagged_words(categories='mystery')
    if pos in noun_tags
]
nouns[:10]

[('patients', 'NNS'),
 ('bus', 'NN'),
 ('morning', 'NN'),
 ('Hanover', 'NP'),
 ('interne', 'NN'),
 ('nurse', 'NN'),
 ('attendants', 'NNS'),
 ('charge', 'NN'),
 ('bus', 'NN'),
 ('window', 'NN')]

In [27]:
# Count how often each noun occurs; only the word of each (word, tag) pair is counted.
noun_freq = nltk.FreqDist(word for word, _tag in nouns)

In [29]:
noun_freq.most_common(10)

[('man', 106),
 ('time', 82),
 ('door', 80),
 ('car', 69),
 ('room', 65),
 ('Mr.', 63),
 ('way', 61),
 ('office', 50),
 ('eyes', 48),
 ('hand', 46)]

### Reuters Corpus

In [31]:
from nltk.corpus import reuters

In [32]:
print('Total Categories:', len(reuters.categories()))

Total Categories: 90


In [34]:
reuters.categories()[:10]

['acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee']

In [37]:
[' '.join(sentence) for sentence in reuters.sents(categories=['housing', 'income'])[:10]]

["YUGOSLAV ECONOMY WORSENED IN 1986 , BANK DATA SHOWS National Bank economic data for 1986 shows that Yugoslavia ' s trade deficit grew , the inflation rate rose , wages were sharply higher , the money supply expanded and the value of the dinar fell .",
 'The trade deficit for 1986 was 2 . 012 billion dlrs , 25 . 7 pct higher than in 1985 .',
 'The trend continued in the first three months of this year as exports dropped by 17 . 8 pct , in hard currency terms , to 2 . 124 billion dlrs .',
 'Yugoslavia this year started quoting trade figures in dinars based on current exchange rates , instead of dollars based on a fixed exchange rate of 264 . 53 dinars per dollar .',
 "Yugoslavia ' s balance of payments surplus with the convertible currency area fell to 245 mln dlrs in 1986 from 344 mln in 1985 .",
 'The National Bank said the drop was due to a deterioration in trade .',
 'Exports to the convertible currency area rose 11 . 6 pct from 1985 , while imports rose 17 . 8 pct .',
 "Retail pri

In [39]:
reuters.fileids(categories=['housing', 'income'])[:10]

['test/16118',
 'test/18534',
 'test/18540',
 'test/18664',
 'test/18665',
 'test/18672',
 'test/18911',
 'test/19875',
 'test/20106',
 'test/20116']

In [40]:
reuters.sents(fileids=['test/16118', 'test/18534'])

[['YUGOSLAV', 'ECONOMY', 'WORSENED', 'IN', '1986', ',', 'BANK', 'DATA', 'SHOWS', 'National', 'Bank', 'economic', 'data', 'for', '1986', 'shows', 'that', 'Yugoslavia', "'", 's', 'trade', 'deficit', 'grew', ',', 'the', 'inflation', 'rate', 'rose', ',', 'wages', 'were', 'sharply', 'higher', ',', 'the', 'money', 'supply', 'expanded', 'and', 'the', 'value', 'of', 'the', 'dinar', 'fell', '.'], ['The', 'trade', 'deficit', 'for', '1986', 'was', '2', '.', '012', 'billion', 'dlrs', ',', '25', '.', '7', 'pct', 'higher', 'than', 'in', '1985', '.'], ...]

### WordNet Corpus

In [41]:
from nltk.corpus import wordnet as wn

In [43]:
word_synsets = wn.synsets('hike')

In [44]:
word_synsets

[Synset('hike.n.01'),
 Synset('rise.n.09'),
 Synset('raise.n.01'),
 Synset('hike.v.01'),
 Synset('hike.v.02')]

In [45]:
# Print name, part of speech, definition and example sentences for every synset,
# with a blank line between entries.
for synset in word_synsets:
    for label, getter in (
        ('Synset Name:', synset.name),
        ('POS Tag:', synset.pos),
        ('Definition:', synset.definition),
        ('Examples:', synset.examples),
    ):
        # Each accessor is called right before its line is printed.
        print(label, getter())
    print()

Synset Name: hike.n.01
POS Tag: n
Definition: a long walk usually for exercise or pleasure
Examples: ['she enjoys a hike in her spare time']

Synset Name: rise.n.09
POS Tag: n
Definition: an increase in cost
Examples: ['they asked for a 10% rise in rates']

Synset Name: raise.n.01
POS Tag: n
Definition: the amount a salary is increased
Examples: ['he got a 3% raise', 'he got a wage hike']

Synset Name: hike.v.01
POS Tag: v
Definition: increase
Examples: ['The landlord hiked up the rents']

Synset Name: hike.v.02
POS Tag: v
Definition: walk a long way, as for pleasure or physical exercise
Examples: ['We were hiking in Colorado', 'hike the Rockies']



In [64]:
w = 'หมู'

In [65]:
wn.synsets(w, lang='tha')

[Synset('swine.n.01'),
 Synset('hog.n.03'),
 Synset('pork.n.01'),
 Synset('lamb.n.03')]

In [55]:
wn.langs()

['eng',
 'als',
 'arb',
 'bul',
 'cat',
 'cmn',
 'dan',
 'ell',
 'eus',
 'fas',
 'fin',
 'fra',
 'glg',
 'heb',
 'hrv',
 'ind',
 'ita',
 'jpn',
 'nld',
 'nno',
 'nob',
 'pol',
 'por',
 'qcn',
 'slv',
 'spa',
 'swe',
 'tha',
 'zsm']

### Working with Text

In [66]:
b'abc'

b'abc'

In [75]:
s = '''hello I'm 
a multi-line'''
s

"hello I'm \na multi-line"

In [73]:
# Deliberate pitfall: in a normal (non-raw) string literal the sequences \t, \n and \f
# inside this Windows path are interpreted as tab, newline and form feed escapes,
# so the printed text is not the path that was typed.
s = "C:\the_folder\new_dir\file.txt"
print(s)

C:	he_folder
ew_dirile.txt


In [72]:
# Raw string literal (r prefix): backslashes are kept verbatim, so the path prints
# exactly as written.
s = r"C:\the_folder\new_dir\file.txt"
print(s)

C:\the_folder\new_dir\file.txt


In [76]:
# Adjacent string literals separated only by whitespace are concatenated into a
# single string; no + operator is needed.
'hello' 'wirkd'

'hellowirkd'

In [77]:
import re

In [78]:
re.IGNORECASE

2

In [79]:
re.DOTALL

16

In [86]:
re.compile('a', flags=re.IGNORECASE)

re.compile(r'a', re.IGNORECASE|re.UNICODE)

In [92]:
s = 'Hèllo'
s2 = 'สวัสดี'

In [97]:
# \w matches letters, digits and underscore but not combining marks. The Thai
# vowel signs in s2 (U+0E31 and U+0E35) are combining marks, so they act as
# separators and the single word is split into fragments.
re.findall(r'\w+', s2)

['สว', 'สด']

In [98]:
# Sample data for the regex demos: a lowercase pattern and two sentences, one that
# starts with the word and one that contains it twice mid-sentence.
# NOTE: this rebinds s2, which previously held the Thai sample string; re-running the
# earlier \w+ cell after this one will operate on the English sentence instead.
pat = 'python'
s1 = 'Python is an excellent language'
s2 = 'I love the Python language. I also use Python to build applications at work'

In [101]:
re.match(pat, s1, flags=re.IGNORECASE)

<_sre.SRE_Match object; span=(0, 6), match='Python'>

In [102]:
m = re.match(pat, s1, flags=re.IGNORECASE)

In [109]:
# Report the matched text and its start/end offsets, followed by the searched string.
template = "Found match `{}` ranging from index {} - {} in the string\n`{}`"
print(template.format(m.group(), m.start(), m.end(), s1))

Found match `Python` ranging from index 0 - 6 in the string
`Python is an excellent language`


In [112]:
# re.match only tries to match at the beginning of the string. s2 starts with
# 'I love', so the result is None and the cell shows no output.
re.match(pat, s2, flags=re.IGNORECASE)

In [113]:
# re.search scans the whole string and returns the first match wherever it occurs,
# so it finds the 'Python' that re.match missed.
re.search(pat, s2, flags=re.IGNORECASE)

<_sre.SRE_Match object; span=(11, 17), match='Python'>

In [114]:
re.findall(pat, s2, flags=re.IGNORECASE)

['Python', 'Python']

In [116]:
re.sub(pat, 'Java', s1, flags=re.IGNORECASE)

'Java is an excellent language'

In [118]:
re.subn(pat, 'Java', s2, flags=re.IGNORECASE)

('I love the Java language. I also use Java to build applications at work', 2)

In [None]:
# re.subn requires pattern, replacement and string; calling it with no arguments
# raises TypeError and breaks a top-to-bottom run of the notebook.
# Completed here as a demo of `count`: only the first occurrence is replaced, and the
# returned tuple is (new_string, number_of_substitutions_made).
re.subn(pat, 'Java', s2, count=1, flags=re.IGNORECASE)