# <font color='red'>Bird Book</font>
## <font color='blue'>Chapter 1 : Language Processing and Python</font>

-----

### <font color='green'>1. Text and Words</font>
#### 1.1 Searching Text

In [None]:
from nltk.book import *

In [None]:
# contexts in which the word occurs
text5.concordance("Lol")  

In [None]:
# other words in the similar contexts
text5.similar("Lol")  

In [None]:
# common contexts
text5.common_contexts(["Lol","hi"])  

In [None]:
# plot of positional occurrence of words
text4.dispersion_plot(["god", "citizens"])  

#### 1.2 Counting Vocabulary

In [None]:
# remove duplicate words (dictionary or type of words)
set(text3)  

In [None]:
# show sorted
sorted(set(text3))  

In [None]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in *text*.

    *text* is any sized iterable of hashable tokens (a list of words,
    a string, ...).  An empty *text* yields 0.0 instead of raising
    ZeroDivisionError.
    """
    total = len(text)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return len(set(text)) / total

In [None]:
text5.count("Yo")

### <font color='green'>2. A closer look at Python</font>
#### 2.1 Lists

In [None]:
ex = ["im", "here", "boy"]
# count specified element of list
ex.count("im") 

In [None]:
# Concatenate two lists
["hey", "you"] + ["are", "good"]  

In [None]:
ex.append("nice")

#### 2.2 Indexing Lists

In [None]:
text4[173]

In [None]:
# index of the first occurrence of the word
text4.index("god")  

In [None]:
# slicing (in this case gets items from position -100 to last item)
text5[-100:]  

In [None]:
# modify group of items
ex[1:3] = ["second", "third"]  

#### 2.3 Strings

In [None]:
name = "Mahdi"
# index a string
name[2]

In [None]:
# slice a string
name[:4]  

In [None]:
# multiply a string
name * 2  

In [None]:
# concatenate a string
name + "!"  

In [None]:
' '.join(["hello", "world"])

In [None]:
"My name is Mahdi".split()

### <font color='green'>3. Computing with language</font>
#### 3.1 Frequency Distribution

In [None]:
# nltk's built-in frequency distribution
fdist1 = FreqDist(text1)  

In [None]:
fdist1.most_common(50)

In [None]:
# FreqDist format: [('Hello', 34),('Mahdi', 1344),('Ali', 123), ...]

In [None]:
# get frequency of word
fdist1["shame"]  

In [None]:
# cumulative frequency plot
fdist1.plot(50, cumulative=True)  

In [None]:
# words that occur only once
fdist1.hapaxes()  

#### 3.2 Fine-grained Selection of Words

In [None]:
# list comprehension concept

In [None]:
long_words = [w for w in set(text4) if len(w) > 15]

In [None]:
# build the frequency distribution once instead of once per candidate word
fdist5 = FreqDist(text5)
# words longer than 7 characters that occur more than 7 times
informative = [w for w in set(text5) if len(w) > 7 and fdist5[w] > 7]

#### 3.3 Collocations and Bigrams

In [None]:
# A collocation is a sequence of words that occur together unusually often like -> united states, persian gulf
# Bigrams are just pair of words

In [None]:
from nltk import bigrams
list(bigrams(['more', 'is', 'said', 'than', 'done']))

In [None]:
# bigram format: [("my", "name"), ("name", "is"), ("is", "Mahdi"), ...]

In [None]:
text5.collocations()

#### 3.4 Counting Other Things

In [None]:
fdist = FreqDist(len(w) for w in text3)

In [None]:
# most frequent item
fdist.max() 

In [None]:
# frequency of 3 in our FreqDist
fdist.freq(3) 

In [None]:
# total number of samples
fdist.N() 

In [None]:
# tabulate the frequency distribution
fdist.tabulate()  

In [None]:
# update fdist1 with counts from fdist2
fdist1 |= fdist2  

In [None]:
# test if samples in fdist1 occur less frequently than in fdist2
fdist1 < fdist2

### <font color='green'>4. Python decision and control</font>
#### 4.1 Conditions

In [None]:
# word comparison operators
s.startswith(t) # test if s starts with t
s.endswith(t)   # test if s ends with t
t in s          # test if t is a substring of s
s.islower()     # test if s contains cased characters and all are lowercase
s.isupper()     # test if s contains cased characters and all are uppercase
s.isalpha()     # test if s is non-empty and all characters in s are alphabetic
s.isalnum()     # test if s is non-empty and all characters in s are alphanumeric
s.isdigit()     # test if s is non-empty and all characters in s are digits
s.istitle()     # test if s contains cased characters and is titlecased (i.e. all words in s have initial capitals)

------

## <font color='blue'>Chapter 2 : Accessing Text Corpora and Lexical Resources</font>

-----

### <font color='green'>1. Accessing Text Corpora</font>
#### 1.1 Gutenberg Corpus

In [None]:
# 25,000 free electronic books
from nltk.corpus import gutenberg

In [None]:
 # file ids of this corpus
gutenberg.fileids()

In [None]:
# words of this fileid
gutenberg.words('austen-emma.txt')

In [None]:
# using prior tasks of chapter 1 on nltk's corpus
# the package itself must be imported before nltk.Text / nltk.corpus can be used
import nltk
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")
emma.similar("surprize")

In [None]:
# raw text of this fileid
gutenberg.raw('austen-emma.txt')

In [None]:
# sentences of this fileid
gutenberg.sents('austen-emma.txt')

In [None]:
fileid = 'austen-emma.txt'
num_chars = len(gutenberg.raw(fileid))
num_words = len(gutenberg.words(fileid))
num_sents = len(gutenberg.sents(fileid))
num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
# average word length
avg_word_len = num_chars / num_words
# average sentence length
avg_sent_len = num_words / num_sents
lexical_diversity = num_words / num_vocab

#### 1.2 Web and Chat Text

In [None]:
"""
Firefox discussion forum, conversations overheard in New York,
the movie script of Pirates of the Carribean, personal advertisements, and wine reviews
"""
from nltk.corpus import webtext

In [None]:
# instant messaging chat sessions
from nltk.corpus import nps_chat

In [None]:
# chatroom of 20's which contain 706 posts collected on 10-19-2006
nps_chat.posts("10-19-20s_706posts.xml")

#### 1.3 Brown Corpus

In [None]:
# this corpus contains categorized text
from nltk.corpus import brown

In [None]:
# get categories
brown.categories()

In [None]:
# get words by category
brown.words(categories='news')
# get words by fileid
brown.words(fileids=['cg22'])

In [None]:
# get sentences by category
brown.sents(categories=['news', 'editorial', 'reviews'])

In [None]:
# count wh modals within each category
from nltk import ConditionalFreqDist
cfd = ConditionalFreqDist(
           (genre, word)
           for genre in brown.categories()
           for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

#### 1.4 Reuters Corpus

In [None]:
"""
10,788 news documents totaling 1.3 million words. 
The documents have been classified into 90 topics, and grouped into two sets, called "training" and "test"
Unlike the Brown Corpus, categories in the Reuters corpus overlap with each other, 
simply because a news story often covers multiple topics
"""
from nltk.corpus import reuters

In [None]:
# get fileid
reuters.fileids()
reuters.fileids('barley')
reuters.fileids(['barley', 'corn'])

In [None]:
# get category
reuters.categories()
reuters.categories('training/9865')
reuters.categories(['training/9865', 'training/9880'])

In [None]:
# get words
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories='barley')
reuters.words(categories=['barley', 'corn'])

#### 1.5 Inaugural Address Corpus

In [None]:
"""
the corpus is actually a collection of 55 texts, one for each presidential address.
An interesting property of this collection is its time dimension
"""
from nltk.corpus import inaugural

#### 1.7 Corpora in Other Languages

In [None]:
# Spanish corpus
from nltk.corpus import cess_esp
# Indian corpus
from nltk.corpus import indian
# Universal Declaration of Human Rights
from nltk.corpus import udhr

In [None]:
# file ids of the Universal Declaration of Human Rights corpus
udhr.fileids()

In [None]:
udhr.words('Farsi_Persian-UTF8')

#### 1.8 Text Corpus Structure

In [None]:
# basic corpus functionality
fileids()  # the files of the corpus
fileids([categories])  # the files of the corpus corresponding to these categories
categories()  # the categories of the corpus
categories([fileids])  # the categories of the corpus corresponding to these files
raw()  # the raw content of the corpus
raw(fileids=[f1,f2,f3])  # the raw content of the specified files
raw(categories=[c1,c2])  # the raw content of the specified categories
words()  # the words of the whole corpus
words(fileids=[f1,f2,f3])  # the words of the specified fileids
words(categories=[c1,c2])  # the words of the specified categories
sents()  # the sentences of the whole corpus
sents(fileids=[f1,f2,f3])  # the sentences of the specified fileids
sents(categories=[c1,c2])  # the sentences of the specified categories
abspath(fileid)  # the location of the given file on disk
encoding(fileid)  # the encoding of the file (if known)
open(fileid)  # open a stream for reading the given corpus file
root  # if the path to the root of locally installed corpus
readme()  # the contents of the README file of the corpus

#### 1.9 Loading your own Corpus

In [None]:
from nltk.corpus import PlaintextCorpusReader

In [None]:
corpus_root = "C:\\Users\\Mahdi\\Desktop"
# raw string so the regex escapes (\w, \.) reach the regex engine untouched
wordlists = PlaintextCorpusReader(corpus_root, r'[\w]*\.txt')
wordlists.fileids()

### <font color='green'>2. Conditional Frequency Distributions</font>

In [None]:
# A conditional frequency distribution is a collection of frequency distributions, each one for a different "condition"

#### 2.1 Conditions and Events

In [None]:
# we need to process pair of events in Conditional Frequency Distribution
[('news', 'The'), ('news', 'Fulton'), ('romance', 'County'), ...]

In [None]:
# Each pair has the form (condition, event)

#### 2.2 Counting Words by Genre

In [None]:
# get all conditions
cfd.conditions()

In [None]:
# get specified condition
cfd['news']

In [None]:
# most common in conditional frequency distribution
cfd['romance'].most_common(20)

In [None]:
# get specified word of specified condition
cfd['romance']['could']

#### 2.3 Plotting and Tabulating Distributions

In [None]:
cfd.tabulate(conditions=[], samples=[], cumulative=True/False)

In [None]:
cfd.plot()

#### 2.4 Generating Random Text with Bigrams

In [None]:
def generate_model(cfdist, word, num=15):
    """Print a chain of *num* words, starting at *word*.

    After each word is printed (followed by a space), the next word is
    taken as ``cfdist[current].max()``.  Nothing is returned.
    """
    current = word
    for _ in range(num):
        print(current, end=' ')
        current = cfdist[current].max()

# the package itself must be imported before nltk.corpus / nltk.bigrams can be used
import nltk

text = nltk.corpus.genesis.words('english-kjv.txt')
# named bigram_pairs so the imported bigrams() function is not overwritten
bigram_pairs = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigram_pairs)

In [None]:
generate_model(cfd, 'living')

In [None]:
# Conditional Frequency Distribution methods
cfdist = ConditionalFreqDist(pairs)  # create a conditional frequency distribution from a list of pairs
cfdist.conditions()  # the conditions
cfdist[condition]  # the frequency distribution for this condition
cfdist[condition][sample]  # frequency for the given sample for this condition
cfdist.tabulate()  # tabulate the conditional frequency distribution
cfdist.tabulate(samples, conditions)  # tabulation limited to the specified samples and conditions
cfdist.plot()  # graphical plot of the conditional frequency distribution
cfdist.plot(samples, conditions)  # graphical plot limited to the specified samples and conditions
cfdist1 < cfdist2  # test if samples in cfdist1 occur less frequently than in cfdist2

### <font color='green'>4. Lexical Resources</font>

In [None]:
"""
A lexicon, or lexical resource, is a collection of words and/or phrases
along with associated information such as part of speech and sense definitions
A lexical entry consists of a headword (also known as a lemma) 
along with additional information such as the part of speech and the sense definition
"""

In [None]:
# example of lexicon : FreqDist, Dictionary, cfd, ...

In [None]:
# Two distinct words having the same spelling are called homonyms

#### 4.1 Wordlist Corpora

In [None]:
# The Words Corpus is used by some spell checkers
from nltk.corpus import words

In [None]:
"""
stopwords, that is, high-frequency words like the, to and also
that we sometimes want to filter out of a document before further processing
"""
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
# unusual words are those which are not in dictionary
def unusual_words(text):
    """Return the sorted lowercase alphabetic tokens of *text* that are
    absent from the lowercased ``words.words()`` list."""
    seen = {token.lower() for token in text if token.isalpha()}
    dictionary = {entry.lower() for entry in words.words()}
    return sorted(seen.difference(dictionary))

In [None]:
# 8,000 first names categorized by gender
from nltk.corpus import names

#### 4.2 Pronouncing Dictionary

In [None]:
# CMU Pronouncing Dictionary for US English, which was designed for use by speech synthesizers
from nltk.corpus import cmudict

In [None]:
# format of output [('word',['PH1', 'PH2', 'PH3', ...]), (), (), ...]
cmudict.entries()

In [None]:
# dictionary of words
cmudict.dict()
# text-to-speech
cmudict.dict()['fire']

#### 4.3 Comparative Wordlists

In [None]:
"""
NLTK includes so-called Swadesh wordlists, lists of about 200 common words in several languages.
The languages are identified using an ISO 639 two-letter code
"""
from nltk.corpus import swadesh

In [None]:
# get file ids (output two-letter ISO code for each lang)
swadesh.fileids()

In [None]:
# get english words in the corpus
swadesh.words('en')

In [None]:
# French to English translator
fr2en = swadesh.entries(['fr', 'en'])
translate = dict(fr2en)
translate['chien']

#### 4.4 Shoebox and Toolbox Lexicons

In [None]:
"""
A Toolbox file consists of a collection of entries, where each entry is made up of one or more fields. 
Most fields are optional or repeatable, 
which means that this kind of lexical resource cannot be treated as a table or spreadsheet
"""
from nltk.corpus import toolbox

In [None]:
# a dictionary for the Rotokas language
toolbox.entries('rotokas.dic')

### <font color='green'>5. WordNet</font>

In [None]:
"""
WordNet is a semantically-oriented dictionary of English,
similar to a traditional thesaurus but with a richer structure
""" 
from nltk.corpus import wordnet

#### 5.1 Senses and Synonyms

In [None]:
# get synonym set of a word as Synset object
wordnet.synsets('automobile')

In [None]:
# get a synset with specific POS
wordnet.synsets('good', wordnet.NOUN)

In [None]:
# the collection of synonymous words in this synset (its lemmas)
wordnet.synset('car.n.01').lemma_names()

In [None]:
wordnet.synset('car.n.01').definition()

In [None]:
wordnet.synset('car.n.01').examples()

In [None]:
# get all the lemmas for a given synset
wordnet.synset('car.n.01').lemmas()

In [None]:
# look up a particular lemma
wordnet.lemma('car.n.01.automobile')

In [None]:
# get the synset corresponding to a lemma
wordnet.lemma('car.n.01.automobile').synset()

In [None]:
# get the name of a lemma
wordnet.lemma('car.n.01.automobile').name()

In [None]:
# get all the lemmas of specified word
wordnet.lemmas('car')

#### 5.2 The WordNet Hierarchy

In [None]:
# get child nodes in synset hierarchy
wordnet.synset('car.n.01').hyponyms()

In [None]:
# get parent nodes in synset hierarchy
wordnet.synset('car.n.01').hypernyms()

In [None]:
# get hypernym paths to the root of tree
wordnet.synset('car.n.01').hypernym_paths()

In [None]:
# get root hypernyms of specified synset
wordnet.synset('car.n.01').root_hypernyms()

#### 5.3 More Lexical Relations

In [None]:
"""
Another important way to navigate the WordNet network is from items to their components (meronyms) 
or to the things they are contained in (holonyms)
"""

In [None]:
# the part's(organ) of a word which contains -> in this case 'crown', 'limb', ...
wordnet.synset('tree.n.01').part_meronyms()

In [None]:
# the substance of a word which made of it -> in this case 'heartwood', 'sapwood'
wordnet.synset('tree.n.01').substance_meronyms()

In [None]:
# the word is member of what words -> in this case earth is member of  'solar_system'
wordnet.synset('earth.n.01').member_holonyms()

In [None]:
# verbs which are the consequences of this action -> in this case walk consists of 'step'
wordnet.synset('walk.v.01').entailments()

In [None]:
# contrast of a lemma
wordnet.lemma('supply.n.02.supply').antonyms()

#### 5.4 Semantic Similarity

In [None]:
# get lowest common hypernyms in hierarchy of two synsets
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')
cat.lowest_common_hypernyms(dog)

In [None]:
# get min depth of synset in hierarchy of synsets
wordnet.synset('cat.n.01').min_depth()

In [None]:
"""
path_similarity assigns a score in the range 0–1 based on the shortest path
that connects the concepts in the hypernym hierarchy 
(-1 is returned in those cases where a path cannot be found)
"""
engineer = wordnet.synset('engineer.n.01')
architect = wordnet.synset('architect.n.01')
architect.path_similarity(engineer)

-----

# <font color='red'>Jacob Book</font>

## <font color='blue'>Chapter 1 : Tokenizing Text and WordNet Basics</font>

#### 1.2 Tokenizing text into sentences

In [None]:
"""
This tokenizer divides a text into a list of sentences, 
by using an unsupervised algorithm to build a model for abbreviation words
, collocations, and words that start sentences
"""
from nltk.tokenize import sent_tokenize

In [None]:
# sent tokenizer
sent_tokenize('Hi students! Welcome to our course.')

In [None]:
# create a PunktSentenceTokenizer instance to reuse our tokenizer
import nltk.data
# resource names are '/'-separated; backslashes would also be read as string escapes
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer.tokenize('Hi students! Welcome to our course.')

##### 1.2.2 Tokenizing Farsi text into sentences

In [None]:
# farsi sent tokenizer
from hazm import sent_tokenize
para = 'سلام. این متن، جهت تست نوشته شده است'
sent_tokenize(para)

#### 1.3 Tokenizing sentences into words

In [None]:
# version 1 word tokenizer
from nltk.tokenize import word_tokenize
word_tokenize('Hello world!')

In [None]:
# version 2 word tokenizer
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize('Hello world!')

In [None]:
# version 3 word tokenizer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("can't is a contraction.")

In [None]:
# farsi word tokenizer
from hazm import word_tokenize
word_tokenize('این جمله، برای تست نوشته شده است.')

#### 1.4 Tokenizing sentences using regular expressions

In [None]:
# version 1 regex tokenizer
from nltk.tokenize import RegexpTokenizer
# raw string keeps \w as a regex escape rather than an (invalid) string escape
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("can't is a contraction.")

In [None]:
# version 2 regex tokenizer (without instantiation)
from nltk.tokenize import regexp_tokenize
# raw string keeps \w as a regex escape rather than an (invalid) string escape
regexp_tokenize("can't is a contraction.", r"[\w']+")

In [None]:
# regex tokenizer that splits on gaps (the pattern matches the separators)
from nltk.tokenize import RegexpTokenizer
# raw string keeps \s as a regex escape rather than an (invalid) string escape
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize("can't is a contraction.")

#### 1.5 Training a sentence tokenizer

In [None]:
# using webtext corpus
# PunktSentenceTokenizer uses an unsupervised ML algorithm
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
text = webtext.raw('overheard.txt')
sent_tokenizer = PunktSentenceTokenizer(text)
sent_tokenizer.tokenize("Hello! aren't you ready for battle?")

In [None]:
# reading text version 2
with open('C:\\Users\\Mahdi\\AppData\\Roaming\\nltk_data\\corpora\\webtext\\overheard.txt', encoding='ISO-8859-2') as f:
    text = f.read()
sent_tokenizer = PunktSentenceTokenizer(text)

#### 1.6 Filtering stopwords in a tokenized sentence

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# load the stopword list once and keep it as a set for fast membership tests
english_stops = set(stopwords.words('english'))
words = word_tokenize("Here is a good city")
[w for w in words if w not in english_stops]

#### 1.7 Looking up Synsets for a word in WordNet

In [None]:
# get POS of a synset
wordnet.synset('book.n.01').pos()

In [None]:
# get synset with specified POS
wordnet.synsets('great')
wordnet.synsets('great', pos = 'n')
wordnet.synsets('great', pos = 'a')

#### 1.8 Looking up lemmas and synonyms in WordNet

#### 1.9 Calculating WordNet Synset similarity

In [None]:
# version 1 similarity measure
dog = wordnet.synset('dog.n.01')
cat = wordnet.synset('cat.n.01')
cat.wup_similarity(dog)

In [None]:
# shortest path distance between two synsets
cat.shortest_path_distance(dog)

In [None]:
# version 2 similarity measure
cat.lch_similarity(dog)

#### 1.10 Discovering word collocations

In [None]:
# BigramCollocationFinder
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
words = [w.lower() for w in webtext.words('grail.txt')]
bcf = BigramCollocationFinder.from_words(words)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)  # get top 4 collocations of the text

In [None]:
# first 4 bigram collocation along with their score as a tuple of (collocation, score)
bcf.score_ngrams(BigramAssocMeasures.likelihood_ratio)[:4]

In [None]:
# get bigram collocations above specific score
list(bcf.above_score(BigramAssocMeasures.likelihood_ratio, 220))

In [None]:
# filter stopwords in BigramCollocationFinder
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset  # this will filter words which are stopwords or smaller than 3
bcf.apply_word_filter(filter_stops)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)  # get top 4 collocations of the text

In [None]:
# TrigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
words = [w.lower() for w in webtext.words('singles.txt')]
tcf = TrigramCollocationFinder.from_words(words)
tcf.apply_word_filter(filter_stops)  # fiter stopwords
tcf.apply_freq_filter(3)  # filter less frequent words
tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)  # get top 4 collocations of the text

------

## <font color='blue'>Chapter 2 : Replacing and Correcting Words</font>

#### 2.1 Stemming words 

In [None]:
# Stemming is to remove affixes from words

In [None]:
# porter stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('cookery')  # output 'cookeri'

In [None]:
# lancaster stemmer
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
stemmer.stem('cookery')  # output 'cookery'

In [None]:
# farsi stemmer
from hazm import Stemmer
stemmer = Stemmer()
stemmer.stem('میرویم')  # output 'میرو'

In [None]:
# regex stemmer that removes any prefix or suffix that matches the expression
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')
stemmer.stem('booking')  # output 'book'

In [None]:
# snowball stemmer contains 13 languages along with 2 english stemmer
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('spanish')
stemmer.stem('mahalo')  # output 'mahal'

In [None]:
# languages used in snowball stemmer (class variable)
from nltk.stem import SnowballStemmer
stemmer.languages

#### 2.2 Lemmatizing words with WordNet

In [None]:
"""
lemma is root of a word
unlike stem it's a valid word
look at the meaning
"""

In [None]:
# wordnet lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('cooking')

In [None]:
# default pos for lemmatize method is pos=NOUN
lemmatizer.lemmatize('cooking', pos='v')

In [None]:
# farsi lemmatizer
from hazm import Lemmatizer
lemmatizer = Lemmatizer()
lemmatizer.lemmatize('می‌روم')

#### 2.3 Replacing words matching regular expressions

In [None]:
# regexp replacer
import re
# (regex, replacement) pairs, applied in order; specific contractions come
# before the generic suffix rules so that e.g. "won't" is not split as "wo not".
# Replacement templates are raw strings so \g<1> is a group reference, not an
# (invalid) string escape.
replacement_patterns = [
     (r'won\'t', 'will not'),
     (r'can\'t', 'cannot'),
     (r'([iI])\'m', r'\g<1> am'),  # [iI] only; a comma in the class would match ",'m"
     (r'ain\'t', 'is not'),
     (r'(\w+)\'ll', r'\g<1> will'),
     (r'(\w+)n\'t', r'\g<1> not'),
     (r'(\w+)\'ve', r'\g<1> have'),
     (r'(\w+)\'s', r'\g<1> is'),
     (r'(\w+)\'re', r'\g<1> are'),
     (r'(\w+)\'d', r'\g<1> would')
]


class RegexpReplacer(object):
    """Expand contractions by applying a list of regex substitutions in order."""

    def __init__(self, patterns=replacement_patterns):
        """Compile each ``(regex, replacement)`` pair of *patterns* once."""
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return *text* with every pattern substituted, in list order."""
        s = text
        for pattern, repl in self.patterns:
            # use the compiled pattern directly instead of routing it through re.sub
            s = pattern.sub(repl, s)
        return s

In [None]:
# regexp replacer usage
replacer = RegexpReplacer(replacement_patterns)
replacer.replace("i'm not gonna do that, you can't be serious, i'd rather not to go there")

#### 2.4 Removing repeating characters

In [None]:
# repeat replacer
import re
from nltk.corpus import wordnet

class RepeatReplacer:
    """Collapse repeated characters one at a time until WordNet knows the word."""

    def __init__(self):
        # (prefix)(char)(same char again)(suffix) -> prefix + char + suffix
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return *word* with doubled characters removed step by step.

        Stops as soon as ``wordnet.synsets`` returns a non-empty result
        for the current candidate, or when a substitution no longer
        changes it.
        """
        candidate = word
        while not wordnet.synsets(candidate):
            collapsed = self.repeat_regexp.sub(self.repl, candidate)
            if collapsed == candidate:
                break
            candidate = collapsed
        return candidate

In [None]:
# repeat replacer usage
repeat_replacer = RepeatReplacer()
repeat_replacer.replace('goooood')

#### 2.5 Spelling correction with Enchant

In [None]:
# spell checker
import enchant
from nltk.metrics import edit_distance
class SpellingReplacer(object):
    """Replace a misspelled word with the dictionary's first suggestion."""

    def __init__(self, dict_name='en', max_dist=2):
        """Open the enchant dictionary *dict_name* and store *max_dist*."""
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def replace(self, word):
        """Return a corrected spelling of *word*, or *word* itself.

        *word* is returned unchanged when the dictionary accepts it, when
        there are no suggestions, or when the first suggestion is more than
        ``max_dist`` edits away.
        """
        checker = self.spell_dict
        if checker.check(word):
            return word
        candidates = checker.suggest(word)
        if not candidates:
            return word
        if edit_distance(word, candidates[0]) <= self.max_dist:
            return candidates[0]
        return word

In [None]:
# spell checker usage
spell_checker = SpellingReplacer()
spell_checker.replace('cookbok')

In [None]:
# enchant list languages
enchant.list_languages()

In [None]:
# enchant check if language exist
enchant.dict_exists('en_GB')

In [None]:
# enchant check word correctness
import enchant
enchant.Dict('en_US').check('theater')

In [None]:
# custom spell checker
class CustomSpellingReplacer(SpellingReplacer):
    """SpellingReplacer variant that is handed a ready-made dictionary object."""

    def __init__(self, spell_dict, max_dist=2):
        # The parent initializer is not invoked; both attributes are set directly.
        self.max_dist = max_dist
        self.spell_dict = spell_dict

In [None]:
# custom spell checker with personal word list
d = enchant.DictWithPWL('en_US', 'my_words.txt')
replacer = CustomSpellingReplacer(d)
replacer.replace('bok')

#### 2.6 Replacing Synonyms

In [None]:
# base class for word replacer -> this class replace a word by it's synonym from our dictionary
class WordReplacer(object):
    """Look words up in a mapping and return their replacement."""

    def __init__(self, word_map):
        """Keep *word_map*, a mapping from a word to its replacement."""
        self.word_map = word_map

    def replace(self, word):
        """Return the mapped value for *word*, or *word* itself if unmapped."""
        mapping = self.word_map
        return mapping.get(word, word)

In [None]:
# base word replacer example usage
word_replacer = WordReplacer({'bday': 'birthday'})
word_replacer.replace('bday')

In [None]:
# word replacer from csv file
import csv

class CsvWordReplacer(WordReplacer):
    """WordReplacer whose mapping is read from a two-column CSV file."""

    def __init__(self, fname):
        """Load ``word,synonym`` rows from *fname* into the word map.

        Raises ValueError if a row does not contain exactly two fields.
        """
        word_map = {}
        # the context manager closes the file; newline='' is what the csv module expects
        with open(fname, newline='') as csv_file:
            for word, syn in csv.reader(csv_file):
                word_map[word] = syn
        super().__init__(word_map)

In [None]:
# csv word replacer example usage
replacer = CsvWordReplacer('csv_test.csv')
replacer.replace('lol')

In [None]:
# word replacer from yaml file
import yaml

class YamlWordReplacer(WordReplacer):
    """WordReplacer whose mapping is read from a YAML file."""

    def __init__(self, fname):
        """Load the word map from the YAML document in *fname*."""
        # safe_load: yaml.load needs an explicit Loader on current PyYAML and can
        # build arbitrary Python objects; a plain mapping needs neither.
        with open(fname) as yaml_file:
            word_map = yaml.safe_load(yaml_file)
        super().__init__(word_map)

In [None]:
# yaml word replacer example usage
replacer = YamlWordReplacer('yaml_test.yaml')
replacer.replace('bday')

#### 2.7 Replacing negations with antonyms

In [None]:
# antonym replacer -> replace negative words with positive ones
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

class AntonymReplacer(object):
    """Rewrite "not <word>" as the word's antonym when it is unambiguous."""

    def replace(self, word, pos=None):
        """Return the only antonym of *word* found in WordNet, else None.

        Antonym names are collected over every lemma of every synset of
        *word* (restricted to *pos* when given).  A result is returned only
        if exactly one distinct name was collected.
        """
        candidates = {
            opposite.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lemma in synset.lemmas()
            for opposite in lemma.antonyms()
        }
        # zero or several candidates: no unambiguous replacement
        return candidates.pop() if len(candidates) == 1 else None

    def replace_negations(self, orig_sent):
        """Tokenize *orig_sent* and return its tokens as a list, with each
        'not' + following token replaced by ``self.replace(token)`` whenever
        that call returns a truthy value."""
        tokens = word_tokenize(orig_sent)
        count = len(tokens)
        output = []
        skip_next = False
        for idx, token in enumerate(tokens):
            if skip_next:
                # this token was already consumed together with the preceding 'not'
                skip_next = False
                continue
            if token == 'not' and idx + 1 < count:
                opposite = self.replace(tokens[idx + 1])
                if opposite:
                    output.append(opposite)
                    skip_next = True
                    continue
            output.append(token)
        return output

In [None]:
# Antonym replacer example usage
replacer = AntonymReplacer()
replacer.replace_negations("Let's not uglify our code!")

In [None]:
# inherit __init__ and replace from WordReplacer and replace_negations from AntonymReplacer
class AntonymWordReplacer(WordReplacer, AntonymReplacer):
    """Combine WordReplacer and AntonymReplacer; adds no members of its own.

    WordReplacer is listed first, so its ``__init__`` and ``replace`` take
    precedence, while ``replace_negations`` comes from AntonymReplacer.
    """

In [None]:
# AntonymWordReplacer example usage
replacer = AntonymWordReplacer({'ruin':'build'})
replacer.replace_negations("Let's not ruin our code!")

-----------

## <font color='blue'>Chapter 4 : Part-of-speech Tagging</font>

#### 4.2 Default tagging

In [None]:
# for every word will give default specified tag in constructor
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
tagger.tag(['This', 'is', 'just', 'for', 'test'])  # output -> [('This', 'NN'), ('is', 'NN'), ...]

In [None]:
# treebank corpus which is a tagged corpus
from nltk.corpus import treebank

In [None]:
# get treebank sents
treebank.sents()  # output -> [['w', 'w', 'w', ...], [], ...]

In [None]:
# get treebank tagged sents
treebank.tagged_sents()  # output -> [[('w', 'tag'), ('w', 'tag'), ...], [], ...]

In [None]:
# evaluate default tagger with treebank tagged corpus
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)  # output -> 0.143

In [None]:
# tagging sentences with default tagger
tagger.tag_sents([['Hi', 'students'], ['How', 'are', 'you']])

In [None]:
# untagging a tagged sentence
from nltk.tag import untag
untag([('Hello', 'NN'), ('students', 'NN')])

#### 4.3 Training a unigram part-of-speech tagger

In [None]:
# unigram tagger 
from nltk.tag import UnigramTagger
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)

In [None]:
# tag with unigram tagger
tagger.tag(treebank.sents()[0])

In [None]:
# evaluate unigram tagger
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)  # output -> 0.858

In [None]:
# override context model
tagger = UnigramTagger(model={'Pierre': 'NN'})

In [None]:
# set minimum frequency cutoff for unigram tagger
tagger = UnigramTagger(train_sents, cutoff=3)

#### 4.4 Combining taggers with backoff tagging

In [None]:
# backoff will be used for those words which don't get any tag with one tagger
# in this case untagged words goes to next tagger method
tagger1 = DefaultTagger('NN')
tagger2 = UnigramTagger(train_sents, backoff=tagger1)
tagger2.evaluate(test_sents)

In [None]:
# get the taggers in use by this object
tagger1._taggers

In [None]:
# check if taggers in use are equal to specified taggers
tagger2._taggers == [tagger2, tagger1]

In [None]:
# save and load a trained tagger as pickle file
import pickle
# context managers close the file even if dump/load raises
with open('tagger.pickle', 'wb') as f:
    pickle.dump(tagger, f)  # save model
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary code
with open('tagger.pickle', 'rb') as f:
    tagger = pickle.load(f)  # load model

#### 4.5 Training and combining ngram taggers

In [None]:
"""
NgramTagger childs -> UnigramTagger, BigramTagger, TrigramTagger
NgramTagger parent tree -> ContextTagger -> SequentialBackoffTagger
""" 

In [None]:
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

In [None]:
# BigramTagger usage
bitagger = BigramTagger(train_sents)

In [None]:
# TrigramTagger usage
tritagger = TrigramTagger(train_sents)

In [None]:
# chain taggers together one after another with this backoff tagger method
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Build a chain of taggers, each one backing off to the previous.

    Every class in *tagger_classes* is instantiated in order with
    *train_sents* and the previously built tagger (initially *backoff*) as
    its ``backoff``.  The last tagger built is returned; with an empty
    *tagger_classes* that is *backoff* itself.
    """
    chain = backoff
    for tagger_cls in tagger_classes:
        chain = tagger_cls(train_sents, backoff=chain)
    return chain

In [None]:
# chain taggers together
from nltk.tag import DefaultTagger
backoff = DefaultTagger('NN')
# DefaultTagger -> backoff of UnigramTagger, UnigramTagger -> backoff of BigramTagger, ... 
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff)
tagger.evaluate(test_sents)

In [None]:
# get context tags of BigramTagger
bitagger._context_to_tag  # output -> {((previous_tag,), 'word'):'tag', ():'', ...}

In [None]:
# A 4-gram ("quadgram") tagger built directly from the generic NgramTagger with n=4
from nltk.tag import NgramTagger

quadtagger = NgramTagger(4, train_sents)

In [None]:
# NgramTagger with n fixed to 4, so it has the (train, backoff=...) call shape backoff_tagger expects
class QuadgramTagger(NgramTagger):
    """4-gram tagger: forwards every argument to NgramTagger with n=4 prepended."""

    def __init__(self, *args, **kwargs):
        super().__init__(4, *args, **kwargs)

In [None]:
# QuadgramTagger usage in backoff_tagger
# chain: backoff <- UnigramTagger <- BigramTagger <- TrigramTagger <- QuadgramTagger (returned)
quadtagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger, QuadgramTagger], backoff=backoff)
quadtagger.evaluate(test_sents)

In [None]:
# note : too much context can have a negative effect on accuracy

#### 4.6 Creating a model of likely word tags

In [None]:
# create model for most common words
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.corpus import treebank

def word_tag_model(words, tagged_words, limit=1000):
    """Map each of the `limit` most frequent words to its most frequent tag.

    words        -- iterable of words, used to rank words by frequency
    tagged_words -- iterable of (word, tag) pairs, used to count tags per word
    limit        -- how many of the most common words to include

    Returns a dict {word: tag}.
    """
    word_counts = FreqDist(words)
    tags_by_word = ConditionalFreqDist(tagged_words)
    return {
        word: tags_by_word[word].max()
        for word, _count in word_counts.most_common(limit)
    }

In [None]:
# likely tagger usage
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, DefaultTagger

# NOTE(review): the model is built from the whole treebank corpus; if test_sents is also
# drawn from treebank, the evaluation below has seen its test words - confirm
model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=DefaultTagger('NN'))
likely_tagger = UnigramTagger(model=model, backoff=tagger)  # use custom model of most common words
likely_tagger.evaluate(test_sents)

#### 4.7 Tagging with regular expressions

In [None]:
# regexp tagger: each entry is a (regular expression, tag) pair
from nltk.tag import RegexpTagger

patterns = [
     (r'^\d+$', 'CD'),  # cardinal number, e.g. 123
     (r'.*ing$', 'VBG'), # gerund, e.g. wondering
     (r'.*ment$', 'NN'), # noun, e.g. wonderment
     (r'.*ful$', 'JJ'), # adjective, e.g. wonderful
     (r'.*', 'NN')  # catch-all reached when no earlier pattern matches; acts like DefaultTagger
]
tagger = RegexpTagger(patterns)
tagger.evaluate(test_sents)

In [None]:
# RegexpTagger is a subclass of SequentialBackoffTagger, so it can be chained with other taggers
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger, DefaultTagger

# NOTE(review): the last pattern r'.*' matches every word, so this DefaultTagger backoff is
# presumably never consulted - confirm
regexp_tagger = RegexpTagger(patterns, backoff=DefaultTagger('NN'))
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=regexp_tagger)
tagger.evaluate(test_sents)

#### 4.8 Affix tagging

In [None]:
"""
AffixTagger is a subclass of ContextTagger.
The context is either the prefix or the suffix of a word.
The AffixTagger class learns tags from fixed-length substrings at the beginning or the end of a word.
With the default arguments, words must be at least five characters long; for a shorter word, None is returned as the tag.
"""

In [None]:
# AffixTagger trained with its default affix_length and min_stem_length
from nltk.tag import AffixTagger
tagger = AffixTagger(train_sents)

In [None]:
# a positive affix_length selects a prefix
prefix_tagger = AffixTagger(train_sents, affix_length = 3)  # three character prefix

In [None]:
# a negative affix_length selects a suffix
suffix_tagger = AffixTagger(train_sents, affix_length = -2)  # two character suffix

In [None]:
# view the (private) dictionary that maps each context (affix) to its tag
tagger._context_to_tag

In [None]:
# combine AffixTaggers in a backoff chain:
# the 2-character prefix tagger falls back on the 3-character prefix tagger
pre3 = AffixTagger(train_sents, affix_length=3)
pre2 = AffixTagger(train_sents, affix_length=2, backoff=pre3)

In [None]:
# this acts like a unigram tagger (the whole word is used as the affix)
# if the length of a word is less than (min_stem_length + affix_length), None is returned
tagger = AffixTagger(train_sents, min_stem_length=0, affix_length=0)  # default value of min_stem_length = 2
tagger.evaluate(test_sents)

#### 4.9 Training a Brill tagger

In [None]:
"""
brill tagger is not subclass of SequentialBackoffTagger
it uses some rules to correct the result of initial tagger
"""

In [None]:
# brill tagger
from nltk.tag import brill, brill_trainer # two modules

def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger that corrects the output of `initial_tagger`.

    initial_tagger -- tagger whose output the learned rules will correct
    train_sents    -- tagged training sentences
    **kwargs       -- forwarded unchanged to BrillTaggerTrainer.train()

    Returns the result of trainer.train() (an instance of the BrillTagger class).
    """
    # relative positions looked at by the single-feature templates, in template order
    offsets = ((-1,), (1,), (-2,), (2,), (-2, -1), (1, 2), (-3, -2, -1), (1, 2, 3))
    templates = []
    # first all part-of-speech templates, then the same set for words
    for feature in (brill.Pos, brill.Word):
        templates += [brill.Template(feature(list(positions))) for positions in offsets]
        # two-feature template: previous AND next position together
        templates.append(brill.Template(feature([-1]), feature([1])))
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True)
    return trainer.train(train_sents, **kwargs)

In [None]:
# brill tagger usage: a backoff chain serves as the initial tagger whose output Brill corrects
default_tagger = DefaultTagger('NN')
initial_tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger)
brill_tagger = train_brill_tagger(initial_tagger, train_sents)
brill_tagger.evaluate(test_sents)

#### 4.10 Training the TnT tagger

In [None]:
"""
TnT stands for Trigrams'n'Tags. It is a statistical tagger based on second order Markov models.
based on probability
uses all the ngram models together to choose the best tag
"""

In [None]:
from nltk.tag import tnt
tnt_tagger = tnt.TnT()  # kwarg C makes capitalization significant; default is C=False (capitalization is ignored)
tnt_tagger.train(train_sents) # unlike the previous taggers, you must call train() explicitly
tnt_tagger.evaluate(test_sents)

In [None]:
# to tag unknown words, supply an "unk" tagger such as DefaultTagger, AffixTagger or RegexpTagger
unk = DefaultTagger('NN')  # tagger for unknown words
tnt_tagger = tnt.TnT(unk=unk, Trained=True)  # Trained=True: the unk tagger is already trained
tnt_tagger.train(train_sents)
tnt_tagger.evaluate(test_sents)

#### 4.11 Using WordNet for tagging

In [None]:
"""
WordNet can be useful for tagging unknown words
mapping of wordnet tags to treebank tags
v -> VB
n -> NN
a -> JJ
s -> JJ
r -> RB
"""

In [None]:
# wordnet tagger
from nltk.corpus import wordnet
from nltk.tag import SequentialBackoffTagger
from nltk.probability import FreqDist

class WordNetTagger(SequentialBackoffTagger):
    '''
    Tag a word with the treebank tag of its most common WordNet part of speech.

    >>> wt = WordNetTagger()
    >>> wt.tag(['food', 'is', 'great'])
    [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')]
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # WordNet part-of-speech letter -> treebank tag
        self.wordnet_tag_map = dict(n='NN', s='JJ', a='JJ', r='RB', v='VB')

    def choose_tag(self, tokens, index, history):
        """Return the mapped tag of the most frequent synset POS for tokens[index].

        Words without any synset are tagged 'NN'.
        """
        pos_counts = FreqDist(synset.pos() for synset in wordnet.synsets(tokens[index]))
        if not pos_counts:
            return 'NN'
        return self.wordnet_tag_map.get(pos_counts.max())

In [None]:
# WordNetTagger example usage (needs no training data)
wnt = WordNetTagger()
wnt.tag(['food', 'is', 'great'])

In [None]:
# use the WordNetTagger as the last fallback of a backoff chain
tagger = backoff_tagger(train_sents, tagger_classes=[UnigramTagger, BigramTagger, TrigramTagger], backoff=wnt)
tagger.evaluate(test_sents)

#### 4.12 Tagging proper names

In [None]:
# this class could be chained
from nltk.corpus import names
from nltk.tag import SequentialBackoffTagger

class NamesTagger(SequentialBackoffTagger):
    """Tag any word found in the NLTK names corpus as a proper noun ('NNP')."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # lower-cased so that the lookup in choose_tag is case-insensitive
        self.name_set = {name.lower() for name in names.words()}

    def choose_tag(self, tokens, index, history):
        """Return 'NNP' when tokens[index] is a known name, otherwise None."""
        is_name = tokens[index].lower() in self.name_set
        return 'NNP' if is_name else None  # 'NNP' = proper noun

In [None]:
# NamesTagger example usage
# no backoff is given, so choose_tag's None is all a non-name word gets here
nt = NamesTagger()
nt.tag(['James', 'book'])

#### 4.13 Classifier-based tagging

In [None]:
"""
this class implements a feature detector
The feature detector finds multiple length suffixes, does some regular expression matching, 
and looks at the unigram, bigram, and trigram history to produce a fairly complete set of features for each word
""" 

In [None]:
from nltk.tag.sequential import ClassifierBasedPOSTagger
tagger = ClassifierBasedPOSTagger(train=train_sents) # pass train= by keyword: 'train' is not the constructor's first argument
tagger.evaluate(test_sents)

In [None]:
# Note: this is really slow
# use a custom classifier by passing its training function as classifier_builder
from nltk.classify import MaxentClassifier
me_tagger = ClassifierBasedPOSTagger(train=train_sents, classifier_builder=MaxentClassifier.train)  # NaiveBayes is default

In [None]:
# custom feature detector
from nltk.tag.sequential import ClassifierBasedTagger

def unigram_feature_detector(tokens, index, history):
    """Return the feature dict for tokens[index]: only the word itself.

    A feature detector must return a dictionary of feature-name: feature-value;
    `history` is accepted for the detector signature but not used.
    """
    current_word = tokens[index]
    return dict(word=current_word)

# train a classifier-based tagger whose only feature is the current word
tagger = ClassifierBasedTagger(train=train_sents, feature_detector=unigram_feature_detector)
tagger.evaluate(test_sents)

In [None]:
# a classifier tagger tags everything unless you set cutoff_prob and chain a backoff tagger
default = DefaultTagger('NN')
tagger = ClassifierBasedPOSTagger(train=train_sents, cutoff_prob=0.3, backoff=default)
tagger.evaluate(test_sents)

In [None]:
# ClassifierBasedPOSTagger constructor has a parameter "classifier" which can be used for pretrained classifier
# in this case "classifier_builder" will be ignored

#### 4.14 Training a tagger with NLTK-Trainer

In [None]:
"""
it's impossible to know which methods and parameters will work best without doing training experiments. 
But training experiments can be tedious, since they often involve many small code changes (and lots of cut and paste). 
The author of this book (Jacob Perkins) created a project called NLTK-Trainer.
The project is available on GitHub at https://github.com/japerk/nltk-trainer.
In the terminal, go to the cloned folder and write the following command:
python train_tagger.py treebank
Look closely at the second line of output: 3914 tagged sents, training on 3914. 
Training and evaluating on the same sentences is a very misleading way to evaluate any trained model. 
To train on 75% of the corpus and test on the other 25% (and also skip dumping a pickle file), write the following command:
python train_tagger.py treebank --fraction 0.75 --no-pickle
The first argument to the script is corpus. 
This could be the name of an NLTK corpus that can be found in the nltk.corpus module, such as treebank or brown. 
It could also be the path to a custom corpus directory. 
If it's a path to a custom corpus, then you'll also need to use the --reader argument to specify the corpus reader class,
such as nltk.corpus.reader.tagged.TaggedCorpusReader.
The default training algorithm is aubt, which is shorthand for a sequential backoff tagger composed of 
AffixTagger + UnigramTagger + BigramTagger + TrigramTagger.
You can determine other training algorithms and other options. Why not test it yourself?! Read pages 116-121
"""

----

## <font color='blue'>Chapter 7 : Text Classification</font>

In [None]:
"""
A binary classifier decides between two labels, such as spam detection. The text can be one label or the other, but not both.
A multi-class classifier decides between three or more labels, such as topic detection.
A multi-label classifier can assign one or more labels to a piece of text.
The classifier input corpus format should be -> [(featureset, label)], where featureset is a dict and label is its class.
"""

#### 7.1 Bag of words feature extraction

In [None]:
"""
In this chapter we want to classify a text, so we need a feature set for the text.
A bag of words is simply a dictionary of word presence: each key is a word and each value is True.
Format of a bag of words -> {'word1': True, 'word2': True, 'word3': True, ...}
""" 

In [None]:
# bag of words: presence features, one True entry per distinct word
def bag_of_words(words):
    """Return {word: True} for every distinct word in `words`."""
    return dict.fromkeys(words, True)

In [None]:
# bag of words without the unwanted ("bad") words
def bag_of_words_not_in_set(words, badwords):
    """Return a bag of words for the words of `words` that are not in `badwords`."""
    kept = set(words).difference(set(badwords))
    return bag_of_words(kept)

In [None]:
# bag of words restricted to the wanted ("good") words
def bag_of_words_in_set(words, goodwords):
    """Return a bag of words for the words of `words` that also occur in `goodwords`."""
    kept = set(words).intersection(set(goodwords))
    return bag_of_words(kept)

In [None]:
# exclude stop words from bag of words
from nltk.corpus import stopwords

def bag_of_non_stopwords(words, stopfile='english'):
    """Return a bag of words with the stop words of the `stopfile` list removed."""
    return bag_of_words_not_in_set(words, stopwords.words(stopfile))

In [None]:
# bag of words along with most significant bigrams of the text
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return a bag of words extended with the `n` best-scoring bigrams.

    words    -- sequence of words; it is concatenated with the bigram list using `+`
    score_fn -- association measure used to rank the bigrams
    n        -- number of top bigrams to include
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)
    combined = words + top_bigrams
    return bag_of_words(combined)

#### 7.2 Training a Naive Bayes classifier

In [None]:
# sentiment analysis of the movie_reviews corpus
# the movie_reviews corpus has two labels -> 'pos', 'neg'
from nltk.corpus import movie_reviews
movie_reviews.categories()

In [None]:
# label feats from corpus
# point is to extract features from text and turn them to an input format of Classification algorithm
# output format -> {label: [featureset]}
from collections import defaultdict
   
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    """Group the feature sets of every file in `corp` by the file's category.

    corp             -- categorized corpus offering categories(), fileids() and words()
    feature_detector -- function turning the words of one file into a feature set

    Returns a defaultdict(list) -> {'pos': [{featureset}, ...], 'neg': [...]}.
    """
    label_feats = defaultdict(list)
    labelled_files = (
        (label, fileid)
        for label in corp.categories()
        for fileid in corp.fileids(categories=[label])
    )
    for label, fileid in labelled_files:
        file_words = corp.words(fileids=[fileid])
        label_feats[label].append(feature_detector(file_words))
    return label_feats

In [None]:
# defaultdict is a dict that returns a default ("empty") value when you access a key that does not exist
# defaultdict example 1: count the characters of a string
s = 'mississippi'
d = defaultdict(int)
for k in s:
    d[k] += 1
d.items()
# d['m'] -> 1
# d['q'] -> 0 (missing key, so the int default is returned)

In [None]:
# defaultdict example 2: group the values of (key, value) pairs by key
s = [('yellow', 1), ('blue', 2), ('yellow', 3), ('blue', 4), ('red', 1)]
d = defaultdict(list)
for k, v in s:
    d[k].append(v)
d.items()

In [None]:
# build the train and test lists by splitting the feature sets of every label
def split_label_feats(lfeats, split=0.75):
    """Split {label: [featureset]} into labelled train and test lists.

    For each label the first `split` fraction of its feature sets goes to the
    training list and the rest to the test list; no shuffling is done, so it
    is better to shuffle the data first.

    Returns (train_feats, test_feats), each -> [({bag of words}, 'pos/neg'), ...].
    """
    train_feats, test_feats = [], []
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        head, tail = feats[:cutoff], feats[cutoff:]
        train_feats += [(feat, label) for feat in head]
        test_feats += [(feat, label) for feat in tail]
    return train_feats, test_feats

In [None]:
# split_label_feats example usage
from nltk.corpus import movie_reviews
lfeats = label_feats_from_corpus(movie_reviews)

train_feats, test_feats = split_label_feats(lfeats)
# prints the number of training and test instances
print("%d %d" %(len(train_feats),len(test_feats)))

In [None]:
# NaiveBayesClassifier example usage: train on [(featureset, label)] and list the labels
from nltk.classify import NaiveBayesClassifier
nb = NaiveBayesClassifier.train(train_feats)
nb.labels()

In [None]:
# classify an unseen instance (it must be turned into a feature set first)
feat = bag_of_words(['the', 'plot', 'was', 'fantastic'])
nb.classify(feat)

In [None]:
# evaluate the accuracy of the classifier on the held-out test features
from nltk.classify.util import accuracy
accuracy(nb, test_feats)

In [None]:
# get classification probabilities for an instance
probs = nb.prob_classify(test_feats[0][0]) # test_feats[0] is a tuple: (feats, label)
probs.samples()  # classifier labels
probs.prob('neg')  # probability of neg for this instance
probs.max()  # the label with the highest probability for this instance (not the probability itself)

In [None]:
""" Classifier most informative words
More informative features are those that occur primarily in one label and not on the other. 
The less informative features are those that occur frequently with both labels
"""
# returns the n most informative features as a list
nb.most_informative_features(n=5)  # output format -> [(feature, value), (), (), ...]

In [None]:
# show the most informative words together with the probabilities for each label
nb.show_most_informative_features(n=5)

In [None]:
# NaiveBayesClassifier manual training: build the probability distributions by hand
from nltk.probability import DictionaryProbDist

label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5})  # both labels equally likely
true_probdist = DictionaryProbDist({True: 1})  # the feature value True has probability 1
# keys are (label, feature name) pairs
feature_probdist = {('pos', 'yes'):true_probdist, ('neg', 'no'):true_probdist}
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

In [None]:
# manual NaiveBayesClassifier example usage
# (in a notebook only the value of the last expression is displayed)
classifier.classify({'yes': True})  # output -> 'pos'
classifier.classify({'no': True})  # output -> 'neg'

#### 7.3 Training a decision tree classifier

In [None]:
# Decision Tree Classifier
# support_cutoff: The minimum number of instances that are required to make a decision about a feature.
# binary=True because all features are binary
from nltk.classify import DecisionTreeClassifier
from nltk.classify.util import accuracy
dt = DecisionTreeClassifier.train(train_feats, binary=True, 
                                  entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)
accuracy(dt, test_feats)

In [None]:
# Decision Tree: calculate entropy manually
# remember: the most informative decision has the lowest entropy
from nltk.probability import FreqDist, MLEProbDist, entropy
fd = FreqDist({'pos':30, 'neg':10})
entropy(MLEProbDist(fd))  # output -> 0.811

#### 7.4 Training scikit-learn classifiers

In [None]:
# Sklearn Classifier version 1: multinomial Naive Bayes
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB # Choose and import an sklearn algorithm

sk_classifier = SklearnClassifier(MultinomialNB()) # Construct an SklearnClassifier class with the chosen algorithm
sk_classifier.train(train_feats) # Train the SklearnClassifier class with your training features
accuracy(sk_classifier, test_feats)

In [None]:
# Sklearn Classifier version 2
# Bernoulli: a discrete Naive Bayes classifier for binary (0 or 1) features
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

sk_classifier = SklearnClassifier(BernoulliNB())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

In [None]:
# Sklearn Classifier version 3: logistic regression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression

sk_classifier = SklearnClassifier(LogisticRegression())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

In [None]:
# SVM Classifier version 1: SVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

sk_classifier = SklearnClassifier(SVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

In [None]:
# SVM Classifier version 2: LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC

sk_classifier = SklearnClassifier(LinearSVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

In [None]:
# SVM Classifier version 3: NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import NuSVC

sk_classifier = SklearnClassifier(NuSVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

#### 7.5 Measuring precision and recall of a classifier

In [None]:
# precision recall
import collections
from nltk.metrics import precision, recall

def precision_recall(classifier, testfeats): # the same arguments you pass to accuracy()
    """Compute per-label precision and recall of `classifier` on `testfeats`.

    classifier -- object with classify(feats) and labels()
    testfeats  -- [(featureset, label)], the same input accuracy() takes

    Returns (precisions, recalls), each -> {'label1': float, 'label2': float, ...}.
    """
    refsets = collections.defaultdict(set)   # label -> indices that really have this label
    testsets = collections.defaultdict(set)  # label -> indices the classifier assigned to it
    for index, (feats, label) in enumerate(testfeats):
        refsets[label].add(index)
        testsets[classifier.classify(feats)].add(index)
    precisions, recalls = {}, {}
    for label in classifier.labels():
        reference, observed = refsets[label], testsets[label]
        # precision = len(reference & observed) / len(observed)
        precisions[label] = precision(reference, observed)
        # recall = len(reference & observed) / len(reference)
        recalls[label] = recall(reference, observed)
    return precisions, recalls

In [None]:
# precision_recall example usage
nb_precisions, nb_recalls = precision_recall(nb, test_feats)
nb_precisions['pos']  # precision of the 'pos' label
nb_recalls['neg']  # recall of the 'neg' label

#### 7.6 Considering high information words

In [None]:
# A high information word is a word that is strongly biased towards a single classification label.

In [None]:
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.collections import defaultdict

# labeled_words must have this format -> [('label', ['w', 'w', ...]), (), (),...]
def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the set of words whose score for at least one label is >= min_score.

    labeled_words -- [(label, words)] pairs
    score_fn      -- called as score_fn(n_ii, (n_ix, n_xi), n_xx)
    min_score     -- minimum score a word needs, for some label, to be kept

    Returns a set -> {'w', 'w', 'w', ...}.
    """
    word_fd = FreqDist()                    # word -> frequency over all labels
    label_word_fd = ConditionalFreqDist()   # label -> (word -> frequency)
    for label, words in labeled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    n_xx = label_word_fd.N()  # the total frequency for all words in all labels
    high_info_words = set()
    for label in label_word_fd.conditions():
        label_fd = label_word_fd[label]
        n_xi = label_fd.N()  # the total frequency of all words that occurred for the label
        # n_ii: frequency of the word for this label; word_fd[word]: its frequency over all labels
        high_info_words.update(
            word
            for word, n_ii in label_fd.items()
            if score_fn(n_ii, (word_fd[word], n_xi), n_xx) >= min_score
        )
    return high_info_words

In [None]:
# high information words example usage
from nltk.corpus import movie_reviews

labels = movie_reviews.categories()
labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
high_info_words = high_information_words(labeled_words)

# feature sets restricted to the high information words ("with")
feat_det = lambda words: bag_of_words_in_set(words, high_info_words)  # custom feature detector
lfeats_with = label_feats_from_corpus(movie_reviews, feature_detector=feat_det)
train_feats_with, test_feats_with = split_label_feats(lfeats_with)

# feature sets built from all words ("without"), for comparison
lfeats_without = label_feats_from_corpus(movie_reviews)
train_feats_without, test_feats_without = split_label_feats(lfeats_without)

###### Classifiers WITH high information words

In [None]:
# All classifiers in this cell are trained and evaluated on the feature sets that
# contain only high information words (train_feats_with / test_feats_with).
# NaiveBayes
from nltk.classify import NaiveBayesClassifier, accuracy
nb_classifier = NaiveBayesClassifier.train(train_feats_with)
accuracy(nb_classifier, test_feats_with)
nb_precisions_with, nb_recalls_with = precision_recall(nb_classifier, test_feats_with)
nb_precisions_with['pos']
nb_recalls_with['neg']

# Maxent
from nltk.classify import MaxentClassifier
me_classifier = MaxentClassifier.train(train_feats_with, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)
accuracy(me_classifier, test_feats_with)
me_precisions_with, me_recalls_with = precision_recall(me_classifier, test_feats_with)
me_precisions_with['pos']
me_recalls_with['neg']

# DecisionTree
from nltk.classify import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier.train(train_feats_with, binary=True, depth_cutoff=20, 
                                             support_cutoff=20, entropy_cutoff=0.01)
accuracy(dt_classifier, test_feats_with)

# Sklearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats_with)
accuracy(sk_classifier, test_feats_with)

###### Classifiers WITHOUT high information words

In [None]:
# All classifiers in this cell are trained and evaluated on the feature sets built
# from all words (train_feats_without / test_feats_without).
# They are bound to *_without names so that nb_classifier, me_classifier, dt_classifier
# and sk_classifier keep pointing at the models trained on high information words;
# the voting example further down combines those names and scores them on test_feats_with.
# NaiveBayes
from nltk.classify import NaiveBayesClassifier, accuracy
nb_classifier_without = NaiveBayesClassifier.train(train_feats_without)
accuracy(nb_classifier_without, test_feats_without)
nb_precisions_without, nb_recalls_without = precision_recall(nb_classifier_without, test_feats_without)
nb_precisions_without['pos']
nb_recalls_without['neg']

# Maxent
from nltk.classify import MaxentClassifier
me_classifier_without = MaxentClassifier.train(train_feats_without, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)
accuracy(me_classifier_without, test_feats_without)
me_precisions_without, me_recalls_without = precision_recall(me_classifier_without, test_feats_without)
me_precisions_without['pos']
me_recalls_without['neg']

# DecisionTree
from nltk.classify import DecisionTreeClassifier
dt_classifier_without = DecisionTreeClassifier.train(train_feats_without, binary=True, depth_cutoff=5,
                                                     support_cutoff=30, entropy_cutoff=0.8)
accuracy(dt_classifier_without, test_feats_without)

# Sklearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
sk_classifier_without = SklearnClassifier(LinearSVC()).train(train_feats_without)
accuracy(sk_classifier_without, test_feats_without)

#### 7.7 Combining classifiers with voting

In [None]:
import itertools
from nltk.classify import ClassifierI
from nltk.probability import FreqDist
class MaxVoteClassifier(ClassifierI):
    """Combine several classifiers: the label chosen by most of them wins."""

    def __init__(self, *classifiers):
        self._classifiers = classifiers
        # sorted union of the labels known to any of the classifiers
        every_label = itertools.chain.from_iterable(c.labels() for c in classifiers)
        self._labels = sorted(set(every_label))

    def labels(self):  # required to implement
        return self._labels

    def classify(self, feats):  # required to implement
        """Return the label that receives the most votes for `feats`."""
        votes = FreqDist(classifier.classify(feats) for classifier in self._classifiers)
        return votes.max()

In [None]:
"""
itertools.chain: Make an iterator that returns elements from the first iterable until it is exhausted,
then proceeds to the next iterable, until all of the iterables are exhausted.
Used for treating consecutive sequences as a single sequence
"""
import itertools
# each argument is iterated in turn, so the strings are split into characters
a = itertools.chain('ab', 'cd', [1,2])
for i in a:
    print(i, end=' ')  # output -> a b c d 1 2 

In [None]:
# a single list argument: chain only walks over that list's three elements
a = itertools.chain(['ab', 'cd', [1,2]])
for i in a:
    print(i, end=' ')  # output -> ab cd [1, 2]

In [None]:
# unpacking the list with * passes its elements as separate arguments (same as the first example)
a = itertools.chain(*['ab', 'cd', [1,2]])  # uses *args
for i in a:
    print(i, end=' ')  # output -> a b c d 1 2 

In [None]:
# MaxVoteClassifier example usage
# NOTE(review): evaluation uses test_feats_with, so the four classifiers combined here must be
# the ones trained on train_feats_with - confirm which models these names refer to at this point
mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, me_classifier, sk_classifier)
accuracy(mv_classifier, test_feats_with)
mv_precisions, mv_recalls = precision_recall(mv_classifier, test_feats_with)
mv_precisions['pos']
mv_recalls['neg']

#### 7.8 Multi-label classifiers

In [None]:
# The reuters corpus contains multi-labeled text
# multi label classifier
# multi_label_classifier

In [None]:
from nltk.corpus import reuters
from nltk.metrics import BigramAssocMeasures

def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
    """Return the high information words of the reuters corpus.

    Builds the [(label, words)] list for every reuters category and passes it
    to high_information_words together with `score_fn`.
    """
    labeled_words = [
        (label, reuters.words(categories=[label]))
        for label in reuters.categories()
    ]
    return high_information_words(labeled_words, score_fn=score_fn)

In [None]:
# extract train and test features from the reuters multi-label corpus
def reuters_train_test_feats(feature_detector=bag_of_words): # we will be overriding this using bag_of_words_in_set()
    """Split the reuters corpus into multi-label train and test feature lists.

    A file whose id starts with 'training' goes to the training list; every
    other file goes to the test list.

    Returns (train_feats, test_feats), each -> [(featureset, [label])].
    """
    train_feats, test_feats = [], []
    for fileid in reuters.fileids():
        example = (feature_detector(reuters.words(fileid)), reuters.categories(fileid))
        # pick the list this file belongs to, then append to it in place
        bucket = train_feats if fileid.startswith('training') else test_feats
        bucket.append(example)
    return train_feats, test_feats

In [None]:
# build reuters feature sets restricted to the corpus's high information words
rwords = reuters_high_info_words()
featdet = lambda words: bag_of_words_in_set(words, rwords)
multi_train_feats, multi_test_feats = reuters_train_test_feats(featdet)

In [None]:
# train one binary classifier per label, to be used in a multi-label classifier
# trainf is the training function
# labelled_feats is a list of multi-label feature sets -> [(featureset, [label, ...])]
# labelset is the set of possible labels
def train_binary_classifiers(trainf, labelled_feats, labelset):
    """Train a "label" vs. "!label" classifier for every label in `labelset`.

    A feature set is a positive example for each label it carries and a
    negative example (labelled '!<label>') for every other label of `labelset`.

    Returns {label: trainf(positive examples + negative examples)}.
    """
    positives = collections.defaultdict(list)
    negatives = collections.defaultdict(list)
    for feat, labels in labelled_feats:
        for label in labels:
            positives[label].append(feat)
        for label in labelset - set(labels):
            negatives[label].append(feat)
    return {
        label: trainf(
            [(feat, label) for feat in positives[label]]
            + [(feat, '!%s' % label) for feat in negatives[label]]
        )
        for label in labelset
    }

In [None]:
# train_binary_classifiers example usage: one logistic-regression classifier per reuters category
trainf = lambda train_feats: SklearnClassifier(LogisticRegression()).train(train_feats)
labelset = set(reuters.categories())
classifiers = train_binary_classifiers(trainf, multi_train_feats, labelset)

In [None]:
from nltk.classify import MultiClassifierI

class MultiBinaryClassifier(MultiClassifierI):
    """Multi-label classifier made of one binary classifier per label."""

    def __init__(self, *label_classifiers): # takes labeled classifiers of the form (label, classifier)
        self._label_classifiers = dict(label_classifiers)
        self._labels = sorted(self._label_classifiers)

    def labels(self):  # required to implement
        return self._labels

    def classify(self, feats):  # required to implement
        """Return the set of labels whose binary classifier answers with its own label."""
        return {
            label
            for label, classifier in self._label_classifiers.items()
            if classifier.classify(feats) == label
        }

In [None]:
# each (label, classifier) pair of the dict is passed as a separate positional argument
multi_classifier = MultiBinaryClassifier(*classifiers.items())

In [None]:
"""
To evaluate this classifier, we can use precision and recall, but not accuracy.
That's because the accuracy function assumes single values and doesn't take partial matches into account. 
For example, if the multi_classifier returns three labels for a feature set, 
and two of them are correct but the third is not, then the accuracy() function would mark that as incorrect. 
So, instead of using accuracy, we will use masi distance, which measures the partial overlap between two sets. 
The closer the masi distance is to 0, the better the match; a masi distance close to 1 means there is little or no overlap.
"""

In [None]:
# precision, recall and average masi distance for MultiBinaryClassifier
from nltk.metrics import masi_distance, precision, recall
# multi_classifier is a MultiBinaryClassifier
# test_feats is a multi-label feature set -> [(featureset, [label, ...])]
def multi_metrics(multi_classifier, test_feats):
    """Compute per-label precision and recall plus the average masi distance.

    Returns (precisions, recalls, avg_md) where the first two are
    {label: value} dicts over multi_classifier.labels() and avg_md is the
    mean masi distance between the real and the guessed label sets.
    """
    distances = []  # masi distance of every test instance
    refsets = collections.defaultdict(set)   # label -> indices that really carry the label
    testsets = collections.defaultdict(set)  # label -> indices the classifier gave the label
    for index, (feat, labels) in enumerate(test_feats):
        for label in labels:
            refsets[label].add(index)
        guessed = multi_classifier.classify(feat)
        for label in guessed:
            testsets[label].add(index)
        distances.append(masi_distance(set(labels), guessed))
    avg_md = sum(distances) / len(distances)
    precisions, recalls = {}, {}
    for label in multi_classifier.labels():
        reference, observed = refsets[label], testsets[label]
        precisions[label] = precision(reference, observed)
        recalls[label] = recall(reference, observed)
    return precisions, recalls, avg_md

In [None]:
# evaluate the MultiBinaryClassifier on the reuters test feature sets
multi_precisions, multi_recalls, avg_md = multi_metrics(multi_classifier, multi_test_feats)

In [None]:
# number of files in the 'soybean' category, then the precision and recall for that label
len(reuters.fileids(categories=['soybean']))
multi_precisions['soybean']
multi_recalls['soybean']

In [None]:
"""
note : In general, the labels that have more feature sets will have higher precision and recall, 
and those with less feature sets will have lower performance.
"""

-----

# <font color='red'>Epub File</font>

## <font color='blue'>Article Spinner</font>

In [None]:
""" Unsupervised learning
Article Spinning: Taking an existing article that’s very popular, 
and modifying certain words or phrases so that it doesn’t exactly match the original, 
which then prevents the search engines from marking it as duplicate content
"""

In [None]:
# How? Replace words with their synonyms
# We will use the trigram for this. We’ll use the previous word and the next word to predict the current word:
# P(w(i) | w(i-1), w(i+1))

In [None]:
# libraries been used 
import nltk
import numpy as np
from bs4 import BeautifulSoup # Python library for parsing XML and HTML

In [None]:
# extract positive reviews from Amazon reviews
# read the file inside a context manager so the handle is closed right away
with open('sorted_data_acl/electronics/positive.review') as review_file:
    positive_soup = BeautifulSoup(review_file.read(), 'lxml')
# find_all is the current bs4 name; findAll is only kept as a deprecated alias
positive_review = positive_soup.find_all('review_text')

In [None]:
# build the trigram table that drives the content generation
trigram = {}  # output format -> {(context):[middleWord1, md2, md3, ...]}
for review in positive_review:
    words = nltk.tokenize.word_tokenize(review.text.lower())
    # slide a three-token window over the review: the two outer tokens form
    # the context and the middle token is recorded as one observed filler
    for left, middle, right in zip(words, words[1:], words[2:]):
        trigram.setdefault((left, right), []).append(middle)

In [None]:
# probability vector
from collections import defaultdict

# format -> {(context): {'w': P}}
trigram_probabilities = {}  # each context maps to a word -> probability dictionary

for context, center in trigram.items():
    # a context with a single possible middle word offers nothing to swap in
    if len(set(center)) <= 1:
        continue
    total = len(center)
    counts = defaultdict(float)
    for middle_word in center:
        counts[middle_word] += 1
    # turn the raw counts into relative frequencies, keeping first-seen order
    trigram_probabilities[context] = defaultdict(
        float,
        {middle_word: count / total for middle_word, count in counts.items()},
    )

In [None]:
# generate random sample
import random

def random_sample(d):
    """Draw one word from a word -> probability dictionary.

    A uniform random number is compared against the running sum of the
    probabilities, so each word is picked in proportion to its probability.

    Args:
        d: dictionary mapping each candidate word to its probability.

    Returns:
        The sampled word, or ``None`` when ``d`` is empty.
    """
    r = random.random()
    cum = 0
    w = None  # stays None only when d is empty

    for w, p in d.items():
        cum += p
        if r < cum:
            break
    # Floating-point rounding can leave the final running sum slightly below
    # the draw; fall back to the last word instead of silently returning None.
    return w

In [None]:
def test_spinner(replace_prob=0.2):
    """Print a randomly chosen positive review and a spun version of it.

    Every middle token of a three-token window is, with probability
    ``replace_prob``, replaced by a word sampled from ``trigram_probabilities``
    for the surrounding (previous, next) context. Tokens are replaced in
    place, so a later window can use an already replaced word as context.

    Args:
        replace_prob: chance of attempting a replacement at each position
            (defaults to 0.2, i.e. 20%).
    """
    review = random.choice(positive_review)
    s = review.text.lower()
    print('original: %s' % s)

    tokens = nltk.tokenize.word_tokenize(s)
    for i in range(len(tokens) - 2):
        if random.random() < replace_prob:
            k = (tokens[i], tokens[i + 2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                # random_sample can come back empty-handed; keep the original
                # token then, otherwise the join below fails on a None entry
                if w is not None:
                    tokens[i + 1] = w
    # re-attach the punctuation that the tokenizer split off
    print(" ".join(tokens)
          .replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))

In [None]:
# Article Spinner usage: prints one randomly chosen positive review,
# followed by its spun version
test_spinner()

----