# **References**
This tutorial is based on https://github.com/hb20007/hands-on-nltk-tutorial

# 1.1 **Downloading NLTK Libraries:** *Getting ready to start!*

In [4]:
# We install and import necessary python libraries

# !pipenv install nltk TwitterSearch unidecode langdetect langid gensim tweepy

import nltk # https://www.nltk.org/install.html
import numpy # https://www.scipy.org/install.html
import matplotlib.pyplot # https://matplotlib.org/downloads.html
import tweepy # https://github.com/tweepy/tweepy
import TwitterSearch # https://github.com/ckoepp/TwitterSearch
import unidecode # https://pypi.python.org/pypi/Unidecode
import langdetect # https://pypi.python.org/pypi/langdetect
import langid # https://github.com/saffsd/langid.py
import gensim # https://radimrehurek.com/gensim/install.html


# Nltk has many extra functionalities that can be downloaded

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('reuters')
nltk.download('wordnet')
nltk.download('words')
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('maxent_ne_chunker')
nltk.download('names')

!apt-get install unzip
!unzip /root/nltk_data/corpora/reuters.zip -d /root/nltk_data/corpora
#!unzip /root/nltk_data/corpora/stopwords.zip -d /root/nltk_data/corpora
#from nltk.corpus import reuters



[nltk_data] Downloading package punkt to /home/kky8822/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kky8822/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package reuters to /home/kky8822/nltk_data...
[nltk_data] Downloading package wordnet to /home/kky8822/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package words to /home/kky8822/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package brown to /home/kky8822/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kky8822/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /home/kky8822/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[n

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?
unzip:  cannot find or open /root/nltk_data/corpora/reuters.zip, /root/nltk_data/corpora/reuters.zip.zip or /root/nltk_data/corpora/reuters.zip.ZIP.


# 1.2 **Text Analysis Using nltk:** *Extracting interesting data from a given text*

In [None]:
from nltk.tokenize import word_tokenize
from nltk.text import Text

In [None]:
my_string = "Two plus two is four, minus one that's three — quick maths. Every day man's on the block. Smoke trees. See your girl in the park, that girl is an uckers. When the thing went quack quack quack, your men were ducking! Hold tight Asznee, my brother. He's got a pumpy. Hold tight my man, my guy. He's got a frisbee. I trap, trap, trap on the phone. Moving that cornflakes, rice crispies. Hold tight my girl Whitney."
tokens = word_tokenize(my_string)
tokens = [word.lower() for word in tokens]
tokens[:5]

In [None]:
t = Text(tokens)
t

This method of converting raw strings to NLTK `Text` instances can be used when reading text from a file. For instance:
```python
f = open('my-file.txt','rU') # Opening a file with the mode 'U' or 'rU' will open a file for reading in universal newline mode. All three line ending conventions will be translated to a "\n"
raw = f.read()
```

In [None]:
# concordance() is a method of the Text class of NLTK. It finds words and displays a context window. Word matching is not case-sensitive.
t.concordance('uckers') 
# concordance() is defined as follows: concordance(self, word, width=79, lines=25). Note default values for optional params.

In [None]:
# Collocations are expressions of multiple words which commonly co-occur.
t.collocations() # def collocations(self, num=20, window_size=2). num is the max no. of collocations to print.

In [None]:
t.count('quack')

In [None]:
t.index('two')

In [None]:
t.similar('brother') # similar(self, word, num=20). Distributional similarity: find other words which appear in the same contexts as the specified word; list most similar words first.

In [None]:
t.dispersion_plot(['man', 'thing', 'quack']) # Reveals patterns in word positions. Each stripe represents an instance of a word, and each row represents the entire text.

In [None]:
t.plot(20) # plots 20 most common tokens

In [None]:
t.vocab()

Another thing that might be useful in analysis is finding common contexts. Our text is too small so we will use a bigger one.

NLTK comes with several interesting **corpora**, which are large collections of text. You can check out what kinds of corpora are found in `nltk.corpus` in Section 1 [here](http://www.nltk.org/book/ch02.html).

`reuters` is a corpus of news documents. More specifically, `reuters` is a *corpus reader* for the Reuters corpus which provides us with methods to access the corpus:

In [None]:
from nltk.corpus import reuters
text = Text(reuters.words()) # .words() is one method corpus readers provide for reading data from a corpus. We will learn more about these methods in Chapter 2.
text.common_contexts(['August', 'June']) # It seems that .common_contexts() takes 2 words which are used similarly and displays where they are used similarly. It also seems that '_' indicates where the words would be in the text.

We will further explore the Reuters corpus as well as several others in later chapters.

# 1.3 Hands-on: Try NLTK functions on Shakespeare's poems

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O shakespeare.txt
shakespeare = open("shakespeare.txt").readlines() 



In [None]:
len(shakespeare)

In [None]:
for line in shakespeare[:50]:
  print(line)

In [None]:
small_shakespeare = shakespeare[500:2000]

In [None]:
# This is a list of sentences
small_shakespeare[:5] 

In [None]:
# We want to get the tokens
small_shakespeare_tokens = []

# Give it a try:
small_shakespeare_tokens = word_tokenize(' '.join(small_shakespeare))

In [None]:
t = Text(small_shakespeare_tokens)
small_shakespeare_tokens[:5]

In [None]:
t.collocations()

# 1.4 More NLTK: Advanced Functions

These are from the [Official NLTK Documentation](https://www.nltk.org/howto).

## Collocations advanced

In [None]:
#import nltk
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
nltk.download('genesis')

In [None]:
finder = BigramCollocationFinder.from_words(nltk.corpus.genesis.words('english-web.txt'))

In [None]:
# https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf
# Mutual information (MI) is a measure of the information overlap between two random variables
# Pointwise mutual information (PMI, 5) is a measure of 
# how much the actual probability of a particular co-occurrence of events p(x, y) 
# differs from what wewould expect it to be on the basis of the probabilities of the individual events and the assumption of independence p(x)p(y)

In [None]:
# top ten bigram collocations in Genesis
finder.nbest(bigram_measures.pmi, 10)

In [None]:
# While these words are highly collocated, the expressions are also very infrequent. 
# Therefore it is useful to apply filters, such as ignoring all bigrams which occur less than three times in the corpus:
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

In [None]:
# We may similarly find collocations among tagged words:
nltk.download('universal_tagset')
finder = BigramCollocationFinder.from_words(nltk.corpus.brown.tagged_words('ca01', tagset='universal'))
finder.nbest(bigram_measures.pmi, 5)

In [None]:
# Or tags alone:
finder = BigramCollocationFinder.from_words(t for w, t in nltk.corpus.brown.tagged_words('ca01', tagset='universal'))
finder.nbest(bigram_measures.pmi, 10)

In [None]:
# Or spanning intervening words:
finder = BigramCollocationFinder.from_words(
    nltk.corpus.genesis.words('english-web.txt'),
     window_size = 20)
finder.apply_freq_filter(2)
ignored_words = nltk.corpus.stopwords.words('english')
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
finder.nbest(bigram_measures.likelihood_ratio, 10)

## TFIDF

In [None]:
!wget https://raw.githubusercontent.com/justmarkham/pycon-2019-tutorial/master/ted.csv

In [None]:
import pandas as pd
df = pd.read_csv('./ted.csv')
df.head()

In [None]:
ted = df['description']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ted)

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)

In [None]:
print("Token's used as Features ")
print(vectorizer.get_feature_names(),"\n")
print("Size of the array")
print(tfidf_matrix.shape,"\n")


In [None]:
ted.iloc[0]


In [None]:
vec1 = vectorizer.transform([ted.iloc[0]])

In [None]:
all_vecs = vectorizer.transform(ted)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
all_scores = [cosine_similarity(vec1, vec) for vec in all_vecs ]

In [None]:
all_scores[0].shape

In [None]:
all_scores_list = [item[0][0] for item in all_scores]
print(all_scores_list)

In [None]:
import numpy as np
ranked = np.argsort(all_scores_list)
print(ranked[-5:])

In [None]:
for idx in ranked[-5:]:
  print(ted.iloc[idx])

In [None]:
# Try different queries or datasets!

# 1.5 Brief Detour: Recent research on Zipf's distribution

[A recent research paper on Zipf's distribution](https://docs.google.com/presentation/d/1HL-TfYil5u6lzoCV0YMJOEggrKuZJdqa68EgvrAKcPE/edit?usp=sharing)

# 1.5 Classifying News Documents into Categories

Based on *Another Excercise: Classifying News Documents in Categories: sport, humor, adventure, science fiction, etc...* in [Natural Language Processing with Python/NLTK by Luciano M. Guasco](https://github.com/luchux/ipython-notebook-nltk/blob/master/NLP%20-%20MelbDjango.ipynb)

## 1. Exploring the `brown` corpus

The Corpus consists of 500 samples, distributed across 15 genres. Each sample began at a random sentence-boundary in the article or other unit chosen, and continued up to the first sentence boundary after 2,000 words.

- **A.** PRESS: Reportage *(44 texts)*
- **B.** PRESS: Editorial *(27 texts)*
- **C.** PRESS: Reviews *(17 texts)*
- **D.** RELIGION *(17 texts)*
- **E.** SKILL AND HOBBIES *(36 texts)*
- **F.** POPULAR LORE *(48 texts)*
- **G.** BELLES-LETTRES - Biography, Memoirs, etc. *(75 texts)*
- **H.** MISCELLANEOUS: US Government & House Organs *(30 texts)*
- **J.** LEARNED - Natural sciences, Medicine, Mathematics, etc. *(80 texts)*
- **K.** FICTION: General *(29 texts)*
- **L.** FICTION: Mystery and Detective Fiction *(24 texts)*
- **M.** FICTION: Science *(6 texts)*
- **N.** FICTION: Adventure and Western *(29 texts)*
- **P.** FICTION: Romance and Love Story *(29 texts)*
- **R.** HUMOR *(9 texts)*

In [None]:
from nltk.corpus import brown
import nltk
nltk.download('brown')
nltk.download('stopwords')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
brown.readme().replace('\n', ' ')

'BROWN CORPUS  A Standard Corpus of Present-Day Edited American English, for use with Digital Computers.  by W. N. Francis and H. Kucera (1964) Department of Linguistics, Brown University Providence, Rhode Island, USA  Revised 1971, Revised and Amplified 1979  http://www.hit.uib.no/icame/brown/bcm.html  Distributed with the permission of the copyright holder, redistribution permitted. '

In [None]:
brown.fileids()

['ca01',
 'ca02',
 'ca03',
 'ca04',
 'ca05',
 'ca06',
 'ca07',
 'ca08',
 'ca09',
 'ca10',
 'ca11',
 'ca12',
 'ca13',
 'ca14',
 'ca15',
 'ca16',
 'ca17',
 'ca18',
 'ca19',
 'ca20',
 'ca21',
 'ca22',
 'ca23',
 'ca24',
 'ca25',
 'ca26',
 'ca27',
 'ca28',
 'ca29',
 'ca30',
 'ca31',
 'ca32',
 'ca33',
 'ca34',
 'ca35',
 'ca36',
 'ca37',
 'ca38',
 'ca39',
 'ca40',
 'ca41',
 'ca42',
 'ca43',
 'ca44',
 'cb01',
 'cb02',
 'cb03',
 'cb04',
 'cb05',
 'cb06',
 'cb07',
 'cb08',
 'cb09',
 'cb10',
 'cb11',
 'cb12',
 'cb13',
 'cb14',
 'cb15',
 'cb16',
 'cb17',
 'cb18',
 'cb19',
 'cb20',
 'cb21',
 'cb22',
 'cb23',
 'cb24',
 'cb25',
 'cb26',
 'cb27',
 'cc01',
 'cc02',
 'cc03',
 'cc04',
 'cc05',
 'cc06',
 'cc07',
 'cc08',
 'cc09',
 'cc10',
 'cc11',
 'cc12',
 'cc13',
 'cc14',
 'cc15',
 'cc16',
 'cc17',
 'cd01',
 'cd02',
 'cd03',
 'cd04',
 'cd05',
 'cd06',
 'cd07',
 'cd08',
 'cd09',
 'cd10',
 'cd11',
 'cd12',
 'cd13',
 'cd14',
 'cd15',
 'cd16',
 'cd17',
 'ce01',
 'ce02',
 'ce03',
 'ce04',
 'ce05',
 'ce06',
 

In [None]:
brown.categories()

In [None]:
brown.sents('ca01')[0]

In [None]:
cfd = nltk.ConditionalFreqDist(
           (genre, word)
           for genre in brown.categories()
           for word in brown.words(categories=genre))

In [None]:
genre_word = [(genre, word) 
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)] 
len(genre_word)

In [None]:
 genre_word[:10]

In [None]:
genre_word[-10:]

In [None]:
cfd = nltk.ConditionalFreqDist(genre_word)

In [None]:
cfd.conditions()

In [None]:
# Let's access the two conditions, and confirm that each is just a frequency distribution:
print(cfd['news'])

In [None]:
print(cfd['romance'])

In [None]:
cfd['romance'].most_common(20)

In [None]:
cfd['romance']['could']

## 2. Compiling list of most popular words in corpus

In [None]:
from nltk import FreqDist # Takes a bunch of tokens and returns the frequencies of all unique cases.
words_in_corpora = FreqDist(w.lower() for w in brown.words() if w.isalpha()) # Checking if the word is alphabetical avoids including stuff like `` and '' which are actually pretty common. Note that it also omits words such as 1 (very common), aug., 1913, $30, 13th, over-all etc. Another option would have been .isalnum().
words_in_corpora

In [None]:
words_in_corpora_freq_sorted = list(map(list, words_in_corpora.items())) # I use this instead of sorted() because I want to sort my dictionary into a (mutable) list in order to delete the second column as opposed to into a tuple (immutable).
words_in_corpora_freq_sorted

In [None]:
words_in_corpora_freq_sorted.sort(key=lambda x: x[1], reverse=True) # Using a lambda function is an alternative to using the operator library.
words_in_corpora_freq_sorted

In [None]:
best1500 = words_in_corpora_freq_sorted[:1500]

for list_item in best1500:
    del list_item[1]

best1500

In [None]:
# Since best1500 is now a list of words, it should be flattened.
import itertools
chain = itertools.chain(*best1500) # We break down the list into its individual sublists and then chain them. What chain does is that it further breaks down each sublist into its individual components so this approach can be used to flatten any list of lists.
best1500 = list(chain) # chain is of type itertools.chain so we need the cast
best1500

In [None]:
from nltk.corpus import stopwords

stopw = stopwords.words('english')

# Receives a list of words and removes stop words from list
def nonstop(listwords):
    return [word for word in listwords if word not in stopw]

best1500_words_corpora = nonstop(best1500) # Note how this will probably contain less than 1500 words.
best1500_words_corpora

## 3. Converting corpus to form suitable for classification

Each file in the corpus will eventually be represented by a dictionary showing the presence of the corpus’ most popular words in the particular file.

In [None]:
# documents = [(nonstop(brown.words(fileid)), category) for category in brown.categories() for fileid in brown.fileids(category)]
# documents # Note how documents is a list of tuples.

# The code above generates a representation of the corpus but without removing punctuation. This is better:
documents = [([item.lower() for item in nonstop(brown.words(fileid)) if item.isalpha()], category)
             for category in brown.categories()
             for fileid in brown.fileids(category)]
documents # Note how documents is a list of tuples.

In [None]:
from random import shuffle

shuffle(documents)
documents

In [None]:
# Given a document extract features (the presence or not of the 1500 most frequent words of the corpus)
def document_features(doc):
    doc_set_words = set(doc) # Checking whether a word occurs in a set is much faster than checking whether it occurs in a list.
    features_dic = {} # Features is a dictionary
    for word in best1500_words_corpora:
        features_dic['has(%s)' % word] = (word in doc_set_words)
    return features_dic

doc_features_set = [(document_features(d),c) for (d,c) in documents]
doc_features_set

## 4. Building classifier (Naive Bayes)


*   Bayes 사용해서 P(x|c)와 P(c) 만으로 P(c|x)를 계산

*   e.g. P( viagra,free | **Spam** ) 로부터 -> P( **Spam** | viagra,free )

*   *Likelihood*: Spam이메일일 경우, viagra,free 단어가 있을 확률

*   Maximum aposteriori estimation (MAP): 간단히 Posterior probability를 maximize하는 class를 선택 (inference시에)



<img src="https://blog.tenthplanet.in/wp-content/uploads/2019/01/6.png" alt="seq2seq" style="width: 60%"/>

*   **"Naive"** assumption -> P( viagra,free | **Spam** ) 
      = P( viagra | **Spam** ) * P( free | **Spam** )

In [None]:
from nltk import NaiveBayesClassifier
import random

random.seed(42)
random.shuffle(doc_features_set)

train_set = doc_features_set[:350] # Since the total is 500
test_set  = doc_features_set[150:]

classifier = NaiveBayesClassifier.train(train_set)


In [None]:
# Most informative features calculated as:
#  p(has(word)| class1 ) / p(has(word)| class2)
classifier.show_most_informative_features(15)

## 5. Testing classifier

In [None]:
from nltk.classify import accuracy

# Test on the entire test set
print(accuracy(classifier, test_set))

In [None]:
# 'ca01' is under the 'news' category
classifier.classify(document_features(brown.words('ca01')))

In [None]:
from nltk.tokenize import RegexpTokenizer

# The test text needs to be long enough in order to contain a significant amount of the 1500 most common words in our training corpus.
text = "1 God, infinitely perfect and blessed in himself, in a plan of sheer goodness freely created man to make him share in his own blessed life. For this reason, at every time and in every place, God draws close to man. He calls man to seek him, to know him, to love him with all his strength. He calls together all men, scattered and divided by sin, into the unity of his family, the Church. To accomplish this, when the fullness of time had come, God sent his Son as Redeemer and Saviour. In his Son and through him, he invites men to become, in the Holy Spirit, his adopted children and thus heirs of his blessed life. 2 So that this call should resound throughout the world, Christ sent forth the apostles he had chosen, commissioning them to proclaim the gospel: \"Go therefore and make disciples of all nations, baptizing them in the name of the Father and of the Son and of the Holy Spirit, teaching them to observe all that I have commanded you; and lo, I am with you always, to the close of the age.\"4 Strengthened by this mission, the apostles \"went forth and preached everywhere, while the Lord worked with them and confirmed the message by the signs that attended it.\" 3 Those who with God's help have welcomed Christ's call and freely responded to it are urged on by love of Christ to proclaim the Good News everywhere in the world. This treasure, received from the apostles, has been faithfully guarded by their successors. All Christ's faithful are called to hand it on from generation to generation, by professing the faith, by living it in fraternal sharing, and by celebrating it in liturgy and prayer. 4 Quite early on, the name catechesis was given to the totality of the Church's efforts to make disciples, to help men believe that Jesus is the Son of God so that believing they might have life in his name, and to educate and instruct them in this life, thus building up the body of Christ. Catechesis is an education in the faith of children, young people and adults which includes especially the teaching of Christian doctrine imparted, generally speaking, in an organic and systematic way, with a view to initiating the hearers into the fullness of Christian life. While not being formally identified with them, catechesis is built on a certain number of elements of the Church's pastoral mission which have a catechetical aspect, that prepare for catechesis, or spring from it. They are: the initial proclamation of the Gospel or missionary preaching to arouse faith; examination of the reasons for belief; experience of Christian living; celebration of the sacraments; integration into the ecclesial community; and apostolic and missionary witness. Catechesis is intimately bound up with the whole of the Church's life. Not only her geographical extension and numerical increase, but even more her inner growth and correspondence with God's plan depend essentially on catechesis. Periods of renewal in the Church are also intense moments of catechesis. In the great era of the Fathers of the Church, saintly bishops devoted an important part of their ministry to catechesis. St. Cyril of Jerusalem and St. John Chrysostom, St. Ambrose and St. Augustine, and many other Fathers wrote catechetical works that remain models for us. The ministry of catechesis draws ever fresh energy from the councils. the Council of Trent is a noteworthy example of this. It gave catechesis priority in its constitutions and decrees. It lies at the origin of the Roman Catechism, which is also known by the name of that council and which is a work of the first rank as a summary of Christian teaching. The Council of Trent initiated a remarkable organization of the Church's catechesis. Thanks to the work of holy bishops and theologians such as St. Peter Canisius, St. Charles Borromeo, St. Turibius of Mongrovejo or St. Robert Bellarmine, it occasioned the publication of numerous catechisms. It is therefore no surprise that catechesis in the Church has again attracted attention in the wake of the Second Vatican Council, which Pope Paul Vl considered the great catechism of modern times. the General Catechetical Directory (1971) the sessions of the Synod of Bishops devoted to evangelization (1974) and catechesis (1977), the apostolic exhortations Evangelii nuntiandi (1975) and Catechesi tradendae (1979), attest to this. the Extraordinary Synod of Bishops in 1985 asked that a catechism or compendium of all Catholic doctrine regarding both faith and morals be composed. The Holy Father, Pope John Paul II, made the Synod's wish his own, acknowledging that this desire wholly corresponds to a real need of the universal Church and of the particular Churches. He set in motion everything needed to carry out the Synod Fathers' wish."
text = "Wang entered the address for the game into the browser. It had been easy to memorize: www.3body.net. The site indicated that the game only supported access via V-suit. Wang remembered that the employee lounge at the Nanotechnology Research Center had a V-suit. He left the now-empty main lab and went to the security office to get the key. In the lounge, he passed the pool tables and the exercise machines and found the V-suit next to a computer. He struggled into the haptic feedback suit, put on the panoramic viewing helmet, and turned on the computer."

tokenizer = RegexpTokenizer(r'\w+') # Picks out sequences of alphanumeric characters as tokens and drops everything else
text_tokens = nonstop(tokenizer.tokenize(text.lower()))
text_tokens = [w for w in text_tokens if w.isalpha()]
text_tokens

In [None]:
text_features = document_features(text_tokens)
text_features

In [None]:
classifier.classify(document_features(text_tokens))

In [None]:
# A. PRESS: Reportage (44 texts)
# B. PRESS: Editorial (27 texts)
# C. PRESS: Reviews (17 texts)
# D. RELIGION (17 texts)
# E. SKILL AND HOBBIES (36 texts)
# F. POPULAR LORE (48 texts)
# G. BELLES-LETTRES - Biography, Memoirs, etc. (75 texts)
# H. MISCELLANEOUS: US Government & House Organs (30 texts)
# J. LEARNED - Natural sciences, Medicine, Mathematics, etc. (80 texts)
# K. FICTION: General (29 texts)
# L. FICTION: Mystery and Detective Fiction (24 texts)
# M. FICTION: Science (6 texts)
# N. FICTION: Adventure and Western (29 texts)
# P. FICTION: Romance and Love Story (29 texts)
# R. HUMOR (9 texts)

# 1.6 **Bonus: Some more corpora in NLTK**

In [None]:
from nltk.corpus import inaugural
nltk.download('inaugural')
inaugural.fileids()


In [None]:
[fileid[:4] for fileid in inaugural.fileids()]


In [None]:
cfd = nltk.ConditionalFreqDist(
           (target, fileid[:4])
           for fileid in inaugural.fileids()
           for w in inaugural.words(fileid)
           for target in ['america', 'citizen']
           if w.lower().startswith(target)) [1]
cfd.plot()

<img src="https://www.nltk.org/images/inaugural2.png" alt="seq2seq" style="width: 60%"/>

In [None]:
nltk.download('cess_esp')
# corpuses in other languages
nltk.corpus.cess_esp.words()

In [None]:
nltk.download('floresta')
nltk.corpus.floresta.words()

In [None]:
nltk.download('indian')
nltk.corpus.indian.words('hindi.pos')

In [None]:
nltk.download('udhr')
nltk.corpus.udhr.fileids()

In [None]:
nltk.download('udhr')
nltk.corpus.udhr.words('Javanese-Latin1')[11:]

# 2.1 **Deriving N-Grams from Text:** *Creating n-grams (for language classification)*

Based on [N-Gram-Based Text Categorization: Categorizing Text With Python by Alejandro Nolla](http://blog.alejandronolla.com/2013/05/20/n-gram-based-text-categorization-categorizing-text-with-python/)

What are n-grams? See [here](http://cloudmark.github.io/Language-Detection/).

## 1. Tokenization

In [None]:
s = "Le temps est un grand maître, dit-on, le malheur est qu'il tue ses élèves."
s = s.lower()

In [None]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[a-zA-Z'`éèî]+")
s_tokenized = tokenizer.tokenize(s)
s_tokenized

In [None]:
from nltk.util import ngrams
generated_4grams = []

for word in s_tokenized:
    generated_4grams.append(list(ngrams(word, 4, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_'))) # n = 4.
generated_4grams

It seems that `generated_4grams` needs flattening since it's supposed to be a list of 4-grams:

In [None]:
generated_4grams = [word for sublist in generated_4grams for word in sublist]
generated_4grams[:10]

## 2. Obtaining n-grams (n = 4)

In [None]:
ng_list_4grams = generated_4grams
for idx, val in enumerate(generated_4grams):
    ng_list_4grams[idx] = ''.join(val)
ng_list_4grams

## 3. Sorting n-grams by frequency (n = 4)

In [None]:
freq_4grams = {}

for ngram in ng_list_4grams:
    if ngram not in freq_4grams:
        freq_4grams.update({ngram: 1})
    else:
        ngram_occurrences = freq_4grams[ngram]
        freq_4grams.update({ngram: ngram_occurrences + 1})
        
from operator import itemgetter # The operator module exports a set of efficient functions corresponding to the intrinsic operators of Python. For example, operator.add(x, y) is equivalent to the expression x + y.

freq_4grams_sorted = sorted(freq_4grams.items(), key=itemgetter(1), reverse=True)[0:300] # We only keep the 300 most popular n-grams. This was suggested in the original paper written about n-grams.
freq_4grams_sorted

## 4. Obtaining n-grams for multiple values of n

To get n-grams for n = 1, 2, 3 and 4 we can use:

In [None]:
from nltk import everygrams

s_clean = ' '.join(s_tokenized) # For the code below we need the raw sentence as opposed to the tokens.
s_clean

In [None]:
def ngram_extractor(sent):
    return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
            if ' ' not in ng and '\n' not in ng and ng != ('_',)]

ngram_extractor(s_clean)

# 2.2 **Detecting Text Language by Counting Stop Words:** *A simple way to find out what language a text is written in*

Based on [Detecting Text Language With Python and NLTK by Alejandro Nolla](http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/)

*Stop words* are words which are filtered out before processing because they are mostly grammatical as opposed to semantic in nature e.g. search engines remove words like 'want'.

## 1. Tokenizing

In [None]:
text = "Yo man, it's time for you to shut yo' mouth! I ain't even messin' dawg."

In [None]:
import sys

try:
    from nltk.tokenize import wordpunct_tokenize # RE-based tokenizer which splits text on whitespace and punctuation (except for underscore)
except ImportError:
    print('[!] You need to install nltk (http://nltk.org/index.html)')

In [None]:
test_tokens = wordpunct_tokenize(text)
test_tokens

There are other tokenizers e.g. `RegexpTokenizer` where you can enter your own regexp, `WhitespaceTokenizer` (similar to Python's `string.split()`) and `BlanklineTokenizer`.

## 2. Exploring NLTK's stop words corpus

NLTK comes with a corpus of stop words in various languages.

In [None]:
from nltk.corpus import stopwords
stopwords.readme().replace('\n', ' ') # Since this is raw text, we need to replace \n's with spaces for it to be readable.

In [None]:
stopwords.fileids() # Most corpora consist of a set of files, each containing a piece of text. A list of identifiers for these files is accessed via fileids().

Corpus readers provide a variety of methods to read data from the corpus:

In [None]:
stopwords.raw('greek')

In [None]:
stopwords.raw('greek').replace('\n', ' ') # Better

In [None]:
stopwords.words('english')[:10]

We can also use `.sents()` which returns sentences. However, in our particular case, this will cause an error:

In [None]:
#stopwords.sents('greek')

The erro is because the `stopwords` corpus reader is of type `WordListCorpusReader` so there are no sentences.
It's the same for `.paras()`.

In [None]:
len(stopwords.words(['english', 'greek'])) # There is a total of 444 Greek and English stop words

## 3. The classification

We loop through the list of stop words in all languages and check how many stop words our test text contains in each language. The text is then classified to be in the language in which it has the most stop words.

In [None]:
language_ratios = {}

test_words = [word.lower() for word in test_tokens] # lowercase all tokens
test_words_set = set(test_words)

for language in stopwords.fileids():
    stopwords_set = set(stopwords.words(language)) # For some languages eg. Russian, it would be a wise idea to tokenize the stop words by punctuation too.
    common_elements = test_words_set.intersection(stopwords_set)
    language_ratios[language] = len(common_elements) # language "score"
    
language_ratios

In [None]:
most_rated_language = max(language_ratios, key=language_ratios.get) # The key parameter to the max() function is a function that computes a key. In our case, we already have a key so we set key to languages_ratios.get which actually returns the key.
most_rated_language

In [None]:
test_words_set.intersection(set(stopwords.words(most_rated_language))) # We can see which English stop words were found.

## 4. Plotting and Tabulating Distributions

In [None]:
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
          (lang, len(word))
           for lang in languages
           for word in udhr.words(lang + '-Latin1'))

In [None]:
cfd.tabulate(conditions=['English', 'German_Deutsch'],  samples=range(10), cumulative=True)

# 2.3 **Language Identifier Using Word Bigrams:** *State-of-the-art language classifier*

Based on [Language Identifier by asif31iqbal](https://github.com/asif31iqbal/language-identifier)

## 0. Importing libraries and creating helper tokenize method

In [None]:
%cd /content
!git clone https://github.com/asif31iqbal/language-identifier


In [None]:
import pickle
import string
import os
from nltk import ngrams, FreqDist, word_tokenize
from numpy import arange
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def ultimate_tokenize(sentence):
    # Remove punctuation and digits
    sentence = sentence.translate(str.maketrans('', '', string.punctuation + string.digits))
    return word_tokenize(sentence.lower())

## 1. Understanding the process

In [None]:
simple_example_text = 'Oh, then, I see Queen Mab hath been with you.'

simple_example_tokens_words = ultimate_tokenize(simple_example_text)
simple_example_tokens_words

In [None]:
simple_example_tokens_chars = list(simple_example_tokens_words[0])
simple_example_tokens_chars

In [None]:
simple_example_tokens_words_unigrams = list(ngrams(simple_example_tokens_words, 1))
simple_example_tokens_words_unigrams

In [None]:
simple_example_tokens_words_bigrams = list(ngrams(simple_example_tokens_words, 2, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_'))
simple_example_tokens_words_bigrams

In [None]:
fdist = FreqDist(simple_example_tokens_words_unigrams)
fdist

In [None]:
unigram_dict = dict()
for k, v in fdist.items():
        unigram_dict[' '.join(k)] = v
unigram_dict

In [None]:
file = '/content/language-identifier/hm3_files/LangId.train.English'
with open(file, encoding='utf8') as f:
        content = f.read().lower()
content.replace('\n', '')[:100]

In [None]:
with open('/content/language-identifier/hm3_files/English.unigram.pickle', 'rb') as handle:
    unigram_english_dict = pickle.load(handle)
unigram_english_dict

In [None]:
with open('/content/language-identifier/hm3_files/English.bigram.pickle', 'rb') as handle:
    bigram_english_dict = pickle.load(handle)
bigram_english_dict

In [None]:
bigram_english_dict.get('of the')

In [None]:
import operator
english_unigram_freqs = sorted(unigram_english_dict.items(), key=operator.itemgetter(1), reverse=True)
english_unigram_freqs[:10]

In [None]:
labels, values = zip(*english_unigram_freqs[:10])
indexes = arange(len(labels))
width = 0.8 # width = 1 would give bars that overlap because they are too close.

fig = plt.figure(figsize=(10,7))                                                               
ax = fig.gca() # Get current axis
rects = ax.bar(indexes, values, width)

# Add title and axis labels
fig.suptitle('Top 10 English word unigrams', fontsize=20)
plt.xlabel('Word unigram', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# Display value of each bar on bar
for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., 50 + height, '%d' % int(height), ha='center', va='bottom') # Can also add color and fontweight arguments.

# Remove the default x-axis tick numbers and use tick numbers of your own choosing:
ax.set_xticks(indexes)
# Replace the tick numbers with strings:
ax.set_xticklabels(labels)

plt.show()
# plt.savefig('top10EnglishWordUnigrams.png')

## 1. Generating unigram and bigram frequencies for English, French and Italian from training files

In [None]:
def get_ngram_count_dict(tokens, n):
    if n == 1:
        n_grams = ngrams(tokens, n)
    else:
        n_grams = ngrams(tokens, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_') # Fun fact: If I remove padding here and later when testing, and also remove the '_' from the unigram dicts, the accuracy rises slightly. However, it's not statistically significant due to the small size of the data.
    fdist = FreqDist(n_grams)
    ngram_dict = dict()
    for k,v in fdist.items():
        ngram_dict[' '.join(k)] = v
    return ngram_dict

# Calls get_ngram_count_dict to get a unigram and bigram dict from file.
def get_unigram_bigram_dicts(file):
    with open(file, encoding='utf8') as f:
        content = f.read()
    tokens = ultimate_tokenize(content)
    unigram_dict = get_ngram_count_dict(tokens, 1)     
    bigram_dict = get_ngram_count_dict(tokens, 2)     
    return (unigram_dict, bigram_dict)

# Dumps unigram and bigram dictionary of training data of given language to .pickle files.
def dump_pickle(language):
    file = '/content/language-identifier/hm3_files/LangId.train.' + language #+ '.txt'
    unigram_dict, bigram_dict = get_unigram_bigram_dicts(file)
    with open('/content/language-identifier/hm3_files/' + language + '.unigram.pickle', 'wb') as handle:
        pickle.dump(unigram_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) # HIGHEST_PROTOCOL instructs pickle to use the highest protocol version available.
    with open('/content/language-identifier/hm3_files/' + language + '.bigram.pickle', 'wb') as handle:
        pickle.dump(bigram_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
dump_pickle('English')
dump_pickle('French')
dump_pickle('Italian')

Later, it will also be required to know how many sentences there are in the training data for each language. This is because of the method used to calculate probabilities (incorporating the probability of the bigram among other bigrams starting with the same word) and the fact we use padding for our bigrams. 

In our training data each line is a sentence, which is very convenient for calculating the number of sentences.

We go ahead and get the number of sentences (for more efficiency, the following code could be added to `get_unigram_bigram_dicts`):

In [None]:
with open('/content/language-identifier/hm3_files/LangId.train.English', encoding='utf8') as f:
    for i, l in enumerate(f):
        pass
number_of_sents_en = i + 1
with open('/content/language-identifier/hm3_files/LangId.train.French', encoding='utf8') as f:
    for i, l in enumerate(f):
        pass
number_of_sents_fr = i + 1
with open('/content/language-identifier/hm3_files/LangId.train.Italian', encoding='utf8') as f:
    for i, l in enumerate(f):
        pass
number_of_sents_it = i + 1

print('NUMBER OF SENTENCES IN TRAINING DATA')
print('English:', number_of_sents_en)
print('French:', number_of_sents_fr)
print('Italian:', number_of_sents_it)

## 2. Identifying language for each line of the test file using bigram probabilities

In [None]:
with open('/content/language-identifier/hm3_files/English.unigram.pickle', 'rb') as handle:
    unigram_english_dict = pickle.load(handle)
    
with open('/content/language-identifier/hm3_files/English.bigram.pickle', 'rb') as handle:
    bigram_english_dict = pickle.load(handle)
    
with open('/content/language-identifier/hm3_files/French.unigram.pickle', 'rb') as handle:
    unigram_french_dict = pickle.load(handle)
    
with open('/content/language-identifier/hm3_files/French.bigram.pickle', 'rb') as handle:
    bigram_french_dict = pickle.load(handle)
    
with open('/content/language-identifier/hm3_files/Italian.unigram.pickle', 'rb') as handle:
    unigram_italian_dict = pickle.load(handle)
    
with open('/content/language-identifier/hm3_files/Italian.bigram.pickle', 'rb') as handle:
    bigram_italian_dict = pickle.load(handle)
    
vocabulary_size = len(unigram_english_dict) + len(unigram_french_dict) + len(unigram_italian_dict)
vocabulary_size

In [None]:
# Get probability of given bigram belonging to the language which bigram_dict is in
def get_bigram_probability(bigram, first_word, bigram_dict, first_word_dict): # first_word is the first word of the word bigram.
    bigram_count = bigram_dict.get(bigram)
    if bigram_count is None:
        bigram_count = 0
    
    first_word_count = first_word_dict.get(first_word)
    if first_word_count is None:
        first_word_count = 0
    
    return (bigram_count + 1) / (first_word_count + vocabulary_size) # To get the logic of this formula, note how the proability is used in the function below. Without the + 1 in the Nr, if you find a bigram which is not in our known bigrams for a language, the probability of it being in that language would become 0. So we would like to assign a small probability of 1 / vocabulary_size in that case. Also note the arbitrariness of this 'probability'. We're saying "Given a bigram and a language, what is the probability that the bigram is of that language?" This is arbitrary because to get a meaningful probability we need to know which are the other languages considered and what their bigram frequencies are. That would be another way to do it, but arguable a worse one because it wouldn't be able to give a confidence score for a particular language. The formula just uses common sense to get to a number which works for the purposes. In the denominator, we have both first_word_count and vocabulary_size. Why? We have vocabulary_size for all langs in the denom because the larger this is, the less significant it is that for this particular language the bigram appears so many times. Could we have used a vocab_size of bigrams instead of unigrams? Sure, and the 'probabilities' would end up being much smaller numbers. What about first_word_count? This gives us a way to compare this bigram against other bigrams in this language starting with the same word. In general though, for a given bigram, it's more important to consider how many times it exists than to consider whether it is the usual bigram given a certain first word. The formula achieves that. Take the bigram 'le monseiur' and the English language. Let's say the bigram appears once and 'le' also appears once, while in French 'le monseiur' appears 100 times and le appears 100,000 times. Probability for English = (1 + 1) / (1 + 20,000) = 0.000099995. Probability for French = (100 + 1) / (100,000 + 20,000) = 0.00084166666. Note how the probability for French is still low because 100/100,000 is quite low and maybe it's not French after all if in French le is usually followed by other words. However, it's still significantly higher than the probability for English where both 'le' and 'le monseiur' only appear once.

# Get probability that a given bigram list is of a language (specified by its bigram_dict)
def get_language_probability(bigram_list, first_words, bigram_dict, first_word_dict):
    result = 1.0
    index = 0
    for bigram in bigram_list:
        result *= get_bigram_probability(bigram, first_words[index], bigram_dict, first_word_dict)
        index += 1
    return result

# Load correct solutions
solution_dict = dict()
with open('/content/language-identifier/hm3_files/LangId.sol') as f:
    for line in f:
       (key, val) = line.split()
       solution_dict[int(key)] = val
        
line_no = 1
result_dict = dict()
correct = 0
incorrect_line_numbers = []

# This needs to be done because I'm using padding for bigrams so the unigram dicts in their raw forms can't be used in get_bigram_probability():
unigram_english_dict['_'] = number_of_sents_en
unigram_french_dict['_'] = number_of_sents_fr
unigram_italian_dict['_'] = number_of_sents_it

with open('/content/language-identifier/hm3_files/LangId.test', encoding='utf8') as f:
    for line in f:
        tokens = ultimate_tokenize(line)
        bigrams = ngrams(tokens, 2, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')
        bigram_list = [] # bigram_list will be exactly like bigrams but instead of [('_', 'this'), ...] it will be ['_ this', ...]. It is required because this is how bigrams are represented in the dictionary.
        first_words = [] # The first words of each bigram. This is the similar to making a unigram_list. We use it because we don't want something in the form [(this,), ...]. Also because we want this to include '_'. We want it to include '_' because we're not using the unigrams for classification but as part of a formula to judge bigram frequency based on the starting word.
        for b in bigrams:
            bigram_list.append(' '.join(b))
            first_words.append(b[0])
        
        english_prob = get_language_probability(bigram_list, first_words, bigram_english_dict, unigram_english_dict)
        french_prob = get_language_probability(bigram_list, first_words, bigram_french_dict, unigram_french_dict)
        italian_prob = get_language_probability(bigram_list, first_words, bigram_italian_dict, unigram_italian_dict)
        
        max_prob = max(english_prob, french_prob, italian_prob)
        if max_prob == english_prob:
            result_dict[line_no] = 'English'
        elif max_prob == french_prob:
            result_dict[line_no] = 'French'
        else:
            result_dict[line_no] = 'Italian'
        
        if solution_dict[line_no] == result_dict[line_no]:
            correct += 1
        else:
            incorrect_line_numbers.append(line_no)
            
        line_no += 1

# Storing results from result_dict to file:
with open('/content/language-identifier/hm3_files/LangId.result', 'w') as f:
    for (key, val) in result_dict.items():
        f.write(' '.join([str(key), val]) + '\n')
        
print('Accuracy: {:2.2f}%'.format(correct * 100 / len(solution_dict)))

In [None]:
print('Line numbers for incorrectly classified languages: {}'.format(str(incorrect_line_numbers)))

## 3. Testing with our own sentence

In [None]:
sent = "This is a sentence."
sent_tokens = ultimate_tokenize(sent)
sent_bigrams_pre = ngrams(sent_tokens, 2, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')
sent_bigrams = []
sent_bigrams_first_words = []
for b in sent_bigrams_pre:
    sent_bigrams.append(' '.join(b))
    sent_bigrams_first_words.append(b[0])
print('Sentence bigrams:', sent_bigrams)
print('Sentence bigrams first words:', sent_bigrams_first_words)

In [None]:
sent_english_prob = get_language_probability(sent_bigrams, sent_bigrams_first_words, bigram_english_dict, unigram_english_dict)
sent_french_prob = get_language_probability(sent_bigrams, sent_bigrams_first_words, bigram_french_dict, unigram_french_dict)
sent_italian_prob = get_language_probability(sent_bigrams, sent_bigrams_first_words, bigram_italian_dict, unigram_italian_dict)
print("RAW 'PROBABILITIES'")
print('English:', sent_english_prob)
print('French:', sent_french_prob)
print('Italian:', sent_italian_prob)
# As we can see, these 'probabilities' are arbitrary. We can try to convert them to percentages since we are classifying only among these 3 languages:

In [None]:
def get_normalized_probabilities(list_of_probabilities):
    sum_of_probabilities = sum(list_of_probabilities)
    result = []
    for probability in list_of_probabilities:
        result.append(probability / sum_of_probabilities)
    return result

probabilities = [sent_english_prob, sent_french_prob, sent_italian_prob]
normalized_probabilities = get_normalized_probabilities(probabilities)

print('RELATIVE PROBABILITIES')
print('English: ', round(normalized_probabilities[0] * 100, 2), '%', sep='') # I use sep because I don't want a space before the % sign.
print('French: ', round(normalized_probabilities[1] * 100, 2), '%', sep='')
print('Italian: ', round(normalized_probabilities[2] * 100, 2), '%', sep='')

**PS:** For a state-of-the-art Greek dialect classifier using n-grams, take a look at [Greek Dialect Classifier](https://github.com/hb20007/greek-dialect-classifier).

In [None]:
from nltk.corpus import inaugural
nltk.download('inaugural')
inaugural.fileids()


In [None]:
[fileid[:4] for fileid in inaugural.fileids()]


In [None]:
cfd = nltk.ConditionalFreqDist(
           (target, fileid[:4])
           for fileid in inaugural.fileids()
           for w in inaugural.words(fileid)
           for target in ['america', 'citizen']
           if w.lower().startswith(target)) [1]
cfd.plot()

In [None]:
nltk.download('cess_esp')
# corpuses in other languages
nltk.corpus.cess_esp.words()

In [None]:
nltk.download('floresta')
nltk.corpus.floresta.words()

In [None]:
nltk.download('indian')
nltk.corpus.indian.words('hindi.pos')

In [None]:
nltk.download('udhr')
nltk.corpus.udhr.fileids()

In [None]:
nltk.download('udhr')
nltk.corpus.udhr.words('Javanese-Latin1')[11:]

# 3.1 **Bigrams, Stemming and Lemmatizing:** *NLTK makes bigrams, stemming and lemmatization super-easy*

## 1. Exploring the `reuters` corpus

In [None]:
from nltk.corpus import reuters

reuters.readme().replace('\n', ' ')

In [None]:
reuters.fileids()

In [None]:
reuters.fileids()[-1]

In [None]:
len(reuters.fileids())

In [None]:
reuters.fileids('barley')

In [None]:
reuters.fileids(['barley', 'corn'])

In [None]:
reuters.categories()

In [None]:
reuters.categories(['training/9865', 'training/9880'])

In [None]:
reuters.sents('test/14826')

In [None]:
reuters.words(categories='barley')

In [None]:
reuters.words(categories=['barley', 'corn'])

## 2. Bigrams

In [None]:
trade_words = reuters.words(categories='trade')
len(trade_words)

In [None]:
trade_words_condensed = trade_words[:100]
trade_words_condensed

In [None]:
from nltk.corpus import stopwords

# Remove stopwords from trade_words_condensed and lower case it
trade_words_condensed = [w.lower() for w in trade_words_condensed if w.lower() not in stopwords.words('english')]
trade_words_condensed[:10]

In [None]:
import string # Contains string constants eg. ascii_lowercase which is 'a...z', string formatting functions, other string functions like .capwords() and .translate().

# Remove punctuation
# trade_words_condensed = [w for w in trade_words_condensed if w not in string.punctuation]
punct_combo = [c + "\"" for c in string.punctuation ] + ["\"" + c for c in string.punctuation] + [".-", ":-", "..", "..."]
trade_words_condensed = [w for w in trade_words_condensed if w not in string.punctuation and w not in punct_combo]
trade_words_condensed

In [None]:
from nltk import bigrams

bi_trade_words_condensed = list(bigrams(trade_words_condensed))
bi_trade_words_condensed[:5]

In [None]:
from nltk import FreqDist

bi_fdist = FreqDist(bi_trade_words_condensed)

for word, frequency in bi_fdist.most_common(3):
    print(word, frequency)

In [None]:
bi_fdist.plot(3, cumulative=False)

## 3. Stemming

In [None]:
from nltk.stem import (PorterStemmer, LancasterStemmer)
from nltk.stem.snowball import SnowballStemmer # This is "Porter 2" and is considered the optimal stemmer.

porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer("english")

print(porter.stem('Re-testing'), lancaster.stem('Re-testing'), snowball.stem('Re-testing'))

In [None]:
# Fun fact: SnowballStemmer can stem several other languages beside English.
# To make, for instance, a French stemmer, we can do the following: french_stemmer = SnowballStemmer('french')
SnowballStemmer.languages

In [None]:
from nltk import word_tokenize

sentence = "So, we'll go no more a-roving. So late into the night, Though the heart be still as loving, And the moon be still as bright."

# This uses the 3-argument version of str.maketrans with arguments (x, y, z) where 'x' and 'y' must be equal-length strings and characters in 'x' are replaced by characters in 'y'. 'z' is a string (string.punctuation here) where each character in the string is mapped to None
translator = str.maketrans('', '', string.punctuation)
translator

# This is an alternative that creates a dictionary mapping of every character from string.punctuation to None (this will also work but creates a whole dictionary so is slower)
#translator = str.maketrans(dict.fromkeys(string.punctuation))

In [None]:
tokens = word_tokenize(sentence.translate(translator))
tokens[:3]

In [None]:
for stemmer in [porter, lancaster, snowball]:
    print([stemmer.stem(t) for t in tokens])

## 4. Lemmatizing

Lemmatization aims to achieve a similar base "stem" for a word, but aims to derive the genuine dictionary root word, not just a trunctated version of the word.

In [None]:
import nltk
nltk.download('omw-1.4')

In [None]:
# The default lemmatization method with the Python NLTK is the WordNet lemmatizer.
from nltk import WordNetLemmatizer

wnl = WordNetLemmatizer()

print(wnl.lemmatize('brightening'), wnl.lemmatize('boxes'))

In [None]:
# As we saw above, sometimes, if we try to lemmatize a word, it will end up with the same word. This is because the default part of speech is nouns.
wnl.lemmatize('brightening', pos='v')

## 5. Generating Random Text with Bigrams

In [None]:
# We can use a conditional frequency distribution to create a table of bigrams (word pairs).
# The bigrams() function takes a list of words and builds a list of consecutive word pairs. 
# Remember that, in order to see the result and not a cryptic "generator object", we need to use the list() function:

In [None]:
text = nltk.corpus.genesis.words('english-kjv.txt')


In [None]:
text

In [None]:
bigrams = nltk.bigrams(text)


In [None]:
bigrams_disp = list( nltk.bigrams(text))
print(bigrams_disp[:10])

In [None]:
cfd = nltk.ConditionalFreqDist(bigrams)

In [None]:
cfd

In [None]:
nltk.download('genesis')

# constructs a conditional frequency distribution to record which words are most likely to follow a given word
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()



In [None]:
cfd['living']

In [None]:
generate_model(cfd, 'creature')

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O shakespeare.txt
shakespeare = open("shakespeare.txt").readlines()

In [None]:
nltk.download('punkt')
sbigrams = nltk.bigrams(nltk.word_tokenize(' '.join(shakespeare)))

In [None]:
scfd = nltk.ConditionalFreqDist(sbigrams)

In [None]:
scfd['thou']

In [None]:
generate_model(scfd, 'terrible')

In [None]:
"""
Can you make the generator more diverse?
"""

In [None]:
better_generator('thou', scfd, 20)

In [None]:
scfd['?']

# 3.2 **Finding Unusual Words in Given Language:** *Which words do not belong with the rest of the text?*

In [None]:
text = "Truly Kryptic is the best puzzle game. It's browser-based and free. Google it."

## 1. Tokenizing text

In [None]:
from nltk import word_tokenize
text_tokenized = word_tokenize(text.lower())
text_tokenized

## 2. Importing and exploring the words corpus

In [None]:
from nltk.corpus import words
nltk.download('words')
words.readme().replace('\n', ' ')

In [None]:
words

In [None]:
words.fileids()

In [None]:
words.words('en')[:10]

In [None]:
words.words('en-basic')[:10]

In [None]:
len(words.words('en'))

In [None]:
len(words.words('en-basic'))

## 3. Finding unusual words

In [None]:
english_vocab = set(w.lower() for w in words.words())
text_vocab = set(w.lower() for w in text_tokenized if w.isalpha()) # Note .isalpha() removes punctuation tokens. However, tokens with a hyphen like 'browser-based' are totally skipped over because .isalpha() would be false.
unusual = text_vocab.difference(english_vocab)
unusual

We can train a classifier to work out which suffixes are most informative for POS tagging. We can begin by finding out what the most common suffixes are

In [None]:
from nltk.corpus import brown
from nltk import FreqDist

suffix_fdist = FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
    
suffix_fdist

In [None]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
common_suffixes[:10]

Next, we'll define a feature extractor function which checks a given word for these suffixes:

In [None]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

pos_features('test')

Now that we've defined our feature extractor, we can use it to train a new decision tree classifier:

In [None]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
featuresets[0]

**Decision trees**


*   **Node**: Represents each feature

*   **Leaf nodes**: Represents labels

*   **Branches**: represent conjunctions of features that lead to those class



<img src="https://forum.huawei.com/enterprise/en/data/attachment/forum/202103/24/190400o09x7rhnnhy2yon7.png?1.png" alt="seq2seq" style="width: 60%"/>

In [None]:
from nltk import DecisionTreeClassifier
from nltk.classify import accuracy

cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]

In [None]:
#classifier = DecisionTreeClassifier.train(train_set) # NLTK is a teaching toolkit which is not really optimized for speed. Therefore, this may take forever. For speed, use scikit-learn for the classifiers.

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
#bayes_classifier = SklearnClassifier(BernoulliNB(), sparse=False).train(train_set)
#svm_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
classifier = SklearnClassifier(DecisionTreeClassifier(), sparse=False).train(train_set)
#print(accuracy(classifier, test_set) , accuracy(bayes_classifier, test_set))
#print(accuracy(classifier, test_set) , accuracy(svm_classifier, test_set), accuracy(bayes_classifier, test_set))

In [None]:
accuracy(classifier, test_set)

In [None]:
classifier.classify(pos_features('cats'))

In [None]:
classifier.classify(pos_features('cat'))

In [None]:
len(train_set)

In [None]:
# Use nltk decision tree
# We should reduce the training data (90499 takes too long with nltk's implementation of decision tree)
nltk_dt = nltk.DecisionTreeClassifier.train(train_set[:10000])

In [None]:
accuracy(nltk_dt, test_set)

In [None]:
nltk_dt.classify(pos_features('cats'))

In [None]:
classifier.classify(pos_features('cat'))

In [None]:
print(nltk_dt.pseudocode(depth=5))

In [None]:
english_vocab = set(w.lower() for w in words.words())
text_vocab = set(w.lower() for w in text_tokenized if w.isalpha()) # Note .isalpha() removes punctuation tokens. However, tokens with a hyphen like 'browser-based' are totally skipped over because .isalpha() would be false.
unusual = text_vocab.difference(english_vocab)
unusual