In [1]:
import nltk

In [2]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [3]:
engstopwords = set(stopwords.words('english'))

In [4]:
print(engstopwords)

{'very', "shan't", 'am', 'i', 'up', "isn't", 'himself', 'they', 'this', 'why', 'own', 'hadn', 'should', "mustn't", 'above', 'such', "that'll", 'ain', 'can', "hadn't", 'your', "you've", 'because', "you're", 'had', 'with', 'where', 'no', "aren't", 'while', 'too', 'yourselves', 'their', 'ourselves', 'shan', 'a', 'during', 'yours', 'does', 'over', 'below', 'doesn', 'what', 'then', 'before', 'themselves', 'herself', 'so', 'when', "you'd", 'who', 'o', 'by', "should've", 'to', 'as', 'down', 'on', "she's", "wouldn't", 'in', 'hasn', 'nor', 'his', 'just', 'how', 'm', 'are', 'we', 'wasn', 'my', 'its', 'doing', 'is', 'each', 'and', 'again', 't', 'off', 'other', 'our', 'shouldn', 'was', 'those', 'here', "it's", 'both', "wasn't", 'any', 'been', 'weren', 'not', 'mustn', 'him', 'most', 'few', 'into', 's', 'he', 'be', "doesn't", 'now', 'me', "didn't", 'at', 'have', 'some', 'couldn', "mightn't", 'yourself', 'about', 'has', 'after', 'same', 'you', 'did', "weren't", 'ma', 'all', 'don', 'ours', 'that', 'wo

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
## Fucntion to conver pos_tag part of speech tags to WordNet tags for use with lemmatizer
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None 

In [7]:
# It takes raw text, tokenizes it. Removes stopwords and words that are one character long( like 'a' and punctuation.) Returns a list of tokens.
def tokenize(text):
    tokens = nltk.tokenize.word_tokenize(text)
    tokens = [word.lower() for word in tokens if len(word) > 1] 
    tokens = [word for word in tokens if word not in engstopwords] # remove stop words
    return tokens

In [8]:
# takes token list and gets the part of speech for each. Returns a list of tuples ('word', 'POS')
def get_pos(tokens):
    tagged = nltk.pos_tag(tokens)
    return tagged

Lemmatizes words. That is, it normalizes words to their most
basic form. For example, 'is', 'am' and 'are' are merged into 'be'.
Returns a list of lemmatized tokens.

Lemmatization is confined to parts of speech. Verb variations are lemmatized
to the root verb, and same with nouns.

Example: 'continues' and 'continuing' are merged to 'continue'
but 'continuation' stays the same.

In [10]:
def lemmatize(tagged_words):
    lemma_list = []
    for word, tag in tagged_words:
        wntag = get_wordnet_pos(tag)
        if wntag is None:
            lemma = lemmatizer.lemmatize(word) 
        else:
            lemma = lemmatizer.lemmatize(word, pos=wntag)
        lemma_list.append(lemma)
    return lemma_list

In [9]:
#Finds the most common words in a token list. Returns a list of tuples ('word', frequency)
def get_distfreq(tokens, top_n): 
    fdist = nltk.FreqDist(tokens)
    return fdist.most_common(top_n)

In [13]:
with open('A:\\Data Analysis Jupyter\\ML-Code\\NLP Concepts\\Tokenizing\\basictext.txt', encoding='utf-8') as f1:
    coderre = f1.read()

In [14]:
# Tokenize Coderre text and print first 40 tokens

coderre_tokens = tokenize(coderre)
coderre_tokens[:40]

['\ufeffcontinuing',
 'progress',
 'together',
 'strong',
 'plan',
 'sustainable',
 'city',
 'electoral',
 'platform',
 '2017-2021',
 'dear',
 'friends',
 'last',
 'four',
 'years',
 'proven',
 'anything',
 'working',
 'together',
 'nothing',
 'impossible',
 'since',
 'arrival',
 'city',
 'hall',
 'many',
 'achievements',
 'result',
 'strong',
 'commitment',
 'elected',
 'officials',
 'teamed',
 'community',
 'leaders',
 'montréal',
 'civil',
 'society',
 'everyone',
 'agreed']

In [15]:
# Get the part of speech for each token

coderre_pos = get_pos(coderre_tokens)
coderre_pos[:20]

[('\ufeffcontinuing', 'VBG'),
 ('progress', 'NN'),
 ('together', 'RB'),
 ('strong', 'JJ'),
 ('plan', 'NN'),
 ('sustainable', 'JJ'),
 ('city', 'NN'),
 ('electoral', 'JJ'),
 ('platform', 'NN'),
 ('2017-2021', 'JJ'),
 ('dear', 'JJ'),
 ('friends', 'NNS'),
 ('last', 'JJ'),
 ('four', 'CD'),
 ('years', 'NNS'),
 ('proven', 'RB'),
 ('anything', 'NN'),
 ('working', 'VBG'),
 ('together', 'RB'),
 ('nothing', 'NN')]

In [16]:
# Lemmatize the tokens using the wordnet POS converter function and get 10 most common words

coderre_lemma = lemmatize(coderre_pos)
get_distfreq(coderre_lemma, 10)

[('montréal', 99),
 ('city', 82),
 ('development', 36),
 ('new', 34),
 ('continue', 31),
 ('project', 28),
 ('develop', 28),
 ('work', 27),
 ('social', 26),
 ('public', 26)]