# Imports

In [1]:
# Import classic and useful libraries
import pandas as pd 
import numpy as np 
import seaborn 
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from string import punctuation
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Some text

Define a list of words in order to observe the effects of the methods and treatments used.

In [2]:
my_list = ["cat","cats","lie","lying","run","running","city","cities","month","monthly","woman","women", 'better', "are"]

In [3]:
# Some text
sentenceA = 'A stemmer for English operating on the stem cat should identify such strings as cats, catlike, and catty.'
sentenceB= 'A stemming algorithm might also reduce the words fishing, fished and fisher to the stem fish.'
sentenceC = 'Stemming algorithms are very useful in NLP'
sentenceD = "But it will not help you to catch cats or fishes"

# Regroup sentences into a list 
sentences = [sentenceA, sentenceB, sentenceC, sentenceD]

# Introduction to word frequencies

In [5]:
# From 1st part of hands-on. Try to write functions when you write re-usable code.
def basic_cleaning(corpus):
    """ Apply basic cleaning to a text corpus
    """
    
    corpus = [review.lower() for review in corpus]

    token_corpus = [nltk.word_tokenize(review) for review in corpus]

    characters_to_remove = ["@", "/", "#", ".", ",", "!", "?", "(", ")", "-", "_", "’", "'",
                            "\"", ":", "'", "$", "%", '&', '£']
    stop_words_en = nltk.corpus.stopwords.words("english")
    my_stop_words = []
    
    all_stop_char = stop_words_en + characters_to_remove + my_stop_words
    corpus = [[token for token in review if token not in all_stop_char]
              for review in token_corpus]

    return corpus, all_stop_char

In [6]:
sentences_clean, _ = basic_cleaning(sentences)
sentences_clean

[['stemmer',
  'english',
  'operating',
  'stem',
  'cat',
  'identify',
  'strings',
  'cats',
  'catlike',
  'catty'],
 ['stemming',
  'algorithm',
  'might',
  'also',
  'reduce',
  'words',
  'fishing',
  'fished',
  'fisher',
  'stem',
  'fish'],
 ['stemming', 'algorithms', 'useful', 'nlp'],
 ['help', 'catch', 'cats', 'fishes']]

In [7]:
print(sentences[1])
print(sentences_clean[1])

A stemming algorithm might also reduce the words fishing, fished and fisher to the stem fish.
['stemming', 'algorithm', 'might', 'also', 'reduce', 'words', 'fishing', 'fished', 'fisher', 'stem', 'fish']


In [9]:
# Let's see what are all the words used in our sentences
vocab = list(set([item for sentence in sentences_clean for item in sentence]))
vocab

['strings',
 'catch',
 'stemming',
 'catlike',
 'algorithms',
 'words',
 'help',
 'operating',
 'stem',
 'english',
 'fishing',
 'stemmer',
 'cat',
 'nlp',
 'fisher',
 'fish',
 'algorithm',
 'also',
 'might',
 'reduce',
 'cats',
 'fishes',
 'identify',
 'fished',
 'useful',
 'catty']

In [10]:
# Initialize the vocabulary frequence to zero
vocab_freq = {word:0 for word in vocab}
vocab_freq

{'strings': 0,
 'catch': 0,
 'stemming': 0,
 'catlike': 0,
 'algorithms': 0,
 'words': 0,
 'help': 0,
 'operating': 0,
 'stem': 0,
 'english': 0,
 'fishing': 0,
 'stemmer': 0,
 'cat': 0,
 'nlp': 0,
 'fisher': 0,
 'fish': 0,
 'algorithm': 0,
 'also': 0,
 'might': 0,
 'reduce': 0,
 'cats': 0,
 'fishes': 0,
 'identify': 0,
 'fished': 0,
 'useful': 0,
 'catty': 0}

In [11]:
# Now let's calculate our word frequencies. 
# We iterate over words in our sentences, for each word we update the count
for sentence in sentences_clean:
    for word in sentence:
        if word in vocab: 
            vocab_freq[word] +=1
vocab_freq = {k: v for k, v in sorted(vocab_freq.items(), key=lambda item: item[1], reverse=True)}
vocab_freq

{'stemming': 2,
 'stem': 2,
 'cats': 2,
 'strings': 1,
 'catch': 1,
 'catlike': 1,
 'algorithms': 1,
 'words': 1,
 'help': 1,
 'operating': 1,
 'english': 1,
 'fishing': 1,
 'stemmer': 1,
 'cat': 1,
 'nlp': 1,
 'fisher': 1,
 'fish': 1,
 'algorithm': 1,
 'also': 1,
 'might': 1,
 'reduce': 1,
 'fishes': 1,
 'identify': 1,
 'fished': 1,
 'useful': 1,
 'catty': 1}

# Stemming

In [12]:
# It exists many different stemmer. Try them ! 

In [13]:
my_list = [
    "cat", "cats", "lie", "lying", "run", "running", "city", "cities",
    "month", "monthly", "woman", "women", "better", "best", "are", "am"
]

In [14]:
# Porter Stemmer
porter = nltk.PorterStemmer()
for word in my_list:
    print(porter.stem(word))

cat
cat
lie
lie
run
run
citi
citi
month
monthli
woman
women
better
best
are
am


In [15]:
# Lancaster Stemmer
lancaster = nltk.LancasterStemmer()
for word in my_list:
    print(lancaster.stem(word))

cat
cat
lie
lying
run
run
city
city
mon
month
wom
wom
bet
best
ar
am


In [17]:
def stemSentence(sentence, stemmer):
    """ Take a string input (sentence preferably), and a specific stemmer.
        Outputs the sentence in a string format after having applied the stemmer
    """
    
    token_words = nltk.word_tokenize(sentence)
    stem_sentence = []
    
    for word in token_words:
        stem_sentence.append(stemmer.stem(word))
        stem_sentence.append(" ")
    
    return "".join(stem_sentence)

In [18]:
# Compare differences

sentence="I have an important meeting today. The people I'm meeting with always make the right decisions"

print(stemSentence(sentence, lancaster))
print(stemSentence(sentence, porter))

i hav an import meet today . the peopl i 'm meet with alway mak the right decid 
i have an import meet today . the peopl i 'm meet with alway make the right decis 


In [20]:
# Look at what is happening on a french sentence
from nltk.stem.snowball import FrenchStemmer
french_stemmer = FrenchStemmer()

sentence="Ce matin je suis allé acheter une baguette à la Boulangerie puis je me suis régalé avant de venir en cours."

print(stemSentence(sentence, lancaster))
print(stemSentence(sentence, porter))
print(stemSentence(sentence, french_stemmer))

ce matin je sui allé achet un baguet à la boulangery pui je me sui régalé av de venir en cour . 
ce matin je sui allé achet une baguett à la boulangeri pui je me sui régalé avant de venir en cour . 
ce matin je suis allé achet une baguet à la boulanger puis je me suis régal avant de ven en cour . 


In [21]:
# Results may be hard to read 

## Effects on our Sentences 

In [22]:
# Let's see the effect of stemming on our vocabulary frequencies

In [23]:
stemmed_sentences = []
for sentence in sentences_clean:
    stem_sentence = []
    for word in sentence: 
        stem_sentence.append(lancaster.stem(word))
#         stem_sentence.append(lancaster.stem(word))
        
    stemmed_sentences.append(stem_sentence)

In [24]:
stemmed_sentences

[['stem',
  'engl',
  'op',
  'stem',
  'cat',
  'ident',
  'strings',
  'cat',
  'catlik',
  'catty'],
 ['stem',
  'algorithm',
  'might',
  'also',
  'reduc',
  'word',
  'fish',
  'fish',
  'fish',
  'stem',
  'fish'],
 ['stem', 'algorithm', 'us', 'nlp'],
 ['help', 'catch', 'cat', 'fish']]

In [25]:
#sentences

In [26]:
vocab = list(set([item for sentence in stemmed_sentences for item in sentence]))
print("Size of vocab:", len(vocab))
vocab_freq = {word:0 for word in vocab}

for sentence in stemmed_sentences:
    for word in sentence:
        if word in vocab: 
            vocab_freq[word] +=1
vocab_freq = {k: v for k, v in sorted(vocab_freq.items(), key=lambda item: item[1], reverse=True)}
vocab_freq

Size of vocab: 18


{'stem': 5,
 'fish': 5,
 'cat': 3,
 'algorithm': 2,
 'us': 1,
 'help': 1,
 'nlp': 1,
 'might': 1,
 'engl': 1,
 'catlik': 1,
 'reduc': 1,
 'strings': 1,
 'catch': 1,
 'word': 1,
 'catty': 1,
 'op': 1,
 'also': 1,
 'ident': 1}

In [27]:
# The stem "stem" appears in the first position, this subject is actually the one that is the most mentioned in the sentences. But it keeps stemmer as a different stem 
# The effect is also visible for the stem 'fish'

# Lemmatizer

In [28]:
nltk.download('wordnet')
nltk.download('omw-1.4')
wnlem = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [29]:
# my_list = ["cat","cats","lie","lying","run","running","city","cities","month","monthly","woman","women", 'better', "are"]

In [31]:
my_list

['cat',
 'cats',
 'lie',
 'lying',
 'run',
 'running',
 'city',
 'cities',
 'month',
 'monthly',
 'woman',
 'women',
 'better',
 'best',
 'are',
 'am']

In [30]:
for word in my_list:
    print(wnlem.lemmatize(word))

cat
cat
lie
lying
run
running
city
city
month
monthly
woman
woman
better
best
are
am


In [32]:
lemmatizer = nltk.WordNetLemmatizer()

# Create lematizing function


def lemmatize(sentence):
    tokens = nltk.word_tokenize(sentence)
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(
        lemmatizer.lemmatize(token, pos='a'), pos='v'), pos='n') for token in tokens]
    return " ".join(tokens)


# And display results
lemmer = lemmatize("I have an important meeting today. The people I'm meeting with always make the right decisions")
print(lemmer)

I have an important meet today . The people I 'm meet with always make the right decision


In [None]:
#  We recommend that you test different lemmatizers, and their parameters to observe their effects !  You can also use the nltk.pos_tag() method to find the "part of speech" tag

## Effect on our sentences

In [33]:
# Let's see the effect of lemmatizing on our vocaubulary frequencies

In [34]:
lem_sentences = []
lemmatizer = nltk.WordNetLemmatizer()
for sentence in sentences_clean:
    lem_sentence = []
    for token in sentence: 
        lem_sentence.append(lemmatizer.lemmatize(lemmatizer.lemmatize(
        lemmatizer.lemmatize(token, pos='a'), pos='v'), pos='n'))

        
    lem_sentences.append(lem_sentence)

In [35]:
lem_sentences

[['stemmer',
  'english',
  'operate',
  'stem',
  'cat',
  'identify',
  'string',
  'cat',
  'catlike',
  'catty'],
 ['stem',
  'algorithm',
  'might',
  'also',
  'reduce',
  'word',
  'fish',
  'fish',
  'fisher',
  'stem',
  'fish'],
 ['stem', 'algorithm', 'useful', 'nlp'],
 ['help', 'catch', 'cat', 'fish']]

In [36]:
vocab = list(set([item for sentence in lem_sentences for item in sentence]))
vocab_freq = {word:0 for word in vocab}
print("Size of vocab:", len(vocab))

for sentence in lem_sentences:
    for word in sentence:
        if word in vocab: 
            vocab_freq[word] +=1
vocab_freq = {k: v for k, v in sorted(vocab_freq.items(), key=lambda item: item[1], reverse=True)}
vocab_freq

Size of vocab: 20


{'stem': 4,
 'fish': 4,
 'cat': 3,
 'algorithm': 2,
 'operate': 1,
 'catch': 1,
 'catlike': 1,
 'help': 1,
 'word': 1,
 'string': 1,
 'english': 1,
 'stemmer': 1,
 'nlp': 1,
 'fisher': 1,
 'also': 1,
 'might': 1,
 'reduce': 1,
 'identify': 1,
 'useful': 1,
 'catty': 1}

In [37]:
# The results are differents here. 
# First the size of vocabulary is slightly larger 
# We can observe, that as expected, every words in the vocabulary is a proper english word

### Your turn ! 