# Dr. Jennifer Sleeman
# Exploring Term Extraction Methods
# jennifer.sleeman@northwestern.edu


In [1]:
#Just in case you need to install the appropriate packages uncomment these lines
!pip3 install phrasemachine
!pip3 install nltk
!pip3 install rake_nltk


Collecting phrasemachine
  Downloading phrasemachine-1.0.7.tar.gz (2.7 MB)
[K     |████████████████████████████████| 2.7 MB 2.2 MB/s eta 0:00:01
Building wheels for collected packages: phrasemachine
  Building wheel for phrasemachine (setup.py) ... [?25ldone
[?25h  Created wheel for phrasemachine: filename=phrasemachine-1.0.7-py3-none-any.whl size=2694881 sha256=a7158aa9235613f2f81d49f470e60927c8a6fd6adfbfb0aacf18ba760447c1ee
  Stored in directory: /Users/kevhhu/Library/Caches/pip/wheels/2d/9e/9c/e59fe753d4541789d76201dc96d02927baccc82069679097dc
Successfully built phrasemachine
Installing collected packages: phrasemachine
Successfully installed phrasemachine-1.0.7
Collecting rake_nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Collecting nltk<4.0.0,>=3.6.2
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 2.3 MB/s eta 0:00:01
Collecting regex>=2021.8.3
  Downloading regex-2023.12.25-cp38-cp38-macosx_10_9_x86_64.whl (29

In [2]:
import nltk

# Tokenizer models needed by nltk.word_tokenize / nltk.sent_tokenize.
# The install cell does not pin an NLTK version: older releases load the
# 'punkt' resource, while recent releases look for 'punkt_tab' instead, so
# both are fetched to keep the notebook runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/kevhhu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import phrasemachine
import nltk
from rake_nltk import Rake
import re
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import ngrams, FreqDist

In [4]:
# Download the NLTK corpora used further down: 'stopwords' is read by
# stopwords.words("english"); 'wordnet' is presumably what WordNetLemmatizer
# relies on (confirm against the NLTK docs).
# Only run this once, they will be downloaded.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevhhu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/kevhhu/nltk_data...


True

In [8]:
# Example input: a multi-paragraph restaurant review, kept as one string
# (note the leading space). It is split into sentences later and fed to
# each term extractor.
document1 = """ For their sauces alone, I'm willing to say Via Rosa puts out some of the best-tasting food on the island. All of the sauces I've tried there taste like something homemade, like they took a good deal of time and effort to make. Definitely a step above the average around here. That said, for the price, the pizza and pasta itself fell a little bit short of expectations.

The pizza dough itself is very thin, very crisp, and has a nice flavor. Their marinara, like all their other sauces, is bright and tasty. Despite this, their margherita pizza was seriously underwhelming - the way the basil was on the pizza was a little sad but the real killer was the apparent lack of fresh mozzarella - it tasted a lot more like a parmesan pizza than anything else. Just seemed like a really strange thing to skimp out on, and based on other pictures it seems like going super light on the cheese may just be the standard here. In any case, I think there are better places to go for pizza on the island, thin-crust or otherwise (especially for the price!!)

I didn't really take any particular issue with the noodles themselves (they weren't *unpleasant* to eat or anything) but the texture wasn't amazing and they've just never really impressed. Additionally, it's a little strange how long it took to get food given how fast fresh pasta takes to cook, their having sauces pre-prepared, and the size of their kitchen. But if you order ahead of time this probably isn't much of an issue - they seem to be doing a lot of take-out business.

Cannoli was ok, not very sweet. Nice of them to have it at such a low price, one of my dining companions felt sure they were store-bought but I would probably still get one again.

Altogether, it's far from bad food, and the location is pretty nice - they give you a good deal of options for take-home food, I like their "market" section. I'm not sure if I'd describe it as a "must-try" place - even at its best, it feels expensive for what you get. Still, I'd say it's one of the better restaurants on Bainbridge if money is no object. Kind of an iconic little location (Rolling Bay) to visit off the beaten path if you're a tourist. People definitely like this place for a reason."""

In [9]:
# Build a set of English stop words from the NLTK stopwords corpus.
# NOTE(review): remove_stop_words() builds its own local set with the same
# name, so this module-level variable is not what that function reads.
stop_words = set(stopwords.words("english"))


In [10]:
# Pre-process dataset to remove punctuation
def remove_punctuation(in_text):
    """Replace every character that is not an ASCII letter with a space.

    The input is passed through ``str()`` first, so non-string values are
    accepted. Digits, whitespace and non-ASCII letters are blanked out as
    well as punctuation. Each replaced character becomes exactly one space,
    so the result is as long as ``str(in_text)``.

    Returns:
        The cleaned string.
    """
    as_string = str(in_text)
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', as_string)

In [11]:
# Pre-process dataset to lower case it
def lower_case(in_text):
    """Return a lowercase copy of ``in_text`` (via ``str.lower``)."""
    lowered = in_text.lower()
    return lowered

In [12]:
# Pre-process dataset to remove tags
def remove_tags(in_text):
    """Replace markup tags in ``in_text`` with an empty-tag placeholder.

    The original pattern was written with HTML entities (``&lt;`` / ``&gt;``)
    in place of the angle brackets, so it only matched entity-escaped tags
    and never literal ones such as ``<b>``. Both forms are handled here:

    * entity-escaped tags (``&lt;b&gt;``) become ``" &lt;&gt; "``, exactly
      as before;
    * literal tags (``<b>``, ``</b>``) become ``" <> "``.

    Matching is non-greedy and does not cross newlines. The placeholder is
    padded with spaces so the neighbouring words stay separated.

    Returns:
        The text with every tag replaced by its placeholder.
    """
    # Entity-escaped tags: behaviour unchanged from the original code.
    text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", in_text)
    # Literal tags: the case the escaped pattern could never match.
    text = re.sub("</?.*?>", " <> ", text)
    return text

In [13]:
# Pre-process dataset to remove special characters and digits
def remove_special_chars_and_digits(in_text):
    """Collapse each run of digits and non-word characters into one space.

    Any maximal sequence made of digits (``\\d``) and/or non-word characters
    (``\\W``: punctuation, whitespace, symbols) is replaced by one space.
    Letters and underscores are left alone.

    Returns:
        The cleaned string.
    """
    digit_or_nonword_run = re.compile("(\\d|\\W)+")
    return digit_or_nonword_run.sub(" ", in_text)


In [14]:
# Pre-process dataset to apply Stemming
def apply_stemming(in_text):
    """Stem every token of ``in_text`` with NLTK's ``PorterStemmer``.

    The text is split with ``nltk.word_tokenize``, each token is stemmed,
    and the stems are joined back with single spaces.

    Returns:
        A single string of space-separated stems.
    """
    porter = PorterStemmer()
    tokens = nltk.word_tokenize(in_text)
    return ' '.join(map(porter.stem, tokens))

In [15]:
# Pre-process dataset to apply Lemmatization
def apply_lemmatization(in_text):
    """Lemmatize every token of ``in_text`` with NLTK's ``WordNetLemmatizer``.

    The text is split with ``nltk.word_tokenize``; each token is lemmatized
    with the lemmatizer's default settings (no part-of-speech is passed),
    and the results are joined back with single spaces.

    Returns:
        A single string of space-separated lemmas.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(in_text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [16]:
# Remove stop words
def remove_stop_words(in_text):
    """Tokenize ``in_text`` and drop tokens found in NLTK's English stop words.

    The previous version built the filtered list twice: a list comprehension
    whose result was thrown away, then an equivalent loop. It is now built
    once, and the returned value is unchanged.

    Matching is an exact, case-sensitive membership test against
    ``stopwords.words('english')``. Capitalised tokens are therefore only
    removed if the list contains that exact spelling; lower-case the text
    first if that matters.

    Returns:
        A list of the remaining tokens, in their original order. Unlike the
        other pre-processing helpers, this returns a list, not a string.
    """
    stop_words = set(stopwords.words('english'))
    return [w for w in word_tokenize(in_text) if w not in stop_words]

In [17]:
# Run Phrase Machine
def run_phrase_machine(in_text):
    """Extract phrases from ``in_text`` with ``phrasemachine.get_phrases``.

    Returns:
        Whatever ``get_phrases`` returns, unmodified. The exploration cell
        reads its ``"counts"`` entry and iterates over that mapping's keys.
    """
    return phrasemachine.get_phrases(in_text)

In [18]:
#Run Rake Keyword Extractor
def run_rake(in_text):
    """Extract key phrases from ``in_text`` with a default-configured ``Rake``.

    A fresh ``Rake`` instance is created on every call, so no state is
    shared between calls.

    Returns:
        The phrases as returned by ``Rake.get_ranked_phrases()``.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(in_text)
    return extractor.get_ranked_phrases()

In [19]:
# Run NLTK Tokenizer
def run_nltk_tokenizer(in_text):
    """Split ``in_text`` into tokens with ``nltk.word_tokenize``.

    Returns:
        The tokens produced by the tokenizer, unmodified.
    """
    return nltk.word_tokenize(in_text)

In [20]:
# Run NLTK Sentence Tokenizer
def run_nltk_sent_tokenizer(in_corpus):
    """Split ``in_corpus`` into sentences with ``nltk.sent_tokenize``.

    Returns:
        The sentences produced by the tokenizer, unmodified.
    """
    return nltk.sent_tokenize(in_corpus)

In [21]:
#Run word-ngram Tokenizer
def run_nltk_tokenizer_word_ngrams(in_text, ngram_size):
    """Build word n-grams of length ``ngram_size`` from ``in_text``.

    The text is tokenized with ``nltk.word_tokenize`` and the tokens are
    grouped with ``nltk.ngrams``. Punctuation tokens take part in the
    n-grams like any other token.

    Returns:
        A list of strings, each n-gram's tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(in_text)
    joined = []
    for gram in ngrams(tokens, ngram_size):
        joined.append(' '.join(gram))
    return joined

In [22]:
# Get frequency distribution
def get_freq_dist(terms):
    """Count how often each term occurs in ``terms``.

    The previous version first assigned an empty ``dict`` to the result
    variable and then overwrote it straight away; that dead assignment is
    removed.

    Args:
        terms: An iterable of hashable terms (the notebook passes a list of
            phrase strings).

    Returns:
        An ``nltk.FreqDist`` built from ``terms``.
    """
    return FreqDist(terms)

In [23]:
# Run this first to get sentences from text.
# The exploration cells further down iterate over `sentences`, so they
# depend on this cell having been executed.
sentences=run_nltk_sent_tokenizer(document1)

In [26]:
# Explore different extractors and different preprocessing techniques.
# For each sentence, print the sentence itself and then one titled section
# per extractor / preprocessing variant, always in the same order.

# Sections shown before the phrase extractors: raw tokens and word n-grams.
token_sections = [
    ("===================NLTK Tokenizer===================",
     run_nltk_tokenizer),
    ("===================NLTK Word NGRAM Tokenizer 2 words===================",
     lambda text: run_nltk_tokenizer_word_ngrams(text, 2)),
    ("===================NLTK Word NGRAM Tokenizer 3 words===================",
     lambda text: run_nltk_tokenizer_word_ngrams(text, 3)),
]

# Sections shown after the phrase extractors: the plain tokenizer again as a
# baseline, followed by the tokens obtained after each preprocessing step.
preprocessing_sections = [
    ("===================NLTK Tokenizer===================",
     run_nltk_tokenizer),
    ("===================NLTK Tokenizer LOWER CASE===================",
     lambda text: run_nltk_tokenizer(lower_case(text))),
    ("===================NLTK Tokenizer REMOVE STOP WORDS===================",
     remove_stop_words),
    ("===================NLTK Tokenizer REMOVED PUNCTUATION===================",
     lambda text: run_nltk_tokenizer(remove_punctuation(text))),
    ("===================NLTK Tokenizer REMOVED TAGS===================",
     lambda text: run_nltk_tokenizer(remove_tags(text))),
    ("===================NLTK Tokenizer REMOVED CHARS AND DIGITS===================",
     lambda text: run_nltk_tokenizer(remove_special_chars_and_digits(text))),
    ("===================NLTK Tokenizer STEMMING APPLIED===================",
     lambda text: run_nltk_tokenizer(apply_stemming(text))),
    ("===================NLTK Tokenizer LEMMATIZATION APPLIED===================",
     lambda text: run_nltk_tokenizer(apply_lemmatization(text))),
]

for sentence in sentences:
    print(sentence)

    for header, extract in token_sections:
        print(header)
        print(extract(sentence))

    # Phrase Machine terms are printed one per line rather than as a list.
    print("===================Phrase Machine===================")
    phrases = run_phrase_machine(sentence)
    for term in phrases["counts"]:
        print(term)

    print("===================Rake===================")
    print(run_rake(sentence))

    for header, extract in preprocessing_sections:
        print(header)
        print(extract(sentence))

 For their sauces alone, I'm willing to say Via Rosa puts out some of the best-tasting food on the island.
['For', 'their', 'sauces', 'alone', ',', 'I', "'m", 'willing', 'to', 'say', 'Via', 'Rosa', 'puts', 'out', 'some', 'of', 'the', 'best-tasting', 'food', 'on', 'the', 'island', '.']
['For their', 'their sauces', 'sauces alone', 'alone ,', ', I', "I 'm", "'m willing", 'willing to', 'to say', 'say Via', 'Via Rosa', 'Rosa puts', 'puts out', 'out some', 'some of', 'of the', 'the best-tasting', 'best-tasting food', 'food on', 'on the', 'the island', 'island .']
['For their sauces', 'their sauces alone', 'sauces alone ,', 'alone , I', ", I 'm", "I 'm willing", "'m willing to", 'willing to say', 'to say Via', 'say Via Rosa', 'Via Rosa puts', 'Rosa puts out', 'puts out some', 'out some of', 'some of the', 'of the best-tasting', 'the best-tasting food', 'best-tasting food on', 'food on the', 'on the island', 'the island .']
via rosa
best-tasting food
best-tasting food on the island
food on th

In [27]:
# Gather the terms one extractor finds across all sentences, then count them.
all_terms = []
for sentence in sentences:
    print(sentence)
    # Swap in a different extractor here to compare results.
    all_terms.extend(run_rake(sentence))

# Frequency distribution across every collected term; shown as the cell output.
fd = get_freq_dist(all_terms)
fd


 For their sauces alone, I'm willing to say Via Rosa puts out some of the best-tasting food on the island.
All of the sauces I've tried there taste like something homemade, like they took a good deal of time and effort to make.
Definitely a step above the average around here.
That said, for the price, the pizza and pasta itself fell a little bit short of expectations.
The pizza dough itself is very thin, very crisp, and has a nice flavor.
Their marinara, like all their other sauces, is bright and tasty.
Despite this, their margherita pizza was seriously underwhelming - the way the basil was on the pizza was a little sad but the real killer was the apparent lack of fresh mozzarella - it tasted a lot more like a parmesan pizza than anything else.
Just seemed like a really strange thing to skimp out on, and based on other pictures it seems like going super light on the cheese may just be the standard here.
In any case, I think there are better places to go for pizza on the island, thin-cr

FreqDist({'like': 4, 'pizza': 3, 'island': 2, 'best': 2, 'good deal': 2, 'took': 2, 'time': 2, 'sauces': 2, 'thin': 2, 'lot': 2, ...})