# NLP Pipeline

In [88]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

## 1. Cleaning

In [25]:
# Fetch the cast-and-crew page.
# - timeout: without one, requests waits indefinitely on a stalled connection
#   and the cell never returns.
# - raise_for_status(): stop here on an HTTP error response instead of letting
#   the following cells parse an error page as if it were the cast list.
r = requests.get("https://www.rottentomatoes.com/m/et_the_extraterrestrial/cast-and-crew", timeout=30)
r.raise_for_status()
r.text

'<!DOCTYPE html>\n<html lang="en" dir="ltr" xmlns="http://www.w3.org/1999/xhtml" prefix="fb: http://www.facebook.com/2008/fbml og: http://opengraphprotocol.org/schema/">\n    <head prefix="og: http://ogp.me/ns# flixstertomatoes: http://ogp.me/ns/apps/flixstertomatoes#">\n\n        \n        \n            <script\n                charset="UTF-8"\n                crossorigin="anonymous"\n                data-domain-script="7e979733-6841-4fce-9182-515fac69187f"\n                integrity="sha384-TKdmlzVmoD70HzftTw4WtOzIBL5mNx8mXSRzEvwrWjpIJ7FZ/EuX758yMDWXtRUN"\n                src="https://cdn.cookielaw.org/consent/7e979733-6841-4fce-9182-515fac69187f/otSDKStub.js"\n                type="text/javascript"\n            >\n            </script>\n            <script type="text/javascript">\n                function OptanonWrapper() { }\n            </script>\n        \n\n        \n            <script\n                ccpa-opt-out-ids="USP"\n                ccpa-opt-out-geo="US"\n             

In [26]:
# Remove HTML tags: parse the raw response body with the lxml parser, then
# display only the text content of the document. Whitespace is not collapsed
# here, so the result still contains long runs of newlines.
soup = BeautifulSoup(r.text, "lxml")
soup.get_text()

"\n\n\n\n\n\n\n\n\n\nE.T. the Extra-Terrestrial | Cast and Crew | Rotten Tomatoes\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'\n\n\n        \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSigned in\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to Main Content\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        Cancel\n      \n\n\n\n\n\n\n\n        About Rotten Tomatoes®\n      \n\n\n\n        Critics\n      \n\n\n\n\n\n            Login/signup\n          \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          \xa0Wants to See\n        \n\n\n\n          \xa0Ratings\n        \nProfile\nAccount\nLog Out\n\n\n\n\n\n\n        Movies\n      \n\n\nMovies in theaters\n\n\nOpening This Week\n\n\nTop Box Office\n\n\nComing Soon to Theaters\n\n\nCertified Fresh Movies\n\n\n\n\nMovies at Home\n\n\nFandango at Home\n\n\nPeacock\n\n\nNetflix\n\n\nApple TV+\n\n\nPrime Video\n\n\nMost Popular Streaming movies\n\n\nCertified Fresh movies\n\n\nBrowse all\n\n\n\n\nM

In [29]:
# Find cast crew summaries: collect every <cast-and-crew-card> element in the
# parsed page and report how many were found.
crew = soup.find_all('cast-and-crew-card')
print('Number of people in the cast crew:', len(crew))

Number of people in the cast crew: 20


In [30]:
# Inspect the first crew member to find which tags hold the member's name and role.
# prettify() re-indents the element's markup, one tag per line, for easier reading.
print(crew[0].prettify())

<cast-and-crew-card data-castandcrewmanager="card" data-role="all,crew" mediaurl="/celebrity/steve_spielberg" skeleton="panel">
 <rt-img loading="lazy" slot="poster" src="https://resizing.flixster.com/y_qsV1N0tCBNNcdHhCPZqbH96so=/68x102/v2/https://resizing.flixster.com/-XZAfHZM39UwaGJIFWKAE8fS0ak=/v3/t/assets/1672_v9_bc.jpg">
 </rt-img>
 <rt-text context="label" slot="title">
  Steven Spielberg
 </rt-text>
 <rt-text slot="characters">
 </rt-text>
 <rt-text slot="credits">
  Director, Producer
 </rt-text>
</cast-and-crew-card>



In [37]:
# Look for the tags that contain the person's name and role, then use find_all
# on the card to pull out those elements.
# In the card printed above, the <rt-text> children appear in the order
# title (name), characters, credits (role), so index 0 is the name.
# get_text() returns the text with its surrounding newlines/indentation;
# strip() removes that padding.
# Extract name
crew[0].find_all('rt-text')[0].get_text().strip()

'Steven Spielberg'

In [39]:
# Extract role: in the card printed above, the third <rt-text> (index 2) is the
# one with slot="credits"; strip() removes the surrounding whitespace.
crew[0].find_all('rt-text')[2].get_text().strip()

'Director, Producer'

In [40]:
# Collect names and roles of ALL member listings
name_role = []
for summary in crew:
    # Select each field by its `slot` attribute rather than by position: the
    # positional lookup ([0] / [2]) raises IndexError or picks the wrong field
    # as soon as a card has a missing or extra <rt-text> child, and it also
    # ran find_all twice per card.
    name_tag = summary.find('rt-text', attrs={'slot': 'title'})
    role_tag = summary.find('rt-text', attrs={'slot': 'credits'})

    # A card that lacks one of the fields contributes an empty string instead
    # of aborting the whole collection.
    name = name_tag.get_text().strip() if name_tag is not None else ''
    role = role_tag.get_text().strip() if role_tag is not None else ''

    # append name and role of each summary to name_role list
    name_role.append((name, role))

print(len(name_role), "actors found in cast crew. Sample:")
name_role[:5]

20 actors found in cast crew. Sample:


[('Steven Spielberg', 'Director, Producer'),
 ('Henry Thomas', 'Actor'),
 ('Dee Wallace', 'Actor'),
 ('Peter Coyote', 'Actor'),
 ('Drew Barrymore', 'Actor')]

## 2. Normalization

In [41]:
# Sample text used to demonstrate normalization (mixed case, punctuation, a digit).
text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?"
print(text)

The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?


In [42]:
# Convert to lowercase.
# NOTE: this rebinds `text`, so the following cells see the lower-cased string.
text = text.lower()
print(text)

the first time you see the second renaissance it may look boring. look at it at least twice and definitely watch part 2. it will change your view of the matrix. are the human people the ones who started the war ? is ai a bad thing ?


In [44]:
# Remove punctuation characters: every character that is not an ASCII letter or
# digit is replaced by a single space (replaced, not deleted, so adjacent words
# are never glued together). Existing spaces are also matched and stay spaces.
text = re.sub(r"[^a-zA-Z0-9]", " ", text)
print(text)

the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part 2  it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing  


## 3. Tokenization

In [50]:
# Download the 'punkt_tab' tokenizer data into the local nltk_data directory.
# NOTE(review): presumably required by word_tokenize / sent_tokenize below — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/hieu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [48]:
# Sample text for tokenization; it contains an abbreviation ("Dr.") whose period
# does not end a sentence.
text = "Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers."
print(text)

Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.


In [51]:
# Split text into words using NLTK.
# As the output shows, punctuation becomes separate tokens while the
# abbreviation "Dr." is kept as one token.
words = word_tokenize(text)
print(words)

['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']


In [52]:
# Split text into sentences.
# As the output shows, the period after "Dr" is not treated as a sentence boundary.
sentences = sent_tokenize(text)
print(sentences)

['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']


## 4. Stop Words

In [54]:
# Reset `text` to the original review sample for the stop-word demonstration.
text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?"
print(text)

The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?


In [55]:
# Normalize text: lower-case it, then replace every character that is not an
# ASCII letter or digit with a space.
text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

In [56]:
# Tokenize text: split the normalized string into word tokens with NLTK.
words = word_tokenize(text)
print(words)

['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']


In [57]:
# Remove stop words
# Load the English stop-word list once and keep it in a set. Calling
# stopwords.words("english") inside the comprehension rebuilt the list for every
# token and then scanned it linearly for each membership test.
english_stopwords = set(stopwords.words("english"))
words = [w for w in words if w not in english_stopwords]
print(words)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'started', 'war', 'ai', 'bad', 'thing']


In [58]:
# Show the full English stop-word list that the filter above compares against.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## 5. POS and NER
### 5.1. POS

In [62]:
# Download the 'averaged_perceptron_tagger_eng' tagger data into nltk_data.
# NOTE(review): presumably the model behind pos_tag below — confirm.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/hieu/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [63]:
# "lie" appears twice with different grammatical roles; the output shows the
# tagger labelling the first as a verb (VBP) and the second as a noun (NN).
text = "I always lie down to tell a lie."
# tokenize text
sentence = word_tokenize(text)
# tag each word with part of speech
pos_tag(sentence)

[('I', 'PRP'),
 ('always', 'RB'),
 ('lie', 'VBP'),
 ('down', 'RP'),
 ('to', 'TO'),
 ('tell', 'VB'),
 ('a', 'DT'),
 ('lie', 'NN'),
 ('.', '.')]

### 5.2. NER

In [67]:
# Download the 'maxent_ne_chunker_tab' and 'words' packages into nltk_data.
# NOTE(review): presumably both are needed by ne_chunk below — confirm.
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /home/hieu/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /home/hieu/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [68]:
# Sample sentence containing a person, an organization and a place.
text = "Antonio joined Udacity Inc. in California."
# tokenize, pos tag, then recognize named entities in text
# (ne_chunk takes the POS-tagged tokens and returns a tree whose labelled
# subtrees, e.g. PERSON / ORGANIZATION / GPE in the output, are the entities)
tree = ne_chunk(pos_tag(word_tokenize(text)))
print(tree)

(S
  (PERSON Antonio/NNP)
  joined/VBD
  (ORGANIZATION Udacity/NNP Inc./NNP)
  in/IN
  (GPE California/NNP)
  ./.)


In [69]:
# Define a custom grammar (context-free, written as production rules).
# Upper-case symbols are non-terminals; quoted strings are the literal words.
# A prepositional phrase (PP) can attach either to a noun phrase
# (NP -> Det N PP) or to a verb phrase (VP -> VP PP), so a sentence with a PP
# can have more than one parse.
my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
# Build a chart parser over the grammar defined above.
parser = nltk.ChartParser(my_grammar)

In [70]:
# Parse a sentence and print every parse tree the grammar allows.
# The output shows two trees: "in my pajamas" attached to the verb phrase, and
# attached to "an elephant".
# NOTE: this rebinds `sentence` and `tree`, which earlier cells also assigned.
sentence = word_tokenize("I shot an elephant in my pajamas")
for tree in parser.parse(sentence):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


## 6. Stemming and Lemmatization

In [71]:
# Start again from the raw review sample.
text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?"

# Normalize text: lower-case, then turn every character that is not an ASCII
# letter or digit into a space.
non_alphanumeric = re.compile(r"[^a-zA-Z0-9]")
text = non_alphanumeric.sub(" ", text.lower())

# Tokenize text: a plain whitespace split is enough once punctuation is gone.
words = text.split()
print(words)

['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']


In [72]:
# Remove stop words
# Build the stop-word set once; evaluating stopwords.words("english") inside the
# comprehension rebuilt the list for every token and did a linear scan each time.
english_stopwords = set(stopwords.words("english"))
words = [w for w in words if w not in english_stopwords]
print(words)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'started', 'war', 'ai', 'bad', 'thing']


### 6.1. Stemming

In [74]:
# Reduce words to their stems
# Create the stemmer once and reuse it; the original constructed a new
# PorterStemmer for every word.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

['first', 'time', 'see', 'second', 'renaiss', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definit', 'watch', 'part', '2', 'chang', 'view', 'matrix', 'human', 'peopl', 'one', 'start', 'war', 'ai', 'bad', 'thing']


### 6.2. Lemmatization

In [76]:
# Reduce words to their root form
# Create the lemmatizer once and reuse it; the original constructed a new
# WordNetLemmatizer for every word.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w) for w in words]
print(lemmed)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'one', 'started', 'war', 'ai', 'bad', 'thing']


## 7. Bag of Words and TF-IDF

In [77]:
# Small corpus for the vectorizer examples: five short documents, one string each.
corpus = ["The first time you see The Second Renaissance it may look boring.",
        "Look at it at least twice and definitely watch part 2.",
        "It will change your view of the matrix.",
        "Are the human people the ones who started the war?",
        "Is AI a bad thing ?"]

In [78]:
# Shared resources for `tokenize`, created once at module level so they are not
# rebuilt on every call.
stop_words = stopwords.words("english")  # English stop-word list
lemmatizer = WordNetLemmatizer()

In [79]:
def tokenize(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced with a space, and the result is split with
    ``word_tokenize``. Tokens found in the module-level ``stop_words`` are
    dropped; the rest are passed through the module-level ``lemmatizer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    # normalize case and strip punctuation
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    lemmas = []
    for token in word_tokenize(cleaned):
        # stop words are filtered on the raw token, before lemmatization
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return lemmas

### 7.1. `CountVectorizer` (Bag of Words)

In [82]:
# initialize count vectorizer object
# `tokenize` replaces the built-in tokenization, so the default `token_pattern`
# would never be used; setting it to None states that explicitly and avoids
# scikit-learn's "token_pattern will not be used" UserWarning.
vect = CountVectorizer(tokenizer=tokenize, token_pattern=None)

# get counts of each token (word) in text data
# (rows are documents of `corpus`, columns are vocabulary tokens)
X = vect.fit_transform(corpus)

# convert sparse matrix to numpy array to view
X.toarray()

array([[0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0]])

In [83]:
# view token vocabulary: maps each token to its column index in the count
# matrix above (the values are column positions, not occurrence counts)
vect.vocabulary_

{'first': 6,
 'time': 20,
 'see': 17,
 'second': 16,
 'renaissance': 15,
 'may': 11,
 'look': 9,
 'boring': 3,
 'least': 8,
 'twice': 21,
 'definitely': 5,
 'watch': 24,
 'part': 13,
 '2': 0,
 'change': 4,
 'view': 22,
 'matrix': 10,
 'human': 7,
 'people': 14,
 'one': 12,
 'started': 18,
 'war': 23,
 'ai': 1,
 'bad': 2,
 'thing': 19}

### 7.2. TF-IDF

In [85]:
# TfidfTransformer
# initialize tf-idf transformer object
# (smooth_idf=False overrides the library default for idf smoothing)
transformer = TfidfTransformer(smooth_idf=False)

# use counts from count vectorizer results to compute tf-idf values
# NOTE: `X` must still be the count matrix from the CountVectorizer cell; a
# later cell rebinds `X`, so run the cells in order.
tfidf = transformer.fit_transform(X)

# convert sparse matrix to numpy array to view
tfidf.toarray()

array([[0.        , 0.        , 0.        , 0.36419547, 0.        ,
        0.        , 0.36419547, 0.        , 0.        , 0.26745392,
        0.        , 0.36419547, 0.        , 0.        , 0.        ,
        0.36419547, 0.36419547, 0.36419547, 0.        , 0.        ,
        0.36419547, 0.        , 0.        , 0.        , 0.        ],
       [0.39105193, 0.        , 0.        , 0.        , 0.        ,
        0.39105193, 0.        , 0.        , 0.39105193, 0.28717648,
        0.        , 0.        , 0.        , 0.39105193, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.39105193, 0.        , 0.        , 0.39105193],
       [0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.57735027, 0.

In [89]:
# TfidfVectorizer
# initialize tf-idf vectorizer object
# NOTE: constructed with default settings, i.e. without the custom `tokenize`
# function and without smooth_idf=False, so its columns and values are not
# directly comparable to the CountVectorizer + TfidfTransformer result above
# (the output here has more columns per row).
vectorizer = TfidfVectorizer()

# compute bag of word counts and tf-idf values
# NOTE: this rebinds `X`, replacing the count matrix from the CountVectorizer cell.
X = vectorizer.fit_transform(corpus)

# convert sparse matrix to numpy array to view
X.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.30298183, 0.        , 0.        , 0.30298183, 0.        ,
        0.        , 0.20291046, 0.        , 0.24444384, 0.        ,
        0.30298183, 0.        , 0.        , 0.        , 0.        ,
        0.30298183, 0.30298183, 0.30298183, 0.        , 0.40582093,
        0.        , 0.30298183, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.30298183, 0.        ],
       [0.        , 0.30015782, 0.        , 0.60031564, 0.        ,
        0.        , 0.        , 0.30015782, 0.        , 0.        ,
        0.        , 0.20101919, 0.30015782, 0.24216544, 0.        ,
        0.        , 0.        , 0.        , 0.30015782, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.30015782, 0.        , 0.        ,
        0.30015782, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.