# Tokenization using spaCy

In [2]:
pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Initialize string
string = "Hello! I don't know what I'm doing here."

# Create a Doc object
doc = nlp(string)

# Generate list of tokens
tokens = [token.text for token in doc]
print(tokens)

['Hello', '!', 'I', 'do', "n't", 'know', 'what', 'I', "'m", 'doing', 'here', '.']


# Lemmatization using spaCy
- Convert word into its base form
    - reducing, reduces, reduced, reduction --> reduce
    - am, are, is --> be
    - n't --> not

In [10]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Initialize string
string = "Hello! I don't know what I'm doing here."

# Create a Doc object
doc = nlp(string)

# Generate list of lemmas
lemmas = [token.lemma_ for token in doc]
print(lemmas)

['hello', '!', 'I', 'do', 'not', 'know', 'what', 'I', 'be', 'do', 'here', '.']


# Text cleaning techniques
- Converting words into lowercase
- Removing leading and trailing whitespaces
- Removing punctuations
- Removing stopwords
- Expanding contractions
- Removing special characters and emojis

In [12]:
import pandas as pd
ted = pd.read_csv('datasets/ted.csv')
ted.head()

Unnamed: 0,transcript,url
0,"We're going to talk — my — a new lecture, just...",https://www.ted.com/talks/al_seckel_says_our_b...
1,"This is a representation of your brain, and yo...",https://www.ted.com/talks/aaron_o_connell_maki...
2,It's a great honor today to share with you The...,https://www.ted.com/talks/carter_emmart_demos_...
3,"My passions are music, technology and making t...",https://www.ted.com/talks/jared_ficklin_new_wa...
4,It used to be that if you wanted to get a comp...,https://www.ted.com/talks/jeremy_howard_the_wo...


In [13]:
# Get the stopwords for English
stopwords = nlp.Defaults.stop_words

In [14]:
# Function to preprocess text
def preprocess(text):
  	# Create Doc object
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Remove stopwords and non-alphabetic characters
    a_lemmas = [lemma for lemma in lemmas 
            if lemma.isalpha() and lemma not in stopwords]
    
    return ' '.join(a_lemmas)
  
# Apply preprocess to ted['transcript']
ted['transcript'] = ted['transcript'].apply(preprocess)
print(ted['transcript'])

0      talk new lecture TED I illusion create TED I t...
1      representation brain brain break left half log...
2      great honor today share Digital Universe creat...
3      passion music technology thing combination thi...
4      use want computer new program programming requ...
                             ...                        
495    today I unpack example iconic design perfect s...
496    brother belong demographic Pat percent accord ...
497    John Hockenberry great Tom I want start questi...
498    right moment kill More car internet little mob...
499    real problem math education right basically ha...
Name: transcript, Length: 500, dtype: object


# Parts of Speech Tagging

In [15]:
lotf = 'He found himself understanding the wearisomeness of this life, where every path was an improvisation and a considerable part of one’s waking life was spent watching one’s feet.'

In [16]:
# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(lotf)

# Generate tokens and pos tags
pos = [(token.text, token.pos_) for token in doc]
print(pos)

[('He', 'PRON'), ('found', 'VERB'), ('himself', 'PRON'), ('understanding', 'VERB'), ('the', 'DET'), ('wearisomeness', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('life', 'NOUN'), (',', 'PUNCT'), ('where', 'SCONJ'), ('every', 'DET'), ('path', 'NOUN'), ('was', 'AUX'), ('an', 'DET'), ('improvisation', 'NOUN'), ('and', 'CCONJ'), ('a', 'DET'), ('considerable', 'ADJ'), ('part', 'NOUN'), ('of', 'ADP'), ('one', 'NUM'), ('’s', 'NUM'), ('waking', 'VERB'), ('life', 'NOUN'), ('was', 'AUX'), ('spent', 'VERB'), ('watching', 'VERB'), ('one', 'NUM'), ('’s', 'NUM'), ('feet', 'NOUN'), ('.', 'PUNCT')]


In [17]:
nlp = spacy.load('en_core_web_sm')

# Returns number of proper nouns
def proper_nouns(text, model=nlp):
  	# Create doc object
    doc = model(text)
    # Generate list of POS tags
    pos = [token.pos_ for token in doc]
    
    # Return number of proper nouns
    return pos.count('PROPN')

print(proper_nouns("Abdul, Bill and Cathy went to the market to buy apples.", nlp))

3


## Noun and Propernoun usage in fake news

In [18]:
headlines = pd.read_csv('datasets/fakenews.csv')
headlines.head()

Unnamed: 0.1,Unnamed: 0,title,label
0,0,You Can Smell Hillary’s Fear,FAKE
1,1,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,2,Kerry to go to Paris in gesture of sympathy,REAL
3,3,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,4,The Battle of New York: Why This Primary Matters,REAL


In [36]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

def count_propernouns(string, model=nlp):
    # Create doc object
    doc = model(string)
    # POS tagging
    pos = [token.pos_ for token in doc]
    # return count of propernouns(PROPN)
    return (pos.count('PROPN'))

In [37]:
headlines['num_pronouns'] = headlines['title'].apply(count_propernouns)
headlines.head()

Unnamed: 0.1,Unnamed: 0,title,label,num_pronouns,num_nouns
0,0,You Can Smell Hillary’s Fear,FAKE,2,1
1,1,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE,10,0
2,2,Kerry to go to Paris in gesture of sympathy,REAL,2,2
3,3,Bernie supporters on Twitter erupt in anger ag...,FAKE,3,3
4,4,The Battle of New York: Why This Primary Matters,REAL,4,0


In [38]:
def count_nouns(string, model=nlp):
    # Create doc object
    doc = nlp(string)
    # POS tagging
    pos = [token.pos_ for token in doc]
    
    # Return count of nouns
    return (pos.count('NOUN'))

In [39]:
headlines['num_nouns'] = headlines['title'].apply(count_nouns)
headlines.head()

Unnamed: 0.1,Unnamed: 0,title,label,num_pronouns,num_nouns
0,0,You Can Smell Hillary’s Fear,FAKE,2,1
1,1,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE,10,0
2,2,Kerry to go to Paris in gesture of sympathy,REAL,2,2
3,3,Bernie supporters on Twitter erupt in anger ag...,FAKE,3,3
4,4,The Battle of New York: Why This Primary Matters,REAL,4,0


In [40]:
# Summarizing and comparing noun and propernoun usuage
fake_news_PROPN = headlines[headlines['label']=='FAKE']['num_pronouns'].mean()
real_news_PROPN = headlines[headlines['label']=='REAL']['num_pronouns'].mean()

fake_news_PROPN, real_news_PROPN

(4.813953488372093, 2.3684210526315788)

Fake news seems to use more pronouns, compared to real news.

In [41]:
# Summarizing and comparing noun and propernoun usuage
fake_news_NOUN = headlines[headlines['label']=='FAKE']['num_nouns'].mean()
real_news_NOUN = headlines[headlines['label']=='REAL']['num_nouns'].mean()

fake_news_NOUN, real_news_NOUN

(1.6046511627906976, 2.3859649122807016)

Fake news seems to use lesser nouns, compared to real news. 

# Named Entity Recognition

In [42]:
# Load the required model
nlp = spacy.load('en_core_web_sm')

# Create a Doc instance 
text = 'Sundar Pichai is the CEO of Google. Its headquarters is in Mountain View.'
doc = nlp(text)

# Print all named entities and their labels
for ent in doc.ents:
    print(ent.text, ent.label_)

Sundar Pichai PERSON
Google ORG
Mountain View GPE


## Identifying people mentioned in news article

In [43]:
tc = "\nIt’s' been a busy day for Facebook  exec op-eds. Earlier this morning, Sheryl Sandberg broke the site’s silence around the Christchurch massacre, and now Mark Zuckerberg is calling on governments and other bodies to increase regulation around the sorts of data Facebook traffics in. He’s hoping to get out in front of heavy-handed regulation and get a seat at the table shaping it.\n"

In [44]:
def find_persons(text):
  # Create Doc object
  doc = nlp(text)
  
  # Identify the persons
  persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
  
  # Return persons
  return persons

print(find_persons(tc))

['Facebook', 'Sheryl Sandberg', 'Mark Zuckerberg', 'Facebook']
