# Playing with NLTK
## Dataset: COVID Fake News Data.csv

In [15]:
# importing libraries
import pandas as pd
import nltk
import numpy as np 
import matplotlib.pyplot as plt

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.text import Text
from nltk import FreqDist


In [16]:
data = pd.read_csv("COVID Fake News Data.csv")
data.columns

Index(['headlines', 'outcome'], dtype='object')

<p> Two headers: headlines, outcome (binary classification, real (1) or fake (0) </p>

In [17]:
data.shape

(10201, 2)

<p> There are 10201 news headlines total in this dataset. </p>

In [18]:
fake = data[data["outcome"] == 0]
fake.shape

(9727, 2)

In [19]:
real = data[data["outcome"] == 1]
real.shape

(474, 2)

In [20]:
data["outcome"].value_counts(normalize=True)

0    0.953534
1    0.046466
Name: outcome, dtype: float64

<p> 95% of the headlines in this dataset are classified as fake. </p>


In [21]:
data["headlines"].head()

0    A post claims compulsory vacination violates t...
1    A photo claims that this person is a doctor wh...
2    Post about a video claims that it is a protest...
3    All deaths by respiratory failure and pneumoni...
4    The dean of the College of Biologists of Euska...
Name: headlines, dtype: object

<p> Somehow explore key words in fake news headlines. </p>

### Playing around with NLTK

In [22]:
line = fake["headlines"]
line = [l for l in line]
line = " ".join(line)

# sent_tokenizer splits a string into sentences
# here, there is only one sentence, not needed
# word_tokenizer splits a string by word

# however, punctuation is also considered to be a word, and apostrophesized (?) words are split at the '


In [23]:
# filtering stop words
# stop words are words you want to ignore
# common stop words are "in". "is", "an"

words = word_tokenize(line)

# creating set of stopwords to filter
# nltk.download("stopwords")
stopwords = set(stopwords.words("english"))

# creating an empty list to hold the words that make it past the filter

filtered = []

# filtering stopwords, use casefold to ignore case

for word in words:
    if word.casefold() not in stopwords:
        filtered.append(word)

filtered


['post',
 'claims',
 'compulsory',
 'vacination',
 'violates',
 'principles',
 'bioethics',
 ',',
 'coronavirus',
 'doesnâ€™t',
 'exist',
 ',',
 'PCR',
 'test',
 'returns',
 'many',
 'false',
 'positives',
 ',',
 'influenza',
 'vaccine',
 'related',
 'COVID-19',
 '.',
 'photo',
 'claims',
 'person',
 'doctor',
 'died',
 'attending',
 'many',
 'COVID-19',
 'patinents',
 'Hospital',
 'MuÃ±iz',
 'Buenos',
 'Aires',
 '.',
 'Post',
 'video',
 'claims',
 'protest',
 'confination',
 'town',
 'Aranda',
 'de',
 'Duero',
 '(',
 'Burgos',
 ')',
 'deaths',
 'respiratory',
 'failure',
 'pneumonia',
 'registered',
 'COVID-19',
 ',',
 'according',
 'Civil',
 'Registry',
 'website',
 '.',
 'dean',
 'College',
 'Biologists',
 'Euskadi',
 'states',
 'lot',
 'PCR',
 'false',
 'positives',
 'asymptomatic',
 'donâ€™t',
 'spread',
 'coronavirus',
 '.',
 'Households',
 'COVID-19',
 'patients',
 'Porto',
 'Alegre',
 ',',
 'Campo',
 'Grande',
 'Santo',
 'AntÃ´nio',
 'da',
 'Platina',
 'must',
 'put',
 'red',
 

<p> However, keep in mind that some stopwords actually give proper context to the sentence. To bypass this, can edit the list of stopwords to make any necessary changes, then download it. </p>

In [24]:
# parts of speech
# grammatical term that deals with the roles words play when you use the together in sentences

nltk.pos_tag(words)



[('A', 'DT'),
 ('post', 'NN'),
 ('claims', 'VBZ'),
 ('compulsory', 'NN'),
 ('vacination', 'NN'),
 ('violates', 'VBZ'),
 ('the', 'DT'),
 ('principles', 'NNS'),
 ('of', 'IN'),
 ('bioethics', 'NNS'),
 (',', ','),
 ('that', 'IN'),
 ('coronavirus', 'NN'),
 ('doesnâ€™t', 'NNS'),
 ('exist', 'VBP'),
 (',', ','),
 ('that', 'IN'),
 ('the', 'DT'),
 ('PCR', 'NNP'),
 ('test', 'NN'),
 ('returns', 'NNS'),
 ('many', 'JJ'),
 ('false', 'JJ'),
 ('positives', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('that', 'IN'),
 ('influenza', 'JJ'),
 ('vaccine', 'NN'),
 ('is', 'VBZ'),
 ('related', 'VBN'),
 ('to', 'TO'),
 ('COVID-19', 'NNP'),
 ('.', '.'),
 ('A', 'NNP'),
 ('photo', 'NN'),
 ('claims', 'VBZ'),
 ('that', 'IN'),
 ('this', 'DT'),
 ('person', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('doctor', 'NN'),
 ('who', 'WP'),
 ('died', 'VBD'),
 ('after', 'IN'),
 ('attending', 'VBG'),
 ('to', 'TO'),
 ('too', 'RB'),
 ('many', 'JJ'),
 ('COVID-19', 'JJ'),
 ('patinents', 'NNS'),
 ('in', 'IN'),
 ('Hospital', 'NNP'),
 ('MuÃ±iz', 'N

<p>  All words in quote are now in a separate tuple, with a tag that represents their part of speech. </p>

<ul>
<li> JJ - adjectives   </li>
<li> NN - nouns </li>
<li> RB - adverbs </li>
<li> PRP - pronouns </li>
<li> VB - verb </li>
</ul>

In [None]:
# lemmatizing
# reducing a word to its root

# create a lemmatizer

lemma = WordNetLemmatizer()

# example, lemmatizing a plural noun

lemma.lemmatize("scarves")

In [None]:
lemma_words = [lemma.lemmatize(word) for word in words]
lemma_words

In [None]:
# what if you lemmatized a word that looked very different from its lemma?

lemma.lemmatize("worst")

In [None]:
# this happens because the default is to assume noun
# can indicate that you want "worst" to be a adjective

lemma.lemmatize("worst", pos="a")

<p> Thus, we can combine tagging parts of speech with lemmatizing. </p>

In [None]:
# chunking allows you to identify phrases (a group of words that function as a grammatical unit)
# nltk.download("averaged_perceptron_tagger")

pos_words = nltk.pos_tag(words)
pos_words

In [None]:
# need to define a chunk grammar
# chunk grammar == combination of rules on how sentences should be chunked, uses regex

grammar = "NP: {<DT>?<JJ>*<NN>}"
# --> starts with an optional determiner (article)
# --> can have any number of adjectives
# --> end with a noun

# NP  == noun phrase

# create a chunk parser with this grammar

chunk_parser = nltk.RegexpParser(grammar)

tree = chunk_parser.parse(pos_words)

tree.draw()

In [None]:
# chinking
# used together with chunking -- chinking is used to exclude a pattern

grammar = """
Chunk: {<.*>+}
        }<JJ>{"""
# {} indicate patterns you want to include
# }{ indicate patterns you want to exclude

# --> here, we include everything
# --> here, we exclude adjectives

chunk_parser = nltk.RegexpParser(grammar)

tree = chunk_parser.parse(pos_words)

tree.draw()

In [None]:
# Named Entity Recognition (NER)
# --> noun phrases that refer to specific locations, people, and organizations, etc.abs

# nltk.download("maxent_ne_chunker")
# nltk.download("words")

tree = nltk.ne_chunk(pos_words)

tree.draw()


In [None]:
# can also use the parameter binary=True if you want to know what the named entities are but not what kind

tree = nltk.ne_chunk(pos_words, binary=True)

tree.draw()

In [None]:
# can extract entities directly from text

def extract_ne(quote):
    words = word_tokenize(quote, language="english")
    tags = nltk.pos_tag(words)
    tree = nltk.ne_chunk(tags, binary=True)
    return set(
        " ".join(i[0] for i in t) for t in tree if hasattr(t, "label") and t.label() == "NE"
    )

extract_ne(line)


In [3]:
# using a concordance
# --> when you use a concordance, you can see each time a word is used, along with its immediate context

line

test = Text(sample_words) # takes in a list of words (tokenized)
test.concordance("COVID-19")



NameError: name 'line' is not defined

In [None]:
# making a dispersion plot --> seeing how often a particular word appears and where

test.dispersion_plot(["COVID-19", "vaccine", "deaths", "respiratory"])

In [1]:
# frequency distribution



NameError: name 'freqdist' is not defined

In [25]:
# finding collocations
# --> a sequence of words that show up often

test.collocations()

NameError: name 'test' is not defined

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6f4d88f7-58fe-4c32-b979-abb694c94f1b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>