In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Text Vectorization and Feature Engineering Assignment

In [34]:
!pip install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp37-cp37m-win_amd64.whl (24.2 MB)
Collecting Cython==0.29.14
  Downloading Cython-0.29.14-cp37-cp37m-win_amd64.whl (1.7 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-3.0.0.tar.gz (113 kB)
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py): started
  Building wheel for smart-open (setup.py): finished with status 'done'
  Created wheel for smart-open: filename=smart_open-3.0.0-py3-none-any.whl size=107102 sha256=6bafcc5f9486e3e103b52a2fffb810b0c202691131a0f4caab73ae7a15459106
  Stored in directory: c:\users\jlim7\appdata\local\pip\cache\wheels\83\a6\12\bf3c1a667bde4251be5b7a3368b2d604c9af2105b5c1cb1870
Successfully built smart-open
Installing collected packages: Cython, smart-open, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.15
    Uninstalling Cython-0.29.15:
      Successfully uninstalled Cython-0.29.15
Successfully installed Cython-0.29.14 gensim-

<IPython.core.display.Javascript object>

In [35]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

<IPython.core.display.Javascript object>

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [3]:
articles = [
    "http://lite.cnn.io/en/article/h_eac18760a7a7f9a1bf33616f1c4a336d",
    "http://lite.cnn.io/en/article/h_de3f82f17d289680dd2b47c6413ebe7c",
    "http://lite.cnn.io/en/article/h_72f4dc9d6f35458a89af014b62e625ad",
    "http://lite.cnn.io/en/article/h_aa21fe6bf176071cb49e09d422c3adf0",
    "http://lite.cnn.io/en/article/h_8ad34a532921c9076cdc9d7390d2f1bc",
    "http://lite.cnn.io/en/article/h_84422c79110d9989177cfaf1c5f45fe7",
    "http://lite.cnn.io/en/article/h_d010d9580abac3a44c6181ec6fb63d58",
    "http://lite.cnn.io/en/article/h_fb11f4e9d7c5323e75b337d9e9e5e368",
    "http://lite.cnn.io/en/article/h_7b27f0b131067f8ece6238ac559670ab",
    "http://lite.cnn.io/en/article/h_8cae7f735fa9573d470f802063ceffe2",
    "http://lite.cnn.io/en/article/h_72c3668280e82576fcc2602b0fa70c14",
    "http://lite.cnn.io/en/article/h_d20658fb0e20212051cda0e0a7248c8a",
    "http://lite.cnn.io/en/article/h_56611c43d7928120d2ae21666ccc7417",
    "http://lite.cnn.io/en/article/h_bda0394e3c5ee7054ee65c022bca7695",
]

<IPython.core.display.Javascript object>

In [4]:
def html_to_text(html):
    TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", "h7", "p", "li"]

    soup = BeautifulSoup(html, "lxml")
    text_list = [tag.get_text() for tag in soup.find_all(TAGS)]
    text = " ".join(text_list)
    return text

<IPython.core.display.Javascript object>

In [5]:
!mkdir news_articles

<IPython.core.display.Javascript object>

In [10]:
path = "./news_articles/"

for index, url in enumerate(articles):
    response = requests.get(url)
    html = response.text
    text = html_to_text(html)

    with open(path + "article_" + str(index) + ".txt", "wb") as f:
        f.write(text.encode())

<IPython.core.display.Javascript object>

In [11]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

DOC_PATTERN = r".*\.txt"
news_corpus = PlaintextCorpusReader("news_articles", DOC_PATTERN)

<IPython.core.display.Javascript object>

In [12]:
def corpus_stats(corpus):
    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(len(corpus.sents())))
    print("Number of words: " + str(len(corpus.words())))
    print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
    print(
        "Avg chars per word: " + str(round(len(corpus.raw()) / len(corpus.words()), 1))
    )
    print(
        "Avg words per sentence: "
        + str(round(len(corpus.words()) / len(corpus.sents()), 1))
    )


corpus_stats(news_corpus)

Corpus Statistics
Number of documents: 14
Number of paragraphs: 14
Number of sentences: 590
Number of words: 14159
Vocabulary: 2914
Avg chars per word: 5.0
Avg words per sentence: 24.0


<IPython.core.display.Javascript object>

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [13]:
docs = [news_corpus.raw(file_id) for file_id in news_corpus.fileids()]

<IPython.core.display.Javascript object>

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [14]:
# Word Tokenization
for doc in docs:
    word_token = word_tokenize(doc)
    print(word_token)

['Pink', 'taking', 'a', 'break', 'to', 'focus', 'on', 'family', '(', 'CNN', ')', '-', 'Pink', 'has', 'been', 'working', 'pretty', 'hard', 'and', 'it', 'sounds', 'like', 'she', 'will', 'be', 'taking', 'a', 'step', 'back', 'in', '2020', '.', 'Speaking', 'with', '``', 'Entertainment', 'Tonight', "''", 'on', 'the', 'Country', 'Music', 'Association', 'Awards', 'red', 'carpet', ',', 'the', 'singer', 'was', 'joined', 'by', 'her', 'husband', ',', 'Carey', 'Hart', ',', 'and', 'their', 'kids', 'Willow', ',', '8', ',', 'and', 'Jameson', ',', '2', '.', 'Pink', 'was', 'there', 'to', 'perform', 'her', 'song', '``', 'Love', 'Me', 'Anyway', "''", 'with', 'country', 'star', 'Chris', 'Stapleton', ',', 'and', 'she', 'talked', 'about', 'how', 'hectic', 'things', 'have', 'been', '.', '``', 'We', 'did', 'two', 'and', 'a', 'half', 'years', 'of', '[', 'music', ']', 'and', 'Willow', "'s", 'back', 'in', 'school', 'now', ',', 'Jameson', "'s", 'going', 'to', 'start', 'pre-school', 'soon', ',', "''", 'Pink', 'said

<IPython.core.display.Javascript object>

In [15]:
# Lowercase
lowercase = [token.lower() for token in word_token]
print(lowercase)



<IPython.core.display.Javascript object>

In [19]:
# Removing Stopwords
no_stopwords = [token for token in lowercase if not token in stopwords.words("english")]
print(no_stopwords)



<IPython.core.display.Javascript object>

In [20]:
# Remove Punctuation
no_punct = [token for token in no_stopwords if token.isalpha() == True]
print(no_punct)



<IPython.core.display.Javascript object>

In [22]:
import nltk

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jlim7\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

<IPython.core.display.Javascript object>

In [23]:
# Lemmatize
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token) for token in no_punct]
print(lemmatized)



<IPython.core.display.Javascript object>

In [24]:
# Stemming
stemmer = SnowballStemmer("english")
stemmed = [stemmer.stem(token) for token in no_punct]
print(stemmed)

['student', 'protest', 'fortifi', 'campus', 'occup', 'hong', 'kong', 'brace', 'violenc', 'hong', 'kong', 'cnn', 'threat', 'violenc', 'hung', 'hong', 'kong', 'thursday', 'even', 'thousand', 'student', 'protest', 'prepar', 'face', 'riot', 'polic', 'amid', 'continu', 'occup', 'sever', 'citi', 'prestigi', 'univers', 'level', 'unrest', 'destruct', 'almost', 'protest', 'movement', 'reach', 'new', 'unnerv', 'height', 'recent', 'day', 'sever', 'peopl', 'critic', 'injur', 'chines', 'state', 'media', 'warn', 'radic', 'protest', 'edg', 'doom', 'sporad', 'clash', 'broke', 'thursday', 'morn', 'polic', 'fire', 'tear', 'gas', 'protest', 'near', 'hong', 'kong', 'polytechn', 'univers', 'kowloon', 'hong', 'kong', 'island', 'road', 'surround', 'univers', 'hong', 'kong', 'block', 'protest', 'result', 'traffic', 'delay', 'univers', 'emerg', 'new', 'focal', 'point', 'protest', 'movement', 'numer', 'campus', 'across', 'citi', 'becom', 'home', 'heavili', 'fortifi', 'temporari', 'protest', 'camp', 'chines', 'u

<IPython.core.display.Javascript object>

### Count vectorize the preprocessed documents.

In [29]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(docs)

count = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())

<IPython.core.display.Javascript object>

### One hot vectorize the preprocessed documents.

In [30]:
vectorizer = CountVectorizer(binary=True)
vectors = vectorizer.fit_transform(docs)

one_hot = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())

<IPython.core.display.Javascript object>

### TF-IDF vectorize the preprocessed documents.

In [32]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(docs)

tfidf = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())

<IPython.core.display.Javascript object>

### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [39]:
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]

model = Doc2Vec(tagged_documents)

doc2vec = pd.DataFrame(
    [[document] + list(model[document]) for document in range(len(tagged_documents))]
).drop(0, axis=1)

<IPython.core.display.Javascript object>