# Word counts with bag-of-words
- Basic method for finding topics in a text
- Need to first create tokens using tokenization
- ... and then count up all the tokens
- **The more frequent a word, the more important it might be**

In [2]:
import zipfile

zip_path = "datasets/News%20articles.zip"
destination_path = "datasets/news_article"

# Unpack every member of the archive into the destination folder.
# The context manager closes the archive once extraction is done.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=destination_path)


In [33]:
file_path = "datasets/news_article/News articles/articles.txt"

# Read the whole article file into a single string: article
# The encoding is pinned because the text contains non-ASCII punctuation
# (e.g. the curly apostrophe ’). Without it, open() falls back to the
# platform's locale encoding and may mis-decode or fail on other machines.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    article = file.read()

In [34]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Counter tallies how often each token occurs
from collections import Counter

# Split the raw article text into tokens: tokens
tokens = word_tokenize(article)

# Normalise case so differently-cased spellings are counted together: lower_tokens
lower_tokens = list(map(str.lower, tokens))

# Simple bag-of-words (token -> number of occurrences): bow_simple
bow_simple = Counter()
bow_simple.update(lower_tokens)

# Show the ten most frequent tokens together with their counts
top_ten = bow_simple.most_common(10)
print(top_ten)

[('the', 274), (',', 269), ('.', 189), ('to', 131), ('of', 119), ('a', 100), ('in', 99), ('and', 80), ('that', 67), ('’', 54)]


[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Simple text preprocessing
- Tokenization to create bag-of-words
- Lowercasing words
- Lemmatization/Stemming -> shorten words to their root stems
- Removing stop words, punctuation, and other unwanted tokens

- Remove stop words and non-alphabetic characters, lemmatize, and perform a new bag-of-words on your cleaned text.

You start with the same tokens you created in the last exercise: `lower_tokens`. 

In [4]:
file_path = 'datasets/english_stopwords.txt'

# Load the raw contents of the stop-word file as one string: english_stop
with open(file_path, mode='r') as stopword_file:
    english_stop = stopword_file.read()

In [7]:
# Creating a list of stop_words (one entry per line of the file).
# Guarded so the cell can be re-run: once english_stop is already a list,
# calling .split on it again would raise AttributeError.
if isinstance(english_stop, str):
    english_stop = english_stop.split('\n')

In [11]:
# Import WordNetLemmatizer and download WordNet resource
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Retain alphabetic words: alpha_only
# (str.isalpha drops punctuation, numbers and mixed tokens)
alpha_only = [t for t in lower_tokens if t.isalpha()]

# Remove all stop words: no_stops
# The stop words are copied into a set once so that each membership test is a
# constant-time lookup instead of a scan over the whole list per token.
stop_word_set = set(english_stop)
no_stops = [t for t in alpha_only if t not in stop_word_set]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# Create the bag-of-words from the cleaned, lemmatized tokens: bow
bow = Counter(lemmatized)

# Print the 10 most common tokens
print(bow.most_common(10))

[nltk_data] Downloading package omw-1.4 to /home/repl/nltk_data...
[nltk_data] Downloading package wordnet to /home/repl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[('said', 29), ('robot', 28), ('population', 22), ('news', 19), ('human', 16), ('growth', 16), ('fake', 15), ('country', 14), ('united', 14), ('machine', 13)]


# Introduction to gensim
- Popular open source NLP library
- Uses top academic models to perform complex tasks:
    - Building document or word vectors
    - Performing topic identification or document comparison

## Word Vectors

![image-2](image-2.png)

With these vectors, we can see relationships among words or documents based on how near or far apart they are, and also which similar comparisons we find. For example, in this graphic we can see that the vector operation king minus queen is approximately equal to man minus woman, or that Spain is to Madrid as Italy is to Rome.

In [35]:
# Creating the list of documents (documents are separated by two blank lines).
# Guarded so the cell can be re-run: after the first run `article` is already
# a list, and calling .split on a list would raise AttributeError.
if isinstance(article, str):
    article = article.split("\n\n\n")

In [38]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenize each document in the list: tokenized_documents
# A comprehension is used instead of an append loop so that this cell does not
# overwrite the article-level `tokens` variable created earlier, nor leave a
# stray loop variable behind in the notebook namespace.
tokenized_documents = [word_tokenize(document) for document in article]

In [39]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Build the mapping between tokens and integer ids from all tokenized articles: dictionary
dictionary = Dictionary(documents=tokenized_documents)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f01d8773f40>

In [42]:
# Look up the integer id assigned to the token "computer": computer_id
token_ids = dictionary.token2id
computer_id = token_ids.get("computer")
print(computer_id)

# Map the id back to its token to confirm the round trip
print(dictionary.get(computer_id))

825
computer


In [45]:
# Build the bag-of-words corpus: corpus
# Each tokenized document is converted into a list of (token_id, count) pairs.
# Note: this is a plain Python list, not a gensim MmCorpus object.
corpus = list(map(dictionary.doc2bow, tokenized_documents))

# Print the first 5 (token_id, count) pairs of the fifth document
print(corpus[4][:5])

[(2, 1), (14, 2), (18, 1), (27, 1), (50, 1)]


# Gensim bag-of-words

In [46]:
from collections import defaultdict
# defaultdict: if a key is not found in the dictionary, then instead of a
# KeyError being thrown, a new entry is created for that key. The value of the
# new entry is produced by calling the factory passed as the argument of
# defaultdict (e.g. defaultdict(int) starts missing keys at 0).
# Written as comments rather than a bare string literal so the cell does not
# echo the text back as its output.

'defaultdict means that if a key is not found in the dictionary,\nthen instead of a KeyError being thrown, a new entry is created. \nThe type of this new entry is given by the argument of defaultdict'