# Word counts with bag-of-words
- Basic method for finding topics in a text
- Need to first create tokens using tokenization
- ... and then count up all the tokens
- **The more frequent a word, the more important it might be**

In [2]:
import zipfile

# Archive to unpack and the folder that receives its contents
zip_path = "datasets/News%20articles.zip"
destination_path = "datasets/news_article"

# Unpack every member of the archive; the context manager closes it afterwards
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=destination_path)


In [1]:
file_path = "datasets/news_article/News articles/articles.txt"

# Read the whole article file into a single string: article.
# The encoding is given explicitly because the text contains non-ASCII
# characters (e.g. the ’ apostrophe); relying on the platform default
# encoding could garble them or raise a decode error on other machines.
with open(file_path, 'r', encoding='utf-8') as file:
    article = file.read()

In [2]:
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt' tokenizer data
nltk.download('punkt')

# Split the raw article text into tokens: tokens
tokens = word_tokenize(article)

# Lowercase every token so that counts are case-insensitive: lower_tokens
lower_tokens = list(map(str.lower, tokens))

# Count how often each lowercase token occurs: bow_simple
bow_simple = Counter(lower_tokens)

# Show the 10 most frequent tokens with their counts
top_ten = bow_simple.most_common(10)
print(top_ten)

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[('the', 274), (',', 269), ('.', 189), ('to', 131), ('of', 119), ('a', 100), ('in', 99), ('and', 80), ('that', 67), ('’', 54)]


# Simple text preprocessing
- Tokenization to create bag-of-words
- Lowercasing words
- Lemmatization/Stemming -> shorten words to their root stems
- Removing stop words, punctuation, and other unwanted tokens

- Remove stop words and non-alphabetic characters, lemmatize, and perform a new bag-of-words on your cleaned text.

You start with the same tokens you created in the last exercise: `lower_tokens`. 

In [3]:
file_path = 'datasets/english_stopwords.txt'

# Read the raw stop-word file into a single string: english_stop.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    english_stop = file.read()

In [4]:
# Creating a list of stop_words (one word per line of the raw file text).
# The isinstance guard keeps this cell re-runnable: once english_stop has been
# converted to a list, running the cell again is a no-op instead of raising
# AttributeError (a list has no string-splitting methods).
if isinstance(english_stop, str):
    # splitlines() handles both \n and \r\n line endings; stripping each entry
    # and dropping blanks avoids an empty-string "stop word" caused by a
    # trailing newline or blank line in the file.
    english_stop = [word.strip() for word in english_stop.splitlines() if word.strip()]

In [5]:
# Import WordNetLemmatizer and download the WordNet resources
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('omw-1.4')
nltk.download('wordnet')

# Keep only purely alphabetic tokens: alpha_only
alpha_only = list(filter(str.isalpha, lower_tokens))

# Drop every token that appears in the stop-word list: no_stops
no_stops = [token for token in alpha_only if token not in english_stop]

# Lemmatizer used to reduce each remaining token to its lemma
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize the filtered tokens into a new list: lemmatized
lemmatized = list(map(wordnet_lemmatizer.lemmatize, no_stops))

# Bag-of-words over the cleaned tokens: bow
bow = Counter(lemmatized)

# Show the 10 most frequent cleaned tokens with their counts
print(bow.most_common(10))

[nltk_data] Downloading package omw-1.4 to /home/repl/nltk_data...
[nltk_data] Downloading package wordnet to /home/repl/nltk_data...


[('said', 29), ('robot', 28), ('population', 22), ('news', 19), ('human', 16), ('growth', 16), ('fake', 15), ('country', 14), ('united', 14), ('machine', 13)]


# Introduction to gensim
- Popular open source NLP library
- Uses top academic models to perform complex tasks:
    - Building document or word vectors
    - Performing topic identification or document comparison

## Word Vectors

![image-2](image-2.png)

A word embedding or vector is trained from a larger corpus and is a multi-dimensional representation of a word or document. You can think of it as a multi-dimensional array normally with sparse features (lots of zeros and some ones). With these vectors, we can then see relationships among the words or documents based on how near or far they are and also what similar comparisons we find. For example, in this graphic we can see that the vector operation king minus queen is approximately equal to man minus woman. Or that Spain is to Madrid as Italy is to Rome. 

In [6]:
# Creating the list of documents: the article text is cut wherever two blank
# lines (three consecutive newlines) occur.
# `article` is rebound from a string to a list here, so the isinstance guard
# keeps the cell re-runnable: a second run is a no-op instead of raising
# AttributeError on a list.
if isinstance(article, str):
    article = article.split("\n\n\n")

In [40]:
import nltk
from nltk.tokenize import word_tokenize


def preprocess_document(text, stop_words, lemmatizer):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, tokenize it, keep only alphabetic
    tokens, drop stop words, then lemmatize what remains.

    Args:
        text: Raw document string.
        stop_words: Container of words to remove (only membership is tested).
        lemmatizer: Object exposing ``lemmatize(token)``.

    Returns:
        List of lemmatized tokens in their original document order.
    """
    kept = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    return [lemmatizer.lemmatize(token) for token in kept]


# Built once and shared by every document instead of once per loop iteration
wordnet_lemmatizer = WordNetLemmatizer()

# A set gives constant-time membership tests; english_stop itself is untouched
stop_word_set = set(english_stop)

# One cleaned token list per document. Doing the work inside a function keeps
# the intermediate lists local, so the notebook-level lower_tokens, alpha_only,
# no_stops and lemmatized computed for the full article are no longer
# overwritten by the values of the last document.
tokenized_documents = [
    preprocess_document(doc, stop_word_set, wordnet_lemmatizer) for doc in article
]

In [41]:
# Peek at the first 3 tokenized documents (each one is a list of cleaned tokens)
tokenized_documents[:3]

[['copyright',
  'epa',
  'image',
  'caption',
  'uber',
  'criticised',
  'many',
  'time',
  'way',
  'run',
  'business'],
 ['firm', 'uber', 'facing', 'criminal', 'investigation', 'u', 'government'],
 ['scrutiny',
  'started',
  'firm',
  'accused',
  'using',
  'secret',
  'software',
  'let',
  'operate',
  'region',
  'banned',
  'restricted']]

In [42]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
# (the input is the list of per-document token lists, tokenized_documents)
dictionary = Dictionary(tokenized_documents)
# Bare last expression: the notebook echoes the object's repr as the cell output
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f536d606b50>

The **Dictionary** class creates a mapping that assigns an id to each token, which is the first step in building a corpus. We can now represent a whole document using just a list of its token ids and how often each of those tokens appears in the document.

In [43]:
# token2id attribute: shows every token together with its integer id
dictionary.token2id

{'business': 0,
 'caption': 1,
 'copyright': 2,
 'criticised': 3,
 'epa': 4,
 'image': 5,
 'many': 6,
 'run': 7,
 'time': 8,
 'uber': 9,
 'way': 10,
 'criminal': 11,
 'facing': 12,
 'firm': 13,
 'government': 14,
 'investigation': 15,
 'u': 16,
 'accused': 17,
 'banned': 18,
 'let': 19,
 'operate': 20,
 'region': 21,
 'restricted': 22,
 'scrutiny': 23,
 'secret': 24,
 'software': 25,
 'started': 26,
 'using': 27,
 'called': 28,
 'greyball': 29,
 'helped': 30,
 'identify': 31,
 'official': 32,
 'running': 33,
 'seeking': 34,
 'service': 35,
 'stop': 36,
 'agency': 37,
 'comment': 38,
 'declined': 39,
 'news': 40,
 'reported': 41,
 'reuters': 42,
 'spokesman': 43,
 'approval': 44,
 'area': 45,
 'claimed': 46,
 'including': 47,
 'oregon': 48,
 'portland': 49,
 'ride': 50,
 'several': 51,
 'still': 52,
 'used': 53,
 'blocked': 54,
 'booking': 55,
 'bid': 56,
 'company': 57,
 'driver': 58,
 'illegally': 59,
 'operating': 60,
 'passenger': 61,
 'posed': 62,
 'prove': 63,
 'regulation': 64,
 

In [44]:
# Select the id for "computer": computer_id
# (looked up with .get on the token2id mapping shown in the previous cell)
computer_id = dictionary.token2id.get("computer") 
print(computer_id)

# Use computer_id with the dictionary to print the word
# (reverse lookup: id -> token, which should give back "computer")
print(dictionary.get(computer_id))

351
computer


## Using the **_dictionary_** to create a **_Gensim corpus_**
- Gensim uses a simple bag-of-words model which transforms each document into a bag of words using the token ids and the frequency of each token in the document.

In [45]:
# Build the bag-of-words corpus: corpus
# (a plain Python list with one doc2bow result per tokenized document)
corpus = list(map(dictionary.doc2bow, tokenized_documents))

# Print the first 5 (word id, frequency) pairs of the fifth document
fifth_document = corpus[4]
print(fifth_document[:5])

[(9, 1), (15, 1), (37, 1), (38, 1), (39, 1)]


A Gensim corpus is a list of lists, with each inner list representing one document. Each document is a series of tuples: the first item is the token id from the dictionary and the second item is the token's frequency in the document.

# Gensim bag-of-words

In [46]:
from collections import defaultdict
# NOTE: the triple-quoted text below is a bare string expression, not a comment.
# Because it is the last expression in the cell, the notebook echoes it back
# as the cell's output.
"""defaultdict means that if a key is not found in the dictionary,
then instead of a KeyError being thrown, a new entry is created. 
The type of this new entry is given by the argument of defaultdict"""

'defaultdict means that if a key is not found in the dictionary,\nthen instead of a KeyError being thrown, a new entry is created. \nThe type of this new entry is given by the argument of defaultdict'

In [50]:
# Pick the 66th document (index 65) from the corpus: doc
doc = corpus[65]
print(doc)

# Blank line between the unsorted and the sorted listing
print()

# Order the (word id, count) pairs by count, highest first: bow_doc
# (list.sort is stable, so pairs with equal counts keep their original order)
bow_doc = list(doc)
bow_doc.sort(key=lambda pair: pair[1], reverse=True)
print(bow_doc)

[(28, 1), (35, 1), (45, 1), (75, 1), (234, 1), (235, 1), (353, 1), (369, 1), (378, 2), (415, 1), (416, 1), (441, 1), (568, 2), (660, 1), (661, 1), (663, 1), (664, 1), (665, 1), (666, 1), (667, 1), (668, 1), (669, 1), (670, 2), (671, 1), (672, 1), (673, 1), (674, 1), (675, 1), (676, 1), (677, 1), (678, 1)]

[(378, 2), (568, 2), (670, 2), (28, 1), (35, 1), (45, 1), (75, 1), (234, 1), (235, 1), (353, 1), (369, 1), (415, 1), (416, 1), (441, 1), (660, 1), (661, 1), (663, 1), (664, 1), (665, 1), (666, 1), (667, 1), (668, 1), (669, 1), (671, 1), (672, 1), (673, 1), (674, 1), (675, 1), (676, 1), (677, 1), (678, 1)]


**Note:**

The syntax of sorted: sorted(iterable, key=None, reverse=False)

- key : A function that serves as a key for the sort comparison.

In [51]:
# Show the 5 highest-count words of the document, each followed by its count
for word_id, word_count in bow_doc[:5]:
    # Translate the id back to its token before printing
    print(f"{dictionary.get(word_id)} {word_count}")

traditional 2
project 2
part 2
called 1
service 1


In [52]:
# Import the itertools module
import itertools

# total_word_count maps each token id (word_id) to the sum of its counts
# (word_count) over all documents; unseen ids start at 0 thanks to defaultdict(int)
total_word_count = defaultdict(int)

# Walk every (word_id, word_count) pair of every document as one flat stream
for word_id, word_count in itertools.chain(*corpus):
    total_word_count[word_id] = total_word_count[word_id] + word_count

In [53]:
# Turn the totals into a list of (word id, total count) pairs ordered by
# total count, highest first: sorted_word_count
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda pair: pair[1], reverse=True)

# Show the 5 most frequent words across all documents with their totals
for word_id, word_count in sorted_word_count[:5]:
    print(f"{dictionary.get(word_id)} {word_count}")

said 29
robot 28
population 22
news 19
human 16


# Tf-idf with gensim

TF-IDF Formula: 

![image-3](image-3.png)

The weight of token i in document j is calculated by taking the term frequency (or how many times the token appears in the document) multiplied by the log of the total number of documents divided by the number of documents that contain the same term.

In [56]:
from gensim.models.tfidfmodel import TfidfModel

# Passing corpus to TfidfModel : tfidf
# (corpus is the list of bag-of-words documents built with dictionary.doc2bow)
tfidf = TfidfModel(corpus)
# Bare last expression: the notebook echoes the model's repr as the cell output
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x7f536d206280>

In [60]:
# Index the tfidf model with a bag-of-words document, like a dictionary key,
# to get that document's (token id, weight) pairs — here the 77th document (index 76)
tfidf[corpus[76]]

[(35, 0.08402491269879196),
 (88, 0.08402491269879196),
 (93, 0.11013281577999172),
 (122, 0.09439267703990184),
 (127, 0.07303463961632564),
 (152, 0.07628112737462436),
 (162, 0.07303463961632564),
 (163, 0.11013281577999172),
 (164, 0.2191039188489769),
 (171, 0.08877477835641555),
 (193, 0.1348334818656787),
 (202, 0.09439267703990184),
 (278, 0.05325710938059205),
 (305, 0.11013281577999172),
 (312, 0.10126842933820669),
 (350, 0.10126842933820669),
 (403, 0.09439267703990184),
 (405, 0.08877477835641555),
 (433, 0.1226264667617829),
 (597, 0.11013281577999172),
 (643, 0.11013281577999172),
 (663, 0.08877477835641555),
 (715, 0.11013281577999172),
 (724, 0.11013281577999172),
 (734, 0.09439267703990184),
 (766, 0.10126842933820669),
 (792, 0.1226264667617829),
 (797, 0.09439267703990184),
 (802, 0.08877477835641555),
 (844, 0.22026563155998344),
 (859, 0.2831780311197055),
 (874, 0.14398450418535907),
 (875, 0.14398450418535907),
 (876, 0.14398450418535907),
 (877, 0.1439845041853

For the 77th document in our corpus, we see the token ids along with their tf-idf weights.

In [61]:
# Order the 77th document's (term id, weight) pairs by weight, highest first:
# sorted_tfidf_weights
sorted_tfidf_weights = list(tfidf[corpus[76]])
sorted_tfidf_weights.sort(key=lambda pair: pair[1], reverse=True)

# Show the 5 highest-weighted words with their weights
for term_id, weight in sorted_tfidf_weights[:5]:
    print(f"{dictionary.get(term_id)} {weight}")

job 0.2831780311197055
generation 0.2452529335235658
automation 0.22026563155998344
machine 0.2191039188489769
authority 0.14398450418535907
