# 3-1. **Count-based Representation**

In [9]:
!pip install -q nltk

In [10]:
import re
import math
import numpy as np
from collections import Counter, defaultdict

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords as sw
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
corpus = [
    'Kim loves the NLP. The NLP hates Kim', # document 0
    'Kim hates the NLP' # document 1
]

Pre-process the documents

In [None]:
stop_words = sw.words('english')
lemmatizer = WordNetLemmatizer()

tokenized_docs = []
for doc_id, doc in enumerate(corpus):
    print('doc_id:', doc_id)
    # remove punctuations
    puct_removed = re.sub(r'[^\w\s]','',doc)
    print('punctuations removed:', puct_removed)
    # tokenize words
    tokenized = word_tokenize(puct_removed)
    print('tokenized:', tokenized)
    # case folding
    lowered = [t.lower() for t in tokenized]
    print('case-folded:', lowered)
    # lemmatization
    lemmatized = [lemmatizer.lemmatize(t) for t in lowered]
    print('lemmatized:', lemmatized)
    # stop word removal
    tokenized_doc = [w for w in lemmatized if not w in stop_words]
    print('stopwords removed:', tokenized_doc)
    tokenized_docs.append(tokenized_doc)

doc_id: 0
punctuations removed: Kim loves the NLP The NLP hates Kim
tokenized: ['Kim', 'loves', 'the', 'NLP', 'The', 'NLP', 'hates', 'Kim']
case-folded: ['kim', 'loves', 'the', 'nlp', 'the', 'nlp', 'hates', 'kim']
lemmatized: ['kim', 'love', 'the', 'nlp', 'the', 'nlp', 'hate', 'kim']
stopwords removed: ['kim', 'love', 'nlp', 'nlp', 'hate', 'kim']
doc_id: 1
punctuations removed: Kim hates the NLP
tokenized: ['Kim', 'hates', 'the', 'NLP']
case-folded: ['kim', 'hates', 'the', 'nlp']
lemmatized: ['kim', 'hate', 'the', 'nlp']
stopwords removed: ['kim', 'hate', 'nlp']


In [None]:
tokenized_docs

[['kim', 'love', 'nlp', 'nlp', 'hate', 'kim'], ['kim', 'hate', 'nlp']]

## One-hot encoding

In [None]:
token_set = set()
for tokenized_doc in tokenized_docs:
    for term in tokenized_doc:
        token_set.add(term)

In [None]:
vocab = {word : index for index, word in enumerate(token_set)}
vocab

{'love': 0, 'hate': 1, 'nlp': 2, 'kim': 3}

In [None]:
one_hot_vectors = {}
for word, index in vocab.items():
    one_hot_vector = [0]*(len(vocab))
    one_hot_vector[index] = 1
    one_hot_vectors[word] = one_hot_vector
one_hot_vectors

{'love': [1, 0, 0, 0],
 'hate': [0, 1, 0, 0],
 'nlp': [0, 0, 1, 0],
 'kim': [0, 0, 0, 1]}

## Bag-of-Words

In [None]:
bow_docs = {}
for doc_id, tokenized_doc in enumerate(tokenized_docs):
    counter = Counter(tokenized_doc)
    bow_doc = {}
    for word in vocab:
        bow_doc[word] = counter.get(word, 0)
    bow_docs[doc_id] = bow_doc
bow_docs

{0: {'love': 1, 'hate': 1, 'nlp': 2, 'kim': 2},
 1: {'love': 0, 'hate': 1, 'nlp': 1, 'kim': 1}}

## TFIDF (Term Frequency Inverse Document Frequency)

**Document Frequency (DF)**

- *df(t) = occurrence of t in documents*

- idf(t) = log(N/(df + 1))



In [None]:
dfs = defaultdict(int)
for tokenized_doc in tokenized_docs:
    # get each unique word in the doc - we need to know whether the word is appeared in the document
    for term in np.unique(tokenized_doc):
        dfs[term] +=1
dfs

defaultdict(int, {'hate': 2, 'kim': 2, 'love': 1, 'nlp': 2})

**TF-IDF calculation**

In [None]:
N = len(tokenized_docs)

tf_idfs = {}
for doc_id, tokenized_doc in enumerate(tokenized_docs):
    counter = Counter(tokenized_doc)
    total_num_words = len(tokenized_doc)
    for term in np.unique(tokenized_doc):
        tf = counter.get(term, 0)/total_num_words
        df = dfs[term]
        idf = math.log(N/(df+1))+1 # add 1 not to be negative
        tf_idfs[doc_id, term] = round(tf*idf, 3)
tf_idfs

{(0, 'hate'): 0.099,
 (0, 'kim'): 0.198,
 (0, 'love'): 0.167,
 (0, 'nlp'): 0.198,
 (1, 'hate'): 0.198,
 (1, 'kim'): 0.198,
 (1, 'nlp'): 0.198}

**Sort by the importance - Descending Order**

In [None]:
#sort the dictionary based on values
sorted_list = sorted(tf_idfs.items(), key=lambda x: x[1], reverse=True)
sorted_list

[((0, 'kim'), 0.198),
 ((0, 'nlp'), 0.198),
 ((1, 'hate'), 0.198),
 ((1, 'kim'), 0.198),
 ((1, 'nlp'), 0.198),
 ((0, 'love'), 0.167),
 ((0, 'hate'), 0.099)]

# Exercise

**Write a function which returns the top N (e.g. 10 or 20) words with the largest tf value and with the largest tfidf values for a paragraph of corpus (Wikipedia page).**

In [1]:
!pip install -q wikipedia
import wikipedia

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone


In [26]:
animals = ['Kangaroo', 'Quokka', 'Capybara']
corpus = [wikipedia.page(animal).content for animal in animals]

In [27]:
# pass both the corpus and the number of words to focus into the function
# In the function, you may:
# - process the corpus (e.g. tokenization) - provided
# - calculate the tf/tfidf values for each unique words
# - sorting the words based on the tf/tfidf values
# - print out the top n tf words and tfidf words from the sorted list in parallel for comparison

def get_tf_and_idf(corpus, top_n):
    # Process the corpus (incl. tokenisation, lower_case, stopword removal)
    tokenized_docs = []
    for doc in corpus:
        puct_removed = re.sub(r'[^\w\s]','',doc)
        tokenized = word_tokenize(puct_removed)
        lowered = [t.lower() for t in tokenized]
        lemmatized = [lemmatizer.lemmatize(t) for t in lowered]
        tokenized_doc = [w for w in lemmatized if not w in stop_words]
        tokenized_docs.append(tokenized_doc)

    # Please complete this

get_tf_and_idf(corpus, 10)

#Expected Outcomes are as follows:

Total docs in corpus: 3

Top 10 of tf values:
(doc id, word): tf
(0, 'kangaroo') 0.051
(2, 'capybara') 0.05
(1, 'quokkas') 0.03
(1, 'island') 0.021
(1, 'quokka') 0.021
(2, 'male') 0.012
(0, 'male') 0.011
(1, 'ha') 0.011
(1, 'population') 0.011
(1, 'rottnest') 0.011

Top 10 of tfidf values:
(doc id, word): tf*idf
(2, 'capybara') 0.07
(0, 'kangaroo') 0.051
(1, 'quokkas') 0.042
(1, 'quokka') 0.03
(1, 'island') 0.021
(1, 'rottnest') 0.016
(1, 'mainland') 0.012
(2, 'male') 0.012
(0, 'male') 0.011
(1, 'small') 0.011
