In [90]:
import requests
url = 'https://gitlab.com/tangibleai/nlpia2/-/raw/main/src/nlpia2/ch03/bias_discrimination.txt'

bias_discrimination = requests.get(url).text

url = 'https://gitlab.com/tangibleai/nlpia2/-/raw/main/src/nlpia2/ch03/bias_intro.txt'
response = requests.get(url)

bias_intro = requests.get(url).text

In [91]:
import spacy
nlp = spacy.load('en_core_web_sm')

intro_tokens = [token.text for token in nlp(bias_intro)]
disc_tokens = [token.text for token in nlp(bias_discrimination)]

intro_total = len(intro_tokens)
disc_total = len(disc_tokens)

print(f'Intro tokens: {intro_total}')
print(f'Disc tokens: {disc_total}')

Intro tokens: 498
Disc tokens: 454


In [92]:
from collections import Counter


intro_tf = {}
disc_tf = {}
intro_counts = Counter(intro_tokens)
intro_tf['bias'] = intro_counts['bias'] / intro_total
disc_counts = Counter(disc_tokens)
disc_tf['bias'] = disc_counts['bias'] / disc_total

print('TF of "bias" in intro: {:.4f}'.format(intro_tf['bias']))
print('TF of "bias" in disc: {:.4f}'.format(disc_tf['bias']))
print('"bias" appears about {:.2f} times more in the intro than in the text'.format(intro_tf['bias'] / disc_tf['bias']))

TF of "bias" in intro: 0.0120
TF of "bias" in disc: 0.0022
"bias" appears about 5.47 times more in the intro than in the text


The term "bias" appears more in intro than in discrimination text. Does it mean the intro is more about "bias" than the text? Not really. Because if we examine the TF of the term "and":

In [93]:
intro_tf['and'] = intro_counts['and'] / intro_total
disc_tf['and'] = disc_counts['and'] / disc_total

print('TF of "and" in intro: {:.4f}'.format(intro_tf['and']))
print('TF of "and" in disc: {:.4f}'.format(disc_tf['and']))
print('"and" appears about {:.2f} times more in the intro than in the text'.format(intro_tf['and'] / disc_tf['and']))

TF of "and" in intro: 0.0281
TF of "and" in disc: 0.0110
"and" appears about 2.55 times more in the intro than in the text


A good way to think of a term’s inverse document frequency is this: How surprising is it that this token is in this document? The concept of measuring the surprise in a token might not sound like a very mathematical idea. However, in statistics, physics and information theory, the surprise of a symbol is used to measure its entropy or information content. And that is exactly what you need to gage the importance of a particular word. If a term appears in one document a lot of times, but occurs rarely in the rest of the corpus, it is a word that distinguishes that document’s meaning from the other documents.

In [94]:
intro_idf = {}
num_docs_containing_term = {}
num_docs = len([intro_tokens, disc_tokens])

terms = ["and", "bias", "black"]

for term in terms:
    for doc in [intro_tokens, disc_tokens]:
        if term in doc:
            if term in num_docs_containing_term:
                num_docs_containing_term[term] += 1
            else:
                num_docs_containing_term[term] = 1

for term in terms:
    intro_idf[term] = num_docs / num_docs_containing_term[term]

print("Intro IDF:")
for term, idf in intro_idf.items():
    print(f"{term}: {idf:.4f}")

Intro IDF:
and: 1.0000
bias: 1.0000
black: 2.0000


# Relevance ranking
First we build a vector of TF-IDF for all documents in the corpus

In [95]:
import copy
from nltk.tokenize import TreebankWordTokenizer

docs = [
    "The faster Harry got to the store, the faster and faster Harry would get home.",
    "Harry is hairy and faster than Jill.",
    "Jill is not as hairy as Harry.",
    "It takes 1 hour to get to the store.",
]
tokenizer = TreebankWordTokenizer()

doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]

zero_vector = copy.copy(intro_idf)
all_doc_tokens = sum(doc_tokens, [])
lexicon = sorted(set(all_doc_tokens))

from collections import OrderedDict
zero_vector = OrderedDict((token, 0) for token in lexicon)

document_tfidf_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    
    for key, value in token_counts.items():
        docs_containing_key = 0
        for _doc in docs:
            if key in _doc:
                docs_containing_key += 1
        tf = value / len(lexicon)
        if docs_containing_key:
            idf = len(docs) / docs_containing_key
        else:
            idf = 0
        vec[key] = tf * idf
    document_tfidf_vectors.append(vec)
document_tfidf_vectors

[OrderedDict([(',', 0.18181818181818182),
              ('.', 0.045454545454545456),
              ('1', 0),
              ('and', 0.09090909090909091),
              ('as', 0),
              ('faster', 0.2727272727272727),
              ('get', 0.09090909090909091),
              ('got', 0.18181818181818182),
              ('hairy', 0),
              ('harry', 0.0),
              ('home', 0.18181818181818182),
              ('hour', 0),
              ('is', 0),
              ('it', 0),
              ('jill', 0),
              ('not', 0),
              ('store', 0.09090909090909091),
              ('takes', 0),
              ('than', 0),
              ('the', 0.2727272727272727),
              ('to', 0.09090909090909091),
              ('would', 0.18181818181818182)]),
 OrderedDict([(',', 0),
              ('.', 0.045454545454545456),
              ('1', 0),
              ('and', 0.09090909090909091),
              ('as', 0),
              ('faster', 0.09090909090909091),
             

In [96]:
import copy
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]

all_doc_tokens = sum(doc_tokens, [])
lexicon = sorted(set(all_doc_tokens))

from collections import OrderedDict
zero_vector = OrderedDict((token, 0) for token in lexicon)

query = "How long does it take to get to the store?"
query_vec = copy.copy(zero_vector)

tokens = [token.text for token in nlp(query.lower())]
token_counts = Counter(tokens)

for token, count in token_counts.items():
    docs_containing_token = 0
    for _doc in docs:
        if token in _doc.lower():
            docs_containing_token += 1
    if docs_containing_token == 0:
        continue
    
    tf = count / len(tokens)
    idf = len(docs) / docs_containing_token
    query_vec[token] = tf * idf
    
print("Query vector:", query_vec)

Query vector: OrderedDict([(',', 0), ('.', 0), ('1', 0), ('and', 0), ('as', 0), ('faster', 0), ('get', 0.18181818181818182), ('got', 0), ('hairy', 0), ('harry', 0), ('home', 0), ('hour', 0), ('is', 0), ('it', 0.36363636363636365), ('jill', 0), ('not', 0), ('store', 0.18181818181818182), ('takes', 0), ('than', 0), ('the', 0.18181818181818182), ('to', 0.36363636363636365), ('would', 0), ('take', 0.36363636363636365)])


In [97]:
import math
def cosine_sim(vec1, vec2):
    vec1 = [val for val in vec1.values()]
    vec2 = [val for val in vec2.values()]
    
    dot_prod =0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]
    
    mag_1 = math.sqrt(sum([x**2 for x in vec1]))
    mag_2 = math.sqrt(sum([x**2 for x in vec2]))
    
    return dot_prod / (mag_1 * mag_2)

In [98]:
cosine_sims = [cosine_sim(doc_vec, query_vec) for doc_vec in document_tfidf_vectors]
print(cosine_sims)

highest_score = max(cosine_sims)
highest_score_index = cosine_sims.index(highest_score)
print("Query:", query)
print("Match:", docs[highest_score_index])

[0.29223800250640314, 0.0, 0.0, 0.41194292043554986]
Query: How long does it take to get to the store?
Match: It takes 1 hour to get to the store.


## A faster way to tokenizer documents for TF-IDF

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = docs # type: ignore
vectorizer = TfidfVectorizer(min_df=1)
vectorizer = vectorizer.fit(corpus)
vectors = vectorizer.transform(corpus)

vectors.todense().round(2)

array([[0.18, 0.  , 0.55, 0.18, 0.23, 0.  , 0.3 , 0.23, 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.18, 0.  , 0.  , 0.55, 0.18, 0.23],
       [0.37, 0.  , 0.37, 0.  , 0.  , 0.37, 0.3 , 0.  , 0.  , 0.37, 0.  ,
        0.37, 0.  , 0.  , 0.  , 0.47, 0.  , 0.  , 0.  ],
       [0.  , 0.74, 0.  , 0.  , 0.  , 0.29, 0.24, 0.  , 0.  , 0.29, 0.  ,
        0.29, 0.37, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.29, 0.  , 0.  , 0.  , 0.  , 0.37, 0.  , 0.37,
        0.  , 0.  , 0.29, 0.37, 0.  , 0.29, 0.58, 0.  ]])

In [100]:
query_vec = vectorizer.transform([query])
query_vec.todense().round(2)

array([[0.  , 0.  , 0.  , 0.34, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.43,
        0.  , 0.  , 0.34, 0.  , 0.  , 0.34, 0.68, 0.  ]])

In [101]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sims = cosine_similarity(vectors, query_vec)
cosine_sims

array([[0.43964217],
       [0.        ],
       [0.        ],
       [0.85319029]])