# Normalized term frequency

In [13]:
from collections import Counter

import pandas as pd
import spacy

# Load the small English pipeline once; the cells below reuse `nlp`.
nlp = spacy.load('en_core_web_sm')

sentence = 'It has also arisen in criminal justice, healthcare, and hiring, compounding existing racial, economic, and gender biases.'

# Tokenize the sentence and keep the surface text of every token
# (punctuation tokens such as ',' are included).
doc = nlp(sentence)
tokens = [tok.text for tok in doc]

# Bag of words: token -> number of occurrences in the sentence.
bag_of_words = Counter(tokens)

# most_common() returns (token, count) pairs ordered by descending count;
# going through a dict keeps that order in the Series index.
most_common = dict(bag_of_words.most_common())
counts = pd.Series(most_common)

print(counts.shape)
counts.head()


(18,)


,       5
and     2
It      1
has     1
also    1
dtype: int64

The normalized term frequency of a word is the number of times the word occurs in a document divided by the total number of tokens in that document:

In [14]:
# Occurrences of 'justice' divided by the total token count of the sentence
# (punctuation tokens are part of the total) -- roughly 4%.
counts.loc['justice'] / counts.sum()

0.043478260869565216

## Term frequencies and their relevance

In [15]:
import requests

url = 'https://gitlab.com/tangibleai/nlpia2/-/raw/main/src/nlpia2/ch03/bias_intro.txt'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page (404, 5xx, ...) from being
# silently tokenized as if it were the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()

bias_intro = response.text

# Token counts over the whole text.
# NOTE: this rebinds `counts` (a pandas Series in the earlier cell) to a Counter.
tokens = [token.text for token in nlp(bias_intro)]
counts = Counter(tokens)

In [16]:
# most_common(n) yields the n highest-count (token, count) pairs; the tail of
# the full descending list holds the least frequent tokens.
print('most common: ', counts.most_common(5))
print('least common: ', counts.most_common()[-5:])

most common:  [(',', 35), ('of', 16), ('.', 16), ('to', 15), ('and', 14)]
least common:  [('programs', 1), ('inputs', 1), ('between', 1), ('same', 1), ('service', 1)]


## Building bag-of-words

In [17]:
# One spaCy Doc per non-blank line of the downloaded text.
docs = [nlp(line) for line in bias_intro.split('\n') if line.strip()]

# One token -> count mapping per Doc.
bows = [Counter(tok.text for tok in parsed) for parsed in docs]

# Rows are documents, columns are tokens; a token absent from a document
# comes out as NaN, so fill with 0 and cast the counts back to integers.
df = pd.DataFrame(bows).fillna(0).astype(int)
df.head()

Unnamed: 0,Algorithmic,bias,describes,systematic,and,repeatable,errors,in,a,computer,...,there,no,examine,network,interrelated,programs,inputs,between,same,service
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Building vocabulary

In [21]:
# `docs` already holds parsed spaCy Docs, so iterate them directly instead of
# running the whole pipeline a second time on doc.text.
doc_tokens = [[tok.text.lower() for tok in doc] for doc in docs]
print('length of doc_tokens:', len(doc_tokens))

# Flatten the per-document token lists into one list for the whole corpus.
all_doc_tokens = [tok for tokens in doc_tokens for tok in tokens]
print('tokens count: ', len(all_doc_tokens))

# Vocabulary: the distinct lowercased tokens, in sorted order.
vocab = sorted(set(all_doc_tokens))
print('vocab size:', len(vocab))

# NOTE: the ratio below is the mean number of uses per vocabulary entry
# (rounded), not a per-token lower bound.
print('on average, each token is used at least', round(len(all_doc_tokens) / len(vocab)), 'times')

length of doc_tokens: 16
tokens count:  482
vocab size: 246
on average, each token is used at least 2 times


# Vectorizing documents using `CountVectorizer`

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is fed raw strings, so recover the text of each spaCy Doc.
corpus = [parsed.text for parsed in docs]

# Default settings; fit_transform learns the vocabulary from the corpus and
# returns the document-term count matrix in one step.
vectorizer = CountVectorizer()
count_vectors = vectorizer.fit_transform(corpus)

print(count_vectors.shape)
# Dense view of the count matrix, for display only.
count_vectors.toarray()

(16, 240)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)