# TF-IDF Information Retrieval

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import itertools


# Some configuration

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# Create Bag of Words and List of Words. Source for sentences : https://simple.wikipedia.org

In [3]:
DATA = 'sentence'

if DATA == 'sentence':
    # Create Bag of Words from sentences
    # source : simple.wikipedia.org
    docs = [
        'Microsoft Windows is an operating system for computers made by the United States-based company Microsoft.',
        'macOS is the name of an operating system for computers made by Apple Inc.',
        'Linux or GNU/Linux is a Unix-like operating system (or strictly family of) for computers.',
        'Berkeley Software Distribution (BSD) is a kind of the UNIX operating system that is distributed for free.'
    ]
    stop_words = ['an', 'by', 'for', 'is', 'of', 'or', 'the']
    vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words)

    freqs = vectorizer.fit_transform(docs).toarray()
    words = vectorizer.get_feature_names()
    total_doc = len(docs)
else:
    # Use predetermined Bag of Words
    words = ['t1', 't2', 't3']
    freqs = [
        [6, 0, 0],  # doc1
        [0, 0, 3],  # doc2
        [7, 2, 0],  # doc3
        [5, 0, 1],  # doc4
    ]
    total_doc = len(freqs)


# TF (Term Frequency) Table

In [4]:
df = pd.DataFrame(columns=words, dtype=np.int32)
for i in range(len(freqs)):
    df.loc[i+1] = freqs[i]
df


Unnamed: 0,apple,based,berkeley,bsd,company,computers,distributed,distribution,family,free,gnu,inc,kind,like,linux,macos,made,microsoft,name,operating,software,states,strictly,system,that,united,unix,windows
1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,2,0,1,0,1,0,1,0,1,0,1
2,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,1,0,0,1,2,0,0,0,0,1,0,0,1,1,0,0,1,0
4,0,0,1,1,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0


# Normalized TF (Term Frequency) Table
Normalization formula
$$
\hat{\text{tf}} = \frac
{\text{tf}}
{\text{Total terms in a document}}
$$

In [5]:
tf = df.copy()
for c in tf.columns:
    for r in tf.index:
        tf_value = tf[c].loc[r] / df.loc[r].sum()
        tf[c].loc[r] = tf_value
tf



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,apple,based,berkeley,bsd,company,computers,distributed,distribution,family,free,gnu,inc,kind,like,linux,macos,made,microsoft,name,operating,software,states,strictly,system,that,united,unix,windows
1,0.0,0.090909,0.0,0.0,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.181818,0.0,0.090909,0.0,0.090909,0.0,0.090909,0.0,0.090909,0.0,0.090909
2,0.125,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.125,0.125,0.0,0.125,0.125,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.1,0.2,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.1,0.0,0.0,0.1,0.0
4,0.0,0.0,0.090909,0.090909,0.0,0.0,0.090909,0.090909,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.090909,0.0,0.0,0.090909,0.090909,0.0,0.090909,0.0


# IDF (Inverted Document Frequency) Table
Normalization formula :
$$
\text{idf} = 1 + \ln (\text{idf}) = 1 + \ln (\frac{\text{df}}{\text{Total documents}})
$$

P.S. `np.log` is natural logarithm, not base 2 or base 10 logarithm.

## 1. Check word is in document

In [6]:
idf = df.copy()
for c in idf.columns:
    idf[c] = idf[c].apply(lambda x: 0 if x == 0 else 1)
idf


Unnamed: 0,apple,based,berkeley,bsd,company,computers,distributed,distribution,family,free,gnu,inc,kind,like,linux,macos,made,microsoft,name,operating,software,states,strictly,system,that,united,unix,windows
1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,1,0,1,0,1
2,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0
4,0,0,1,1,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0


## 2. Calculate df (document frequency)

In [7]:
idf = idf.sum(axis=0)
idf


apple           1
based           1
berkeley        1
bsd             1
company         1
computers       3
distributed     1
distribution    1
family          1
free            1
gnu             1
inc             1
kind            1
like            1
linux           1
macos           1
made            2
microsoft       1
name            1
operating       4
software        1
states          1
strictly        1
system          4
that            1
united          1
unix            2
windows         1
dtype: int64

## 3. Calculate idf

In [8]:
idf = idf.apply(lambda x: total_doc / x)
idf


apple           4.000000
based           4.000000
berkeley        4.000000
bsd             4.000000
company         4.000000
computers       1.333333
distributed     4.000000
distribution    4.000000
family          4.000000
free            4.000000
gnu             4.000000
inc             4.000000
kind            4.000000
like            4.000000
linux           4.000000
macos           4.000000
made            2.000000
microsoft       4.000000
name            4.000000
operating       1.000000
software        4.000000
states          4.000000
strictly        4.000000
system          1.000000
that            4.000000
united          4.000000
unix            2.000000
windows         4.000000
dtype: float64

## 4. Calculate normalized idf

In [9]:
idf = idf.apply(lambda x: 1 + np.log(x))
idf


apple           2.386294
based           2.386294
berkeley        2.386294
bsd             2.386294
company         2.386294
computers       1.287682
distributed     2.386294
distribution    2.386294
family          2.386294
free            2.386294
gnu             2.386294
inc             2.386294
kind            2.386294
like            2.386294
linux           2.386294
macos           2.386294
made            1.693147
microsoft       2.386294
name            2.386294
operating       1.000000
software        2.386294
states          2.386294
strictly        2.386294
system          1.000000
that            2.386294
united          2.386294
unix            1.693147
windows         2.386294
dtype: float64

# Calculate TF-IDF

In [10]:
tf_idf = df.copy()
for c in tf_idf.columns:
    for r in tf_idf.index:
        tf_idf_value = tf[c].loc[r] * idf[c]
        tf_idf[c].loc[r] = tf_idf_value


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
tf_idf


Unnamed: 0,apple,based,berkeley,bsd,company,computers,distributed,distribution,family,free,gnu,inc,kind,like,linux,macos,made,microsoft,name,operating,software,states,strictly,system,that,united,unix,windows
1,0.0,0.216936,0.0,0.0,0.216936,0.117062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153922,0.433872,0.0,0.090909,0.0,0.216936,0.0,0.090909,0.0,0.216936,0.0,0.216936
2,0.298287,0.0,0.0,0.0,0.0,0.16096,0.0,0.0,0.0,0.0,0.0,0.298287,0.0,0.0,0.0,0.298287,0.211643,0.0,0.298287,0.125,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.128768,0.0,0.0,0.238629,0.0,0.238629,0.0,0.0,0.238629,0.477259,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.238629,0.1,0.0,0.0,0.169315,0.0
4,0.0,0.0,0.216936,0.216936,0.0,0.0,0.216936,0.216936,0.0,0.216936,0.0,0.0,0.216936,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.216936,0.0,0.0,0.090909,0.216936,0.0,0.153922,0.0


# Cosine similarity between documents

In [12]:
doc_pairs = itertools.combinations(tf_idf.index, 2)
doc_pairs = list(doc_pairs)
doc_pairs


[(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]

In [13]:
for d1, d2 in doc_pairs:
    dot_product = np.dot(tf_idf.loc[d1], tf_idf.loc[d2])

    d1_tf_idf = tf_idf.loc[d1].copy()
    d2_tf_idf = tf_idf.loc[d2].copy()

    square_d1 = d1_tf_idf.apply(lambda x: np.square(x))
    square_d2 = d2_tf_idf.apply(lambda x: np.square(x))

    abs_root_d1 = np.abs(np.sqrt(square_d1.sum()))
    abs_root_d2 = np.abs(np.sqrt(square_d2.sum()))
    cosine = dot_product / (abs_root_d1 * abs_root_d2)

    print(f'Cosine similarity between doc#{d1} and doc#{d2} = {cosine}')


Cosine similarity between doc#1 and doc#2 = 0.15858125557322786
Cosine similarity between doc#1 and doc#3 = 0.06668907888100811
Cosine similarity between doc#1 and doc#4 = 0.03705545418238183
Cosine similarity between doc#2 and doc#3 = 0.09364197895127023
Cosine similarity between doc#2 and doc#4 = 0.05203169872487982
Cosine similarity between doc#3 and doc#4 = 0.09497146844736447


# Cosine similarity with user's queries

In [14]:
def tf_idf_query(q):
    q_words = list(set(q))
    
    q_freq = []
    for i in range(len(q_words)):
        word_freq = q.count(q_words[i])
        q_freq.append(word_freq)

    q_tf = np.array(q_freq, dtype=np.float64)
    for i in range(len(q_tf)):
        q_tf[i] = q_tf[i] / sum(q_freq)

    q_idf = np.zeros(shape=(len(q_words),))
    for i in range(len(q_idf)):
        q_idf[i] = idf[q_words[i]]

    q_tf_idf = np.zeros(shape=(len(q_words),))
    for i in range(len(q_tf_idf)):
        q_tf_idf[i] = q_tf[i] * q_idf[i]

    # perform cosine with each doc
    for i in tf_idf.index:
        dot_product = np.dot(q_tf_idf, tf_idf[q_words].loc[i])

        d_tf_idf = tf_idf[q_words].loc[i].copy()

        square_d = d_tf_idf.apply(lambda x: np.square(x))
        square_q = np.zeros(shape=(len(q_tf_idf),))
        for j in range(len(square_q)):
            square_q[j] = np.square(q_tf_idf[j])

        root_d = np.sqrt(square_d.sum())
        root_q = np.sqrt(square_q.sum())

        abs_d = np.abs(root_d)
        abs_q = np.abs(root_q)

        cosine = dot_product / (abs_d * abs_q)

        # indicate one of the calculated value is 0
        if np.isnan(cosine):
            cosine = 0

        print(f'Cosine similarity between query and doc#{i} = {cosine}')
    print('='*70)


 If a query have bigger weight, type it multiple times in list

In [15]:
if DATA == 'sentence':
    queries = [
        ['windows', 'system'],
        ['linux', 'computers'],
        ['linux', 'linux', 'computers'],
        ['unix', 'operating', 'system']
    ]
else:
    queries = [
        ['life', 'learning']
    ]


In [16]:
for query in queries:
    print(f'{query=}')
    tf_idf_query(query)



query=['windows', 'system']
Cosine similarity between query and doc#1 = 1.0000000000000002
Cosine similarity between query and doc#2 = 0.38649523635839605
Cosine similarity between query and doc#3 = 0.38649523635839605
Cosine similarity between query and doc#4 = 0.38649523635839605
query=['linux', 'computers']
Cosine similarity between query and doc#1 = 0.47488709506083376
Cosine similarity between query and doc#2 = 0.4748870950608337
Cosine similarity between query and doc#3 = 0.9733685211656524
Cosine similarity between query and doc#4 = 0
query=['linux', 'linux', 'computers']
Cosine similarity between query and doc#1 = 0.2604929692792953
Cosine similarity between query and doc#2 = 0.2604929692792954
Cosine similarity between query and doc#3 = 1.0
Cosine similarity between query and doc#4 = 0
query=['unix', 'operating', 'system']
Cosine similarity between query and doc#1 = 0.6410554491745127
Cosine similarity between query and doc#2 = 0.6410554491745126
Cosine similarity between quer