# Term Weighting | tf-idf

In [None]:
!pip install nltk

Collecting nltk

You should consider upgrading via the 'c:\users\gitsa\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.



  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2022.3.15-cp39-cp39-win_amd64.whl (274 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.3.15


**Loading Modules**

In [None]:
import json
import math
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer

In [None]:
# Fetch the NLTK resources that preprocess() relies on:
#   'punkt'     - tokenizer models used by nltk.word_tokenize
#   'stopwords' - word lists read through nltk.corpus.stopwords
# The last call's return value (True) is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Preprocessing Function**

In [None]:
def preprocess(input):
    r"""Normalise raw text into a list of lowercase word tokens.

    Steps, in order:

    1. the control characters ``\a \b \f \n \r \t \v`` become spaces;
    2. the text is lower-cased;
    3. every punctuation mark except the apostrophe becomes a space, then
       apostrophes are deleted (``kit's`` -> ``kits``);
    4. the text is tokenised with ``nltk.word_tokenize``;
    5. tokens containing anything outside printable ASCII are dropped;
    6. English stopwords and purely numeric tokens are dropped.

    Stemming (PorterStemmer) is currently disabled.

    Parameters
    ----------
    input : str
        Raw document or query text. The name shadows the builtin but is kept
        so that existing callers are unaffected.

    Returns
    -------
    list of str
        Surviving tokens in their original order; duplicates are kept.
    """
    # One translation table replaces the seven chained str.replace calls.
    control_to_space = str.maketrans({ch: ' ' for ch in '\a\b\f\n\r\t\v'})
    output = input.translate(control_to_space).lower()

    # Punctuation (apostrophe excluded) -> space, then drop apostrophes so
    # contractions and possessives collapse into a single token.
    punctuations = string.punctuation.replace("'", '')
    output = "".join(' ' if char in punctuations else char for char in output)
    output = output.replace("'", '')

    output = nltk.word_tokenize(output)

    # Keep only tokens made entirely of printable ASCII characters.
    output = [word for word in output if re.sub(r'[^\x20-\x7e]', '', word) == word]

    # Build the stopword set once per call; the previous version called
    # stopwords.words('english') and scanned the returned list for every token.
    # NOTE(review): apostrophes are stripped before this check, so a stopword
    # spelled with an apostrophe can no longer match its tokenised form.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    output = [
        word.strip()
        for word in output
        if word not in stop_words and not word.isnumeric()
    ]
    return output

In [None]:
# Preprocess the corpus in place: document name -> list of tokens.
# NOTE(review): no earlier cell in this notebook defines ``raw_data``, so on a
# fresh kernel a raw-text loader has to run before this cell (or the cell is
# skipped in favour of the already-preprocessed file loaded further down).
for doc, content in raw_data.items():
    # Only raw strings are tokenised. Entries that are already token lists are
    # left untouched, so re-running the cell no longer fails inside preprocess().
    if isinstance(content, str):
        raw_data[doc] = preprocess(content)

**Loading already preprocessed file**

In [None]:
# Load the cached, already-tokenised corpus. The context manager closes the
# file handle as soon as the JSON has been parsed.
# Later cells use raw_data as: document name -> list of tokens.
with open("PycharmProjects/class12/preprocessed.json", "r") as preprocessed_file:
    raw_data = json.load(preprocessed_file)

**doc to doc-id mapping**

In [None]:
def map_docs(raw_data):
    """Assign consecutive integer ids, starting at 1, to the documents.

    Ids follow the iteration order of ``raw_data``.

    Returns
    -------
    dict
        Maps every key yielded by ``raw_data`` to its id.
    """
    return {name: number for number, name in enumerate(raw_data, start=1)}

# document name -> 1-based integer id, in raw_data's iteration order
doc_ids = map_docs(raw_data)

**Creating unigram inverted index**


In [None]:
def create_index(doc_ids, documents=None):
    """Build the unigram inverted index.

    Parameters
    ----------
    doc_ids : dict
        Maps document name -> integer document id.
    documents : dict, optional
        Maps document name -> list of tokens. When omitted, the notebook-level
        ``raw_data`` is used, so the existing call ``create_index(doc_ids)``
        behaves exactly as before.

    Returns
    -------
    dict
        ``token -> [document_frequency, set_of_doc_ids]`` where
        ``document_frequency`` always equals ``len(set_of_doc_ids)``.
    """
    if documents is None:
        documents = raw_data

    index = {}
    for doc, doc_id in doc_ids.items():
        for token in documents[doc]:
            posting = index.get(token)
            if posting is None:
                # First sighting of the token: start its posting entry.
                index[token] = [1, {doc_id}]
            else:
                # The set ignores repeats within one document; keep the
                # frequency counter in step with the set size.
                posting[1].add(doc_id)
                posting[0] = len(posting[1])
    return index

# token -> [document frequency, {ids of documents containing the token}]
index = create_index(doc_ids)

**Finding idf values of words**

In [None]:
# Smoothed inverse document frequency: idf(t) = log2(N / (df(t) + 1)),
# with N the number of documents and df(t) the document-frequency counter
# stored in slot 0 of each index entry.
# Consequence of the +1: a term present in every document gets a slightly
# negative idf, and a term present in N-1 documents gets exactly 0.
totalLen = len(doc_ids)
idfValue = {
    term: math.log2(totalLen / (posting[0] + 1))
    for term, posting in index.items()
}

Store each document's term counts in a dictionary (keyed by doc id) for easier access, and record each document's length in tokens.

In [None]:
# doc_terms: doc id -> {term: number of occurrences in that document}
# doc_len:   doc id -> total number of tokens in that document
doc_terms = {}
doc_len = {}
for doc, terms in raw_data.items():
    doc_id = doc_ids[doc]
    # Counter tallies all terms in a single pass; the previous version called
    # list.count once per distinct term, which is quadratic for long documents.
    # Converted back to a plain dict so missing-key behaviour stays unchanged.
    doc_terms[doc_id] = dict(Counter(terms))
    doc_len[doc_id] = len(terms)

Store the maximum term frequency within each document; it is the denominator used by the double-normalization scheme.

In [None]:
# doc id -> highest count of any single term in that document
# (0 when the document has no terms at all).
doc_max = {
    doc_id: max(counts.values(), default=0)
    for doc_id, counts in doc_terms.items()
}

Compute the tf-idf weights under all five weighting schemes by iterating over every vocabulary word for every document.

In [None]:
def tfvalue(docs, idf, doc_len,doc_max):
    """Compute five tf-idf weights for every (document, vocabulary term) pair.

    Parameters
    ----------
    docs : dict
        doc id -> {term: raw count of the term in that document}.
    idf : dict
        term -> idf value; its keys define the vocabulary that is iterated.
    doc_len : dict
        doc id -> number of tokens in the document.
    doc_max : dict
        doc id -> largest raw count of any term in the document.

    Returns
    -------
    dict
        ``doc id -> {term: [binary, raw, tf, log, double]}``. For a term with
        count ``c`` in the document the row is
        ``[idf, c*idf, c*idf/len, log2(1+c)*idf, idf*(0.5 + 0.5*c/max)]``;
        for a term absent from the document it is ``[0, 0, 0, 0, 0.5*idf]``.

    Notes
    -----
    A row is stored for every vocabulary term in every document, including
    absent terms, so the result holds ``len(docs) * len(idf)`` rows.
    The running term count is printed after every 1000 vocabulary terms.
    """
    weights = {doc_id: {} for doc_id in docs}
    for done, (term, term_idf) in enumerate(idf.items(), start=1):
        for doc_id, counts in docs.items():
            if term in counts:
                count = counts[term]
                row = [
                    term_idf,
                    count * term_idf,
                    count * term_idf / doc_len[doc_id],
                    math.log2(1 + count) * term_idf,
                    term_idf * (float((0.5 + 0.5 * (count) / doc_max[doc_id]))),
                ]
            else:
                row = [0, 0, 0, 0, float(0.5 * term_idf)]
            weights[doc_id][term] = row
        # Progress indicator for large vocabularies.
        if done % 1000 == 0:
            print(done)
    return weights

In [None]:
# Build the weight table: doc id -> term -> [five scheme weights].
# tfvalue prints a progress counter every 1000 vocabulary terms.
tfidf =tfvalue(doc_terms,idfValue,doc_len,doc_max)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000


Functions that take a query as input, preprocess it, score every document against it using the tf-idf weights, and rank the documents.

In [None]:
def scoring(query, doc, tfidf, j):
    """Sum the scheme-``j`` weights of the query terms for one document.

    Parameters
    ----------
    query : iterable of str
        Preprocessed query terms.
    doc : hashable
        Document id used as the key into ``tfidf``.
    tfidf : dict
        ``doc id -> {term: [weight per scheme, ...]}``.
    j : int
        Index of the weighting scheme inside each weight row.

    Returns
    -------
    number
        Total weight; ``0`` when no query term is known.

    Notes
    -----
    Query terms with no entry for this document (words that never occur in
    the corpus) contribute nothing. Previously they raised ``KeyError``.
    """
    doc_weights = tfidf[doc]
    total = 0
    for word in query:
        row = doc_weights.get(word)
        if row is not None:
            total += row[j]
    return total

In [None]:
def process(input, j, top_k=5):
    """Rank all documents against a free-text query under weighting scheme ``j``.

    Reads the notebook-level ``tfidf`` and ``doc_ids`` tables.

    Parameters
    ----------
    input : str
        Raw query text; it goes through the same ``preprocess`` as the corpus.
    j : int
        Index of the weighting scheme inside each tf-idf weight row.
    top_k : int, optional
        Number of results to return (default 5, the previous fixed value).

    Returns
    -------
    list
        Up to ``top_k`` ``[score, document name]`` pairs, best first. Equal
        scores are ordered by document name, descending, because the pairs
        are compared as lists.
    """
    # Each distinct query term is scored once.
    query = set(preprocess(input))
    ranked = sorted(
        ([scoring(query, doc_id, tfidf, j), doc] for doc, doc_id in doc_ids.items()),
        reverse=True,
    )
    return ranked[:top_k]

In [None]:

# Display name of each weighting scheme, keyed by its position inside a
# tf-idf weight row.
# NOTE(review): this module-level name shadows the builtin ``format``.
format = {
    0: 'Binary Scheme',
    1: 'Raw count Scheme',
    2: 'Term frequency Scheme',
    3: 'Log normalization Scheme',
    4: 'Double normalization Scheme',
}


def run():
    """Interactive driver: read queries and print rankings for all five schemes.

    Asks how many queries will be entered, then for each query prints, per
    weighting scheme, the raw ``[score, document]`` list returned by
    ``process`` followed by the document names one per line.
    """
    query_count = int(input('Enter number of queries:'))
    for _ in range(query_count):
        query = input("Input query: ")
        for scheme in range(5):
            ranking = process(query, scheme)
            print('\nFor', format[scheme])
            print(ranking)
            print('Top 5 documents:')
            for entry in ranking:
                print(entry[1])

In [None]:
# Start the interactive session; it prompts for the number of queries and
# then for each query text.
run()

Enter number of queries: 1
Input query:  first aid the



For Binary Scheme
[[4.791458074186427, 'woodbugs.txt'], [4.791458074186427, 'wagit.txt'], [4.791458074186427, 'urban.txt'], [4.791458074186427, 'turbo.hum'], [4.791458074186427, 'tfepisod.hum']]
Top 5 documents:
woodbugs.txt
wagit.txt
urban.txt
turbo.hum
tfepisod.hum

For Raw count Scheme
[[74.58699718057298, 'mlverb.hum'], [48.12993557780939, 'practica.txt'], [40.37164436755816, 'hackingcracking.txt'], [39.68922762737971, 'candy.txt'], [34.89776955319328, 'humor9.txt']]
Top 5 documents:
mlverb.hum
practica.txt
hackingcracking.txt
candy.txt
humor9.txt

For Term frequency Scheme
[[0.05578241921279136, 'hum2'], [0.04395833095583879, '1st_aid.txt'], [0.0329751494841258, 'whoon1st.hum'], [0.0329751494841258, 'abbott.txt'], [0.03205905458781499, 'labels.txt']]
Top 5 documents:
hum2
1st_aid.txt
whoon1st.hum
abbott.txt
labels.txt

For Log normalization Scheme
[[13.350112533012803, 'mlverb.hum'], [12.919408028890427, 'insult.lst'], [12.646052707706064, 'practica.txt'], [10.864887361886442, 'c