<a href="https://colab.research.google.com/github/lail-lei/tf-idf/blob/main/tf_idf_engine_with_keyword_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# first, import our libraries 
import pandas as pd

# need for importing text files from github
import requests

# Using regex
import re

# for unique id
import uuid

In [None]:
# helper function to process source document before indexing 
def tokenize(src):
  """Split *src* into lowercase word tokens with punctuation removed.

  Args:
    src: the raw text to tokenize.

  Returns:
    A list of tokens, in the order they appear in *src*. Characters that
    are neither word characters nor whitespace are deleted (not replaced
    by a space), so "dog's" becomes "dogs".
  """
  lowered = src.lower()
  # delete everything that is neither a word character nor whitespace
  without_punctuation = re.sub(r'[^\w\s]', '', lowered)
  # split() with no argument splits on any run of whitespace
  return without_punctuation.split()

class document():
  # A single source text plus the per-document word statistics derived from it.

  # the raw text passed to the constructor
  document = None
  # the raw text after running it through tokenize()
  tokenized_document = None
  # word -> number of times the word appears in this document
  inverted_index = None
  # word -> term frequency (occurrences of the word / total number of words)
  term_frequency_index = None
  # unique id generated with uuid.uuid4() when the document is created
  _id = None

  def __init__(self, document):
    """Tokenize *document* and build its count and term-frequency indexes.

    Args:
      document: the raw text of the document.
    """
    self._id = uuid.uuid4()
    self.document = document
    self.tokenized_document = tokenize(document)
    # tf() reads inverted_index, so the counts must be built first
    self.inverted_index = self.invert()
    self.tf()

  def invert(self):
    """Count how often each token occurs in the tokenized document.

    One pass over the tokens, i.e. O(|D|) where D is the words in the
    document.

    Returns:
      A dict mapping each word to its number of occurrences, keyed in
      order of first appearance.
    """
    counts = {}
    for token in self.tokenized_document:
      counts[token] = counts.get(token, 0) + 1
    return counts

  def tf(self):
    """Build the term-frequency index and store it on the instance.

    Each word's count from inverted_index is divided by the total number
    of tokens in the document. The result is written to
    term_frequency_index; nothing is returned.
    """
    total_words = len(self.tokenized_document)
    self.term_frequency_index = {
      token: occurrences / float(total_words)
      for token, occurrences in self.inverted_index.items()
    }
  

class searchEngine():
  # A small in-memory TF-IDF search engine: documents are added with index(),
  # queried with search(), and summarized with keywords().

  # a list of all documents in the search engine (created per instance in __init__)
  documents = None

  # maps each word to log(number of documents / number of documents containing the word)
  inverse_document_frequency_index = None

  # a data frame containing the TF-IDF scores for all the words in the engine:
  # one row per document (labelled with document._id), one column per word
  data_frame = None

  def __init__(self):
    """Create an engine with no documents indexed."""
    self.documents = []

  def computeIDF(self):
    """Recompute the inverse document frequency of every word in the corpus.

    The result is stored in inverse_document_frequency_index; nothing is
    returned. Counting is O(N * |D|), where N is the number of documents
    and D is the words in a document; taking the logs is O(|T|), where T
    is all distinct words in the corpus.
    """
    import math
    N = len(self.documents)
    # number of documents each word appears in; term_frequency_index has
    # one key per distinct word, so each document counts a word once
    document_frequency = {}
    for doc in self.documents:
      for word in doc.term_frequency_index:
        document_frequency[word] = document_frequency.get(word, 0) + 1
    # idf = log(#documents / #documents containing the word)
    self.inverse_document_frequency_index = {
      word: math.log(N / float(count))
      for word, count in document_frequency.items()
    }

  def computeTFIDF(self, document):
    """Compute the TF-IDF score of every word in *document*.

    Args:
      document: an object exposing a term_frequency_index dict.

    Returns:
      A dict mapping each word of the document to tf * idf.

    Raises:
      KeyError: if a word of the document is missing from
        inverse_document_frequency_index.
    """
    idf = self.inverse_document_frequency_index
    # tf * idf; this is O(|D|) where D is all words in the document
    return {
      word: frequency * idf[word]
      for word, frequency in document.term_frequency_index.items()
    }

  def createDataFrame(self):
    """Rebuild data_frame with the TF-IDF scores of all indexed documents.

    Rows follow the order of self.documents and are labelled with each
    document's _id; words a document does not contain are filled with 0.
    This is O(N * |D|) for N documents of |D| words each.
    """
    rows = [doc._id for doc in self.documents]
    data = [self.computeTFIDF(doc) for doc in self.documents]
    self.data_frame = pd.DataFrame(data=data, index=rows).fillna(0)

  def index(self, document):
    """Add *document* to the engine and refresh the IDF index and data frame.

    Args:
      document: an object exposing _id, document and term_frequency_index.
    """
    self.documents.append(document)
    # the whole corpus is recomputed on every insert (optimize later)
    self.computeIDF()
    self.createDataFrame()

  def lookup_tfidf(self, keyword):
    """Return the documents in which *keyword* has a positive TF-IDF score.

    Args:
      keyword: a single, already tokenized word.

    Returns:
      A list of {"id", "document", "score"} dicts in indexing order, where
      "id" is the document's _id (the row label in data_frame). The list
      is empty when nothing has been indexed yet or the word is unknown.
      A word contained in every document has idf = log(1) = 0, so its
      score is 0 and it matches no document.
    """
    # nothing indexed yet, or keyword doesn't exist in any document
    if self.data_frame is None or keyword not in self.data_frame.columns:
      return []
    # Rows of data_frame are built in the same order as self.documents, so
    # pair them positionally. The row labels are document ids, which are
    # not valid positions for either .iloc or the documents list.
    scores = self.data_frame[keyword]
    return [
      {"id": doc._id, "document": doc.document, "score": score}
      for doc, score in zip(self.documents, scores)
      if score > 0
    ]

  def process_results(self, lookup_list):
    """Merge lookup hits that share an id and rank them by total score.

    Args:
      lookup_list: a list of {"id", "document", "score"} dicts.

    Returns:
      A list with one dict per distinct id (compared via str(id)), whose
      "score" is the sum of the scores for that id, sorted by score in
      descending order. The dicts in *lookup_list* are not modified.
    """
    combined = {}
    for item in lookup_list:
      key = str(item["id"])
      if key in combined:
        combined[key]["score"] += item["score"]
      else:
        # copy so that summing scores does not mutate the caller's dicts
        combined[key] = dict(item)
    return sorted(combined.values(), key=lambda entry: entry["score"], reverse=True)

  def search(self, keyword):
    """Search the engine for the words of *keyword*.

    Args:
      keyword: the raw query text; it is split into tokens with tokenize().

    Returns:
      The hits for all query tokens, merged per document and ordered by
      descending summed score (see process_results).
    """
    hits = []
    for token in tokenize(keyword):
      hits.extend(self.lookup_tfidf(token))
    return self.process_results(hits)

  def keywords(self, _id, n):
    """Return the *n* highest scoring words of one document.

    Args:
      _id: the _id of an indexed document (a row label of data_frame).
      n: how many words to return.

    Returns:
      A pandas Index with the first *n* words after sorting the
      document's TF-IDF scores in descending order.

    Raises:
      KeyError: if *_id* is not a row label of data_frame.
    """
    scores = self.data_frame.loc[_id]  # get tf_idf data by label
    ranked = scores.sort_values(ascending=False)
    return ranked.keys()[:n]


# Big O of indexing (per call to index): O(N * |D|) + O(N * |D|) + O(|T|)
# = O(N * |D|), where N is the number of documents, D is the words in a document, and T is all words in the corpus

# Lucene uses optimized data structures to implement indexing 
# https://stackoverflow.com/questions/2602253/how-does-lucene-index-documents

  
