In [None]:
def build_direct_index(filename):
  """Build a word-frequency index for a single text file.

  Every line is split on whitespace. Each token is lower-cased and has
  leading/trailing punctuation (the characters ?!,;.- and both quote
  characters) stripped. Stop words and tokens that are empty after
  stripping are ignored.

  Args:
    filename: Path of the text file to index.

  Returns:
    dict mapping each remaining word to the number of times it occurs
    in the file.

  Raises:
    OSError: If the file cannot be opened.
  """
  stop_words = {'in', 'the', 'a', 'an', 'and'}
  word_frequencies = dict()

  # The context manager closes the file even if reading fails part-way.
  with open(filename) as f:
    for line in f:
      for word in line.split():
        w = word.lower().strip('?!,;.-\"\'')
        # A token made only of punctuation (e.g. "-") strips down to '',
        # which is not a word and must not be counted.
        if not w or w in stop_words:
          continue
        word_frequencies[w] = word_frequencies.get(w, 0) + 1

  return word_frequencies


def build_batch_direct_reverse_indices(*filenames):
  """Index several files at once, in both directions.

  Args:
    *filenames: Paths of the text files to index.

  Returns:
    A ``(direct_indices, reversed_indices)`` tuple where
    ``direct_indices`` maps filename -> {word: frequency} and
    ``reversed_indices`` maps word -> list of ``(filename, frequency)``
    pairs, one pair per file that contains the word.
  """
  # Forward direction: one word-frequency table per file.
  direct_indices = {name: build_direct_index(name) for name in filenames}

  # Reverse direction: for each word, collect the files it occurs in.
  reversed_indices = {}
  for name, counts in direct_indices.items():
    for term, occurrences in counts.items():
      reversed_indices.setdefault(term, []).append((name, occurrences))

  return direct_indices, reversed_indices

# Build the direct (file -> word counts) and reversed (word -> files) indices
# for nicole.txt and kevin.txt; both files are read from the working directory.
direct_indices, reversed_indices = build_batch_direct_reverse_indices('nicole.txt','kevin.txt')

# direct_indices.keys()
# reversed_indices

In [None]:

def query1(direct_indices, w):
  """Return the names of all files whose direct index contains the word ``w``.

  Args:
    direct_indices: dict mapping filename -> {word: frequency}.
    w: The word to look up; it is matched exactly against the index keys.

  Returns:
    list of filenames in the iteration order of ``direct_indices``; the
    list is empty when no file contains ``w``.
  """
  # Scan every file: stopping at the first hit would hide later matches and
  # make intersections of per-word results incorrect.
  return [filename for filename, freq in direct_indices.items() if w in freq]


def query3(direct_indices, words):
  """Find the files that contain every one of the given words.

  Args:
    direct_indices: dict mapping filename -> {word: frequency}.
    words: Iterable of words that must all be present in a file.

  Returns:
    list of matching filenames in the iteration order of
    ``direct_indices``.
  """
  # A file qualifies when none of the requested words is missing from it.
  return [
      name
      for name, counts in direct_indices.items()
      if not any(term not in counts for term in words)
  ]

# sample_indices = {'kevin.txt':{'is':1,'it':2, 'hejaz':1}, 'nicole.txt':{'invite':2, 'inside':1, 'it':1, 'is':2}}
# # assert query3(direct_indices, ('is', 'it')) == ['nicole.txt', 'kevin.txt']
# assert query3(sample_indices, ('is', 'it')) == ['kevin.txt', 'nicole.txt']
# assert query3(sample_indices, ('is', 'this')) == []
# assert query3(sample_indices, ('is', 'it', 'hejaz')) == ['kevin.txt']
# assert query1(direct_indexes, 'inside') == ['kevin.txt']
# query3({'kevin.txt':{'is':1,'it':2, 'hejaz':1}, 'nicole.txt':{'invite':2, 'inside':1, 'it':1, 'is':2}}, ('is', 'it'))


## Comparing the running time of query3 and query1

In [None]:
import datetime


# NOTE: the same file is passed three times, but the direct index is keyed by
# filename, so the resulting index holds a single 'nicole.txt' entry.
direct_indices, reversed_indices = build_batch_direct_reverse_indices('nicole.txt','nicole.txt','nicole.txt')

N_ITERATIONS = 1000000  # number of repetitions timed for each approach

# Time query3 when it is given both words at once as a tuple.
before = datetime.datetime.now()
for i in range(N_ITERATIONS):
  query3(direct_indices, ('possibly', 'different'))
after = datetime.datetime.now()
# A timedelta prints as H:MM:SS.ffffff, so convert it explicitly to the unit
# named in the message.
elapsed_ms = (after - before).total_seconds() * 1000
print(f"Query 3 version 1 took: {elapsed_ms:.3f} milliseconds")


# Time query1 when each word is looked up separately and the two results are
# intersected afterwards.
before = datetime.datetime.now()
for i in range(N_ITERATIONS):
  filename1 = query1(direct_indices, 'possibly')
  filename2 = query1(direct_indices, 'different')
  list(set(filename1).intersection(filename2))
after = datetime.datetime.now()
elapsed_ms = (after - before).total_seconds() * 1000
print(f"Query 1 version 1 took: {elapsed_ms:.3f} milliseconds")


Query 3 version 1 took: 0:00:00.943093 milliseconds
Query 1 version 1 took: 0:00:01.532264 milliseconds
