In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
## Parsing

# Documents
# Every article starts with a "*TEXT nnn mm/dd/yy PAGE nnn" header followed by
# a blank line; its body runs up to the next "*TEXT" header or the end of input.
doc_pattern = re.compile(
    r"\*TEXT\s+\d{3}\s+\d{2}/\d{2}(/|\s)\d{2}\s+PAGE\s+\d{3}\n\n(.+?)(?=\*TEXT|$)",
    re.DOTALL,
)
with open('TIME.ALL', 'r') as corpus_file:
    doc_matches = doc_pattern.findall(corpus_file.read())
# findall returns (date separator, body) pairs. Dropping the separator column
# leaves a single column labelled 1 that holds the document bodies.
docDB = pd.DataFrame(doc_matches).drop(columns=0)

# Queries
with open('TIME.QUE', 'r') as f:
    text = f.read()
# Split on the "FIND <n>" headers so that each query keeps ALL of its lines.
# The previous pattern `FIND\s+\d+\s+(.*)` stopped at the first newline
# (`.` does not match "\n" without re.DOTALL), truncating any query that
# wraps across several lines. The element before the first header is preamble.
query_bodies = re.split(r"\*?FIND\s+\d+\s+", text)[1:]
# Remove a trailing "*STOP" end marker if one is present (no-op otherwise) and
# collapse line breaks / runs of whitespace into single spaces.
result = [' '.join(re.sub(r"\*STOP\s*$", "", body).split()) for body in query_bodies]
# One row per query, in file order, in column 0.
queryDF = pd.DataFrame(result)

# Stopwords
# A stop word is any line made up solely of upper-case ASCII letters.
stopword_line = re.compile(r"^[A-Z]+$", re.MULTILINE)
with open('TIME.STP','r') as stp_file:
    swDF = pd.DataFrame(stopword_line.findall(stp_file.read()))

# Relevant docs
with open('TIME.REL', 'r') as rel_file:
    rel_lines = rel_file.read().split("\n")
# Pull every run of digits out of each line, lazily, one list per line.
numbers_per_line = (re.findall(r"\d+", rel_line) for rel_line in rel_lines)
# First number on a line is the key (kept as a string); the remaining numbers
# become the list of ints stored under it. Lines without digits are skipped,
# and a key that appears on several lines keeps the values of its last line.
rdDict = {
    nums[0]: [int(n) for n in nums[1:]]
    for nums in numbers_per_line
    if nums
}

In [3]:
import math
# Term Frequency
def tf(t, d):
    """Return the relative frequency of term ``t`` in document ``d``.

    Parameters
    ----------
    t : str
        Term to count; compared for exact equality against each token.
    d : str
        Document text; tokens are obtained by splitting on whitespace.

    Returns
    -------
    float
        Occurrences of ``t`` divided by the number of tokens in ``d``, or
        ``0.0`` when ``d`` has no tokens (empty or whitespace-only text).
    """
    tokens = d.split()  # split once instead of once per operand
    if not tokens:
        # An empty document used to raise ZeroDivisionError.
        return 0.0
    return tokens.count(t) / len(tokens)

# Document Frequency
def docfreq(t, docs=None):
    """Return the document frequency of ``t``: how many documents contain it.

    The previous implementation summed every occurrence of ``t`` across the
    collection (a collection frequency), so a term repeated inside a single
    document was counted several times.

    Parameters
    ----------
    t : str
        Term to look for; matched against whitespace-separated tokens.
    docs : iterable of str, optional
        Documents to scan. Defaults to the bodies stored in ``docDB[1]``.

    Returns
    -------
    int
        Number of documents in which ``t`` appears at least once.
    """
    if docs is None:
        docs = docDB[1]
    return sum(1 for doc in docs if t in doc.split())

# Modified Inverse Document Frequency
def mod_idf(t, docs=None):
    """Return the modified inverse document frequency of term ``t``.

    Computes ``log((N + 1) / (0.5 + df))`` where ``N`` is the number of
    documents in the collection and ``df`` is the number of documents that
    contain ``t``. The previous code used the count of documents containing
    ``t`` as ``N`` and the total number of occurrences as the denominator, so
    the value did not depend on the collection size and became negative for
    any term repeated within a document.

    Parameters
    ----------
    t : str
        Term to weight; matched against whitespace-separated tokens.
    docs : iterable of str, optional
        Documents forming the collection. Defaults to ``docDB[1]``.

    Returns
    -------
    float
        The modified idf; ``log(2 * (N + 1))`` for a term found in no document.
    """
    if docs is None:
        docs = docDB[1]
    total_docs = 0
    docs_with_term = 0
    # Single pass: count the collection size and the matching documents together.
    for doc in docs:
        total_docs += 1
        if t in doc.split():
            docs_with_term += 1
    return math.log((total_docs + 1) / (0.5 + docs_with_term))

# Given weight formula
# tf-idf(t, d) = tf(t, d) * mod_idf(t)
def tf_mod_idf(t, d):
    """Return the weight of term ``t`` in document ``d``.

    The weight is the product of the term's frequency in ``d`` (``tf``) and
    its modified inverse document frequency (``mod_idf``).
    """
    term_frequency = tf(t, d)
    inverse_doc_frequency = mod_idf(t)
    return term_frequency * inverse_doc_frequency

In [4]:
# # One-time setup: uncomment and run once in a new environment to download the NLTK data
# import nltk 
# nltk.download('stopwords')
# nltk.download('punkt')

In [None]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# NLTK's English stop-word list, held as a set for fast membership tests.
# NOTE(review): the TIME.STP list parsed into swDF is not used here - confirm
# that NLTK's list is the intended one.
stopWords = set(stopwords.words('english'))

# Document bodies keyed by their 0-based position in docDB, as a string.
# NOTE(review): rdDict holds document numbers read from TIME.REL - confirm both
# use the same numbering before comparing them.
docsMap = dict((str(position), body) for position, body in enumerate(docDB[1]))

# Stop word removal
# Tokenize each document and keep a token unless its lower-cased form is a
# stop word or the token is a lone '.'.
stopWordRemovedResult = {
    title: [
        token
        for token in word_tokenize(content)
        if token.lower() not in stopWords and token != '.'
    ]
    for title, content in docsMap.items()
}
    
# Stemming
porter = PorterStemmer()
# Same keys as stopWordRemovedResult; every remaining token is replaced by its
# Porter stem, preserving token order.
stemmedListMap = {
    title: [porter.stem(token) for token in tokens]
    for title, tokens in stopWordRemovedResult.items()
}

# Joining
# Rebuild each document as one space-separated string of stems.
stemmedDB = {}
for title, stems in stemmedListMap.items():
    stemmedDB[title] = ' '.join(stems)
# Sorted vocabulary: every distinct stem that occurs in any document.
allTerms = sorted(set().union(*stemmedListMap.values()))

In [7]:
from collections import defaultdict 

# Define a function to create an inverted index for the corpus
def create_inverted_index(corpus):
    """Build an inverted index mapping each term to its weighted postings.

    Parameters
    ----------
    corpus : dict
        Maps a document ID to the document, given either as one
        whitespace-separated string of terms or as an iterable of terms.

    Returns
    -------
    collections.defaultdict
        ``term -> [(docID, weight), ...]`` with exactly one posting per
        document containing the term, where
        ``weight = tf(term, doc) * mod_idf(term)``.
    """
    invertedIndex = defaultdict(list)
    # mod_idf does not depend on the document being indexed, so compute it
    # once per distinct term instead of once per posting.
    idfByTerm = {}
    for docID, doc in corpus.items():
        if isinstance(doc, str):
            # Iterating a string yields single characters, not terms, so the
            # document has to be split into tokens first.
            tokens = doc.split()
            docText = doc
        else:
            tokens = list(doc)
            docText = ' '.join(tokens)  # tf() expects the document as a string
        # dict.fromkeys de-duplicates while keeping first-seen order, so a term
        # repeated inside a document yields a single posting.
        for term in dict.fromkeys(tokens):
            if term not in idfByTerm:
                # NOTE(review): mod_idf reads the module-level docDB, not
                # `corpus` - confirm both hold the same (pre-processed) text.
                idfByTerm[term] = mod_idf(term)
            invertedIndex[term].append((docID, tf(term, docText) * idfByTerm[term]))
    return invertedIndex
# Create the inverted index
invertedIndex = create_inverted_index(stemmedDB)
# Show a summary and a small sample only: printing the whole index floods the
# cell output.
print(f"{len(invertedIndex)} terms indexed")
for term, postings in list(invertedIndex.items())[:5]:
    print(term, postings[:3])

KeyboardInterrupt: 