In [None]:
import numpy as np
import pandas as pd
import re
from nltk.stem import PorterStemmer
import math
import collections

In [None]:
## Parsing
# Documents: the pattern expects each record of TIME.ALL to open with a
# "*TEXT <3-digit id> <date> PAGE <3-digit page>" header followed by a blank line;
# the body then runs up to the next "*TEXT" marker (or the end of the file).
with open('TIME.ALL', 'r') as f:
    text = f.read()
result = re.findall(
    r"\*TEXT\s+(\d{3})\s+(\d{2}/\d{2}(/|\s)\d{2})\s+PAGE\s+(\d{3})\n\n(.+?)(?=\*TEXT|$)",
    text,
    flags=re.DOTALL,
)
# Every match is a 5-tuple: (id, date, date separator, page, body).
# The separator group only exists for the alternation and is ignored.
docDB = {
    docID: {'id': docID, 'date': date, 'page': int(page), 'text': body}
    for docID, date, _, page, body in result
}
# Plain id -> body mapping used by the text-processing pipeline.
docOnlyDB = {docID: body for docID, _, _, _, body in result}
# One row per document (the dict-of-dicts is column-oriented, hence the transpose).
docDF = pd.DataFrame(docDB).T

# Queries: capture the text that follows each "FIND <number>" header, stopping at the
# blank line that precedes the next "*FIND <number>" header (or at the end of the file).
with open('TIME.QUE', 'r') as f:
    text = f.read()
queryDB = re.compile(r'FIND\s+\d+\s+(.+?)(?=\n\n\*FIND\s+\d+\s+|$)', re.DOTALL).findall(text)

# Stopwords: keep only the lines of TIME.STP that consist solely of the letters A-Z,
# lower-cased and stored in a set so membership tests are cheap.
with open('TIME.STP','r') as f:
    text = f.read()
swDB = {word.lower() for word in re.findall(r"^[A-Z]+$", text, re.MULTILINE)}

# Relevant docs: for every line of TIME.REL that contains numbers, the first number
# becomes the key (kept as a string) and the remaining numbers become a list of ints
# (presumably the query number and the documents judged relevant to it).
# NOTE: these values are ints, whereas docDB is keyed by three-digit strings.
with open('TIME.REL', 'r') as f:
    text = f.read()
    lines = text.split("\n")
rdDB = {}
for line in lines:
    numbers = re.findall(r"\d+", line)
    if not numbers:
        # Blank / non-numeric line: nothing to record.
        continue
    key = numbers[0]
    values = list(map(int, numbers[1:]))
    rdDB[key] = values

In [None]:
def tokenizeDocument(documentText):
    """Split raw document text into tokens on runs of whitespace.

    Punctuation is not separated from words; it stays attached to the
    neighbouring token. Empty or whitespace-only text yields an empty list.
    """
    tokens = documentText.split()
    return tokens

def normalizeAndStopTokenStream(tokenStream):
    """Lower-case the tokens and drop the ones that should not be indexed.

    A token is discarded when its lower-cased form is not purely alphanumeric
    (so any token carrying punctuation is removed entirely) or when it is
    listed in the module-level stopword set ``swDB``.

    Returns the surviving tokens, lower-cased, in their original order.
    """
    keptTokens = []
    for token in tokenStream:
        lowered = token.lower()
        if not lowered.isalnum():
            continue
        if lowered in swDB:
            continue
        keptTokens.append(lowered)
    return keptTokens

def stemNormalizedTokenStream(normalizedTokenStream):
    """Stem every token with NLTK's PorterStemmer.

    Note that the result is a single string with the stems joined by one
    space, not a list of tokens.
    """
    porter = PorterStemmer()
    stems = map(porter.stem, normalizedTokenStream)
    return ' '.join(stems)

def processDocument(documentText):
    """Run the whole text pipeline on one document (or query).

    Steps: whitespace tokenisation -> lower-casing / stopword removal ->
    Porter stemming. Returns the stems as one space-separated string.
    """
    rawTokens = tokenizeDocument(documentText)
    keptTokens = normalizeAndStopTokenStream(rawTokens)
    return stemNormalizedTokenStream(keptTokens)

In [None]:
# Run every raw document body through the processing pipeline, keeping the same
# document ids (and the same order) as docOnlyDB.
processedDocDB = dict(zip(docOnlyDB.keys(), map(processDocument, docOnlyDB.values())))
processedDocDB

In [None]:
# Vocabulary: every distinct stem that occurs in any processed document, sorted
# so that the column order of the term matrix is deterministic.
allTerms = sorted({
    token
    for processedDoc in processedDocDB.values()
    for token in processedDoc.split()
})
allTerms

In [None]:
def df(term, documentDB):
    """Document frequency: the number of documents in which ``term`` occurs.

    Parameters
    ----------
    term : str
        The (already processed) term to look for.
    documentDB : iterable
        The documents. A document given as a string is treated as
        whitespace-separated tokens; any other container (e.g. a list of
        tokens) is tested with ``in`` directly.

    Returns
    -------
    int
        How many documents contain ``term`` as a whole token.
    """
    count = 0
    for document in documentDB:
        if isinstance(document, str):
            # A bare `term in document` is a *substring* test on strings, so
            # "art" would be counted for a document containing "parti".
            # The substring test is kept only as a cheap pre-filter; the
            # token-level membership check decides.
            if term in document and term in document.split():
                count += 1
        elif term in document:
            count += 1
    return count

def idf(term, documentDB):
    """Inverse document frequency (informativeness) of ``term``.

    Computed as ``log((N + 1) / (df + 0.5))`` where ``N`` is the number of
    documents in ``documentDB`` and ``df`` is the number of them containing
    the term. ``documentDB`` must support ``len()``.
    """
    totalDocs = len(documentDB)
    docsWithTerm = df(term, documentDB)
    return math.log((totalDocs + 1) / (docsWithTerm + 0.5))

# IDF cache: compute the IDF of every vocabulary term once, up front, so that
# weighting a document or query is a plain dictionary lookup.
idfMap = dict(
    (vocabTerm, idf(vocabTerm, processedDocDB.values()))
    for vocabTerm in allTerms
)

def getIDF(term):
    """Return the cached IDF of ``term``; raises KeyError for terms outside the vocabulary."""
    cachedIDF = idfMap[term]
    return cachedIDF

def calculateDocumentWeight(document):
    """Build the tf-idf weight vector of a processed document (or query).

    ``document`` is a whitespace-separated string of terms. The result has
    one entry per term in ``allTerms`` (same order): raw term count in the
    document multiplied by the term's cached IDF. Terms of the document that
    are not in ``allTerms`` are ignored.
    """
    termCounts = collections.Counter(document.split())
    weights = []
    for term in allTerms:
        # Counter yields 0 for terms that are absent from this document.
        weights.append(termCounts[term] * getIDF(term))
    return weights

In [None]:
# Weight matrix: one row per document id, one column per vocabulary term.
tdMatrixDF = pd.DataFrame(
    data=list(map(calculateDocumentWeight, processedDocDB.values())),
    index=processedDocDB.keys(),
    columns=allTerms,
)

In [None]:
# Queries go through exactly the same pipeline as the documents.
processedQueryDB = list(map(processDocument, queryDB))
processedQueryDB

In [None]:
def cosineSimiliary(queryWeights, tdMatrix):
    """Cosine similarity between one query vector and every row of a matrix.

    Parameters
    ----------
    queryWeights : array-like, shape (n_terms,)
        Weight vector of the query.
    tdMatrix : array-like, shape (n_docs, n_terms)
        One weight vector per row.

    Returns
    -------
    numpy.ndarray, shape (n_docs,)
        The cosine similarity of the query with each row. When the query or
        a row has zero norm the cosine is undefined; the similarity is
        reported as 0.0 instead of NaN, so such rows rank last rather than
        poisoning the sort.
    """
    queryVector = np.asarray(queryWeights, dtype=float)
    docMatrix = np.asarray(tdMatrix, dtype=float)

    dotProducts = docMatrix @ queryVector
    normProducts = np.linalg.norm(queryVector) * np.linalg.norm(docMatrix, axis=1)

    # Divide only where the denominator is non-zero; everything else keeps the
    # pre-filled 0.0 (avoids 0/0 -> NaN and the accompanying RuntimeWarning).
    similarities = np.zeros_like(dotProducts, dtype=float)
    np.divide(dotProducts, normProducts, out=similarities, where=normProducts > 0)
    return similarities

def findTopNRelevantDocsWithCosineSimilarity(query, tdMatrixDF, N):
    """Rank documents against a processed query and return the best ``N`` ids.

    Parameters
    ----------
    query : str
        Processed query text (whitespace-separated terms).
    tdMatrixDF : pandas.DataFrame
        Weight matrix with one row per document; the index holds the ids.
    N : int
        How many document ids to return.

    Returns
    -------
    list
        Up to ``N`` document ids, highest cosine similarity first.
    """
    # Score every document row against the query's weight vector.
    queryWeights = calculateDocumentWeight(query)
    scores = cosineSimiliary(queryWeights, tdMatrixDF.values)

    # Pair each id with its score and order from most to least similar.
    ranking = pd.DataFrame({'docID': tdMatrixDF.index, 'cosineSimilarity': scores})
    ranking = ranking.sort_values(by='cosineSimilarity', ascending=False)

    # Keep the ids of the N best-scoring documents.
    topDocIDs = ranking['docID'].values[:N]
    return topDocIDs.tolist()

In [None]:
# Top-10 documents for every query, keyed by the query's 1-based position as a string.
cosineSimiliaryResults = {
    str(queryNumber): findTopNRelevantDocsWithCosineSimilarity(processedQuery, tdMatrixDF, 10)
    for queryNumber, processedQuery in enumerate(processedQueryDB, start=1)
}
cosineSimiliaryResults

# Arranging the TDMatrix

In [None]:
# Re-orient the matrix from documents x terms to terms x documents.
# The assignment overwrites tdMatrixDF in place, so an unguarded transpose would
# flip the matrix back every second time this cell is run. Transpose only while
# the columns are still the vocabulary, which makes re-running the cell a no-op.
if tdMatrixDF.columns.equals(pd.Index(allTerms)):
    tdMatrixDF = tdMatrixDF.T
tdMatrixDF

In [None]:
# Gram matrices of A = tdMatrixDF, wrapped in DataFrames with default integer labels:
#   U = A^T A   (square, one row/column per column of A)
#   V = A A^T   (square, one row/column per row of A)
# Assumes the transpose cell above has already run, so A is terms x documents and
# U is documents x documents while V is terms x terms — TODO confirm; V can be very
# large for a big vocabulary.
# NOTE(review): these are the products whose eigenvectors give the singular vectors,
# not the singular-vector matrices themselves, despite the names.
U = pd.DataFrame(np.dot(tdMatrixDF.T, tdMatrixDF))
V = pd.DataFrame(np.dot(tdMatrixDF, tdMatrixDF.T))

In [None]:
# U is built as A^T A in the preceding cell, i.e. it is symmetric and positive
# semi-definite. eigh is the solver for symmetric matrices: it always returns real
# eigenvalues, whereas the general-purpose eig may return complex values and gives
# no ordering guarantee.
eigenvalues, eigenvectors = np.linalg.eigh(np.asarray(U, dtype=float))

# eigh returns ascending eigenvalues; reorder to the usual largest-first convention,
# keeping each eigenvector column paired with its eigenvalue.
_descendingOrder = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[_descendingOrder]
eigenvectors = eigenvectors[:, _descendingOrder]

# Round-off can push a zero eigenvalue slightly below zero; clip so sqrt never yields NaN.
singular_values = np.sqrt(np.clip(eigenvalues, 0.0, None))

# Diagonal matrix holding the singular values.
Sigma = np.diag(singular_values)
Sigma