Krishna Sharma | AP22110010128

In [1]:
import math
import re
from collections import Counter

In [2]:
# Load the corpus: each line of space.txt becomes one raw document
# (line endings are still attached at this stage).
with open('space.txt', 'r', encoding='utf-8') as corpus_file:
    documents = list(corpus_file)

In [3]:
# Normalise every raw document: lowercase it, delete every character that is
# neither an ASCII letter nor whitespace, then trim surrounding whitespace.
# Lines that end up empty are kept as empty strings, so indices in
# cleaned_docs still line up with indices in documents.
cleaned_docs = [
    re.sub(r'[^a-zA-Z\s]', '', raw_doc.lower()).strip()
    for raw_doc in documents
]

In [4]:
# Sanity check: report the corpus size and show the start of the first document.
print(f"Number of documents: {len(cleaned_docs)}")
if cleaned_docs:
    print(f"First document preview: {cleaned_docs[0][:100]}...")
else:
    # An empty input file yields no documents; indexing [0] would raise IndexError.
    print("First document preview: <no documents loaded>")

Number of documents: 475
First document preview: archivename spaceacronyms...


VECTOR SPACE MODEL IMPLEMENTATION

In [5]:
# The vocabulary is the alphabetically sorted list of distinct
# whitespace-separated tokens found anywhere in the cleaned corpus.
# A term's position in this list is its column index in every matrix below.
vocabulary = sorted({token for doc in cleaned_docs for token in doc.split()})
print(f"Vocabulary size: {len(vocabulary)}")

Vocabulary size: 1216


In [6]:
# Term-frequency matrix: tf_matrix[d][t] is the number of occurrences of
# vocabulary[t] in document d divided by the number of tokens in document d.
# Documents with no tokens get an all-zero row.
tf_matrix = []
for doc in cleaned_docs:
    words = doc.split()
    word_count = len(words)
    # Count every token once up front instead of rescanning the document
    # with list.count() for each vocabulary term.
    term_counts = Counter(words)
    tf_matrix.append([
        term_counts[term] / word_count if word_count > 0 else 0
        for term in vocabulary
    ])

print(f"TF matrix created with shape: {len(tf_matrix)} x {len(vocabulary)}")

TF matrix created with shape: 475 x 1216


In [9]:
# Document frequency and inverse document frequency, aligned with vocabulary:
#   df_vector[t]  = number of documents containing vocabulary[t] at least once
#   idf_vector[t] = ln(total_docs / df_vector[t]), or 0 when the term occurs nowhere
total_docs = len(cleaned_docs)

# Tokenise each document once and count each distinct token once per
# document, rather than re-splitting every document for every term.
doc_frequency = Counter()
for doc in cleaned_docs:
    doc_frequency.update(set(doc.split()))

df_vector = [doc_frequency[term] for term in vocabulary]

idf_vector = [
    math.log(total_docs / df) if df > 0 else 0
    for df in df_vector
]

print("IDF calculated for all terms")

IDF calculated for all terms


In [12]:
# TF-IDF weighting: scale each document's term frequencies column-wise by the
# matching inverse document frequency.
tfidf_matrix = [
    [tf_weight * idf_weight for tf_weight, idf_weight in zip(tf_row, idf_vector)]
    for tf_row in tf_matrix
]

In [13]:
# Apply the same normalisation to the query as to the documents
# (lowercase, keep only ASCII letters and whitespace), then tokenise it.
raw_query = "space exploration mars mission"
query = re.sub(r'[^a-zA-Z\s]', '', raw_query.lower())
query_words = query.split()

# Binary query vector over the vocabulary: 1 where the term appears in the
# query, 0 elsewhere (the query side is not TF-IDF weighted).
query_vector = [int(term in query_words) for term in vocabulary]

In [14]:
# Cosine similarity between the query vector and every document's TF-IDF
# vector: cos = (q . d) / (|q| * |d|). A similarity of 0 is recorded when
# either vector has zero length.

# The query norm does not depend on the document, so compute it once.
query_magnitude = math.sqrt(sum(weight ** 2 for weight in query_vector))

# Only positions where the query is non-zero can contribute to the dot product.
query_positions = [i for i, weight in enumerate(query_vector) if weight != 0]

cosine_similarities = []
for doc_vector in tfidf_matrix:
    dot_product = sum(query_vector[i] * doc_vector[i] for i in query_positions)
    doc_magnitude = math.sqrt(sum(weight ** 2 for weight in doc_vector))

    if query_magnitude > 0 and doc_magnitude > 0:
        cosine_sim = dot_product / (query_magnitude * doc_magnitude)
    else:
        cosine_sim = 0

    cosine_similarities.append(cosine_sim)

In [15]:
# Rank documents by cosine similarity, best first, as (doc_index, similarity)
# pairs; documents with equal similarity keep their original relative order.
doc_similarities = sorted(
    enumerate(cosine_similarities),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the five best matches (fewer if the corpus is smaller), numbering
# documents from 1.
for doc_idx, similarity in doc_similarities[:5]:
    print(f"Document {doc_idx + 1}: Similarity = {similarity:.4f}")
    print(f"Content: {cleaned_docs[doc_idx][:100]}...")
    print()

Document 333: Similarity = 0.4201
Content: sei space exploration initiative...

Document 248: Similarity = 0.3257
Content: mrsrm mars rover and sample return mission...

Document 241: Similarity = 0.2834
Content: moc mars observer camera on mars observer...

Document 243: Similarity = 0.2535
Content: mola mars observer laser altimeter on mars observer...

Document 235: Similarity = 0.2087
Content: mcc mission control center...



BINARY INDEPENDENCE MODEL

In [16]:
# Binary incidence matrix: binary_matrix[d][t] is 1 when vocabulary[t] occurs
# in document d at least once, otherwise 0.
binary_matrix = []
for doc in cleaned_docs:
    doc_terms = set(doc.split())
    binary_matrix.append([1 if term in doc_terms else 0 for term in vocabulary])

print(f"Binary matrix created with shape: {len(binary_matrix)} x {len(vocabulary)}")

Binary matrix created with shape: 475 x 1216


In [17]:
# One fifth of the corpus size, at least 1. Not referenced by any other
# visible cell.
relevant_threshold = max(1, len(cleaned_docs) // 5)

# Heuristic per-term probability estimates, aligned with vocabulary. With
# f = fraction of documents containing the term:
#   p_term_relevant[t]     = f + 0.1, clamped to the range [0.1, 0.9]
#   p_term_not_relevant[t] = f - 0.1, but never below 0.1
# NOTE(review): for any term found in at most roughly 20% of the documents,
# p_term_not_relevant stays at 0.1 while p_term_relevant rises with f, so the
# odds ratio built from these values grows with how common the term is.
# Textbook BIM weighting usually favours rarer terms - confirm this is intended.
p_term_relevant = []
p_term_not_relevant = []

for term_column in zip(*binary_matrix):
    doc_fraction = sum(term_column) / total_docs
    p_term_relevant.append(min(0.9, max(0.1, doc_fraction + 0.1)))
    p_term_not_relevant.append(max(0.1, doc_fraction - 0.1))

In [18]:
# Binary query vector for the BIM: 1 at the position of every vocabulary term
# that appears in the query, 0 everywhere else.
query_binary = [1 if term in query_words else 0 for term in vocabulary]

In [19]:
# BIM retrieval score: for every query term that a document contains, add
#   ln( p_rel * (1 - p_not_rel) / (p_not_rel * (1 - p_rel)) )
# Terms outside the query, or absent from the document, contribute nothing.

# The weight of a term does not depend on the document, so compute it once
# per query term instead of once per (document, term) pair. Keys are
# vocabulary indices, inserted in ascending order so the per-document
# summation order matches a left-to-right scan of the vocabulary.
query_term_weights = {}
for i, in_query in enumerate(query_binary):
    if in_query != 1:
        continue
    p_rel = p_term_relevant[i]
    p_not_rel = p_term_not_relevant[i]

    # Skip terms whose odds ratio would be undefined or non-positive.
    if p_not_rel > 0 and p_rel < 1:
        odds_ratio = (p_rel * (1 - p_not_rel)) / (p_not_rel * (1 - p_rel))
        if odds_ratio > 0:
            query_term_weights[i] = math.log(odds_ratio)

bim_scores = []
for doc_vector in binary_matrix:
    score = 0
    for i, weight in query_term_weights.items():
        if doc_vector[i] == 1:
            score += weight
    bim_scores.append(score)

In [20]:
# Rank documents by BIM score, highest first, as (doc_index, score) pairs;
# documents with equal scores keep their original relative order.
doc_bim_scores = sorted(
    enumerate(bim_scores),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the five best matches (fewer if the corpus is smaller), numbering
# documents from 1.
for doc_idx, score in doc_bim_scores[:5]:
    print(f"Document {doc_idx + 1}: BIM Score = {score:.4f}")
    print(f"Content: {cleaned_docs[doc_idx][:100]}...")

Document 333: BIM Score = 0.6753
Content: sei space exploration initiative...
Document 13: BIM Score = 0.6521
Content: every acronym i ever saw id soon run out of disk space...
Document 30: BIM Score = 0.6521
Content: reference publication  revised space transportation system and...
Document 69: BIM Score = 0.6521
Content: awst aviation week and space technology aka avleak...
Document 76: BIM Score = 0.6521
Content: bnsc british national space centre...


Krishna Sharma | AP22110010128