In [1]:
import os
import re

def extract_text(file):
    """Replace a document file's contents with its title and body text.

    The file is read as bytes and decoded as UTF-8 (undecodable bytes are
    dropped). The first ``<TITLE>...</TITLE>`` and ``<TEXT>...</TEXT>``
    sections are located; when both exist the file is overwritten in place
    with ``title + ' ' + text``. When either section is missing the file is
    left untouched.

    Args:
        file: Path of the file to rewrite in place.

    Returns:
        None.
    """
    with open(file, 'rb') as f:
        raw = f.read().decode('utf-8', errors='ignore')

    title_match = re.search(r'<TITLE>(.*?)</TITLE>', raw, re.DOTALL)
    text_match = re.search(r'<TEXT>(.*?)</TEXT>', raw, re.DOTALL)
    if not (title_match and text_match):
        return

    final_text = title_match.group(1) + ' ' + text_match.group(1)

    # Write with the same codec the data was decoded with. Relying on the
    # platform default can raise UnicodeEncodeError *after* the file has been
    # truncated. newline='' disables newline translation, so line endings that
    # were read in binary mode (e.g. '\r\n') are written back unchanged instead
    # of becoming '\r\r\n' on Windows.
    with open(file, 'w', encoding='utf-8', newline='') as f:
        f.write(final_text)

# Folder that holds the Cranfield document files.
dataset_path = 'C:/Users/kabir/IIITD/IR/CSE508_Winter2023_Dataset/CSE508_Winter2023_Dataset'


def _read_decoded(path):
    """Return the file's bytes decoded as UTF-8, dropping undecodable bytes."""
    with open(path, 'rb') as handle:
        return handle.read().decode('utf-8', errors='ignore')


# Show five sample files before and after extraction.
sample_files = [
    'cranfield0001',
    'cranfield0002',
    'cranfield0003',
    'cranfield0004',
    'cranfield0005',
]
for filename in sample_files:
    filepath = os.path.join(dataset_path, filename)
    original_text = _read_decoded(filepath)
    extract_text(filepath)
    modified_text = _read_decoded(filepath)
    print(
        f'Reading contents of file {filename} before extraction:',
        original_text,
        f'Reading contents of file {filename} after extraction:',
        modified_text,
        '---',
        sep='\n',
    )

# Extract every remaining document; the samples were already handled above.
for filename in os.listdir(dataset_path):
    is_document = filename.startswith('cranfield') and not filename.endswith('.sgm')
    if not is_document or filename in sample_files:
        continue
    filepath = os.path.join(dataset_path, filename)
    extract_text(filepath)

Reading contents of file cranfield0001 before extraction:
<DOC>
<DOCNO>
1
</DOCNO>
<TITLE>
experimental investigation of the aerodynamics of a
wing in a slipstream .
</TITLE>
<AUTHOR>
brenckman,m.
</AUTHOR>
<BIBLIO>
j. ae. scs. 25, 1958, 324.
</BIBLIO>
<TEXT>
  an experimental study of a wing in a propeller slipstream was
made in order to determine the spanwise distribution of the lift
increase due to slipstream at different angles of attack of the wing
and at different free stream to slipstream velocity ratios .  the
results were intended in part as an evaluation basis for different
theoretical treatments of this problem .
  the comparative span loading curves, together with supporting
evidence, showed that a substantial part of the lift increment
produced by the slipstream was due to a /destalling/ or boundary-layer-control
effect .  the integrated remaining lift increment,
after subtracting this destalling lift, was found to agree
well with a potential flow theory .
  an empirical e

In [29]:
import os
import re
import string
import nltk
from stop_words import get_stop_words

# Folder containing the Cranfield document files. The code below joins it with
# each file name to build the path of the file that is read and preprocessed
# in place.
dataset_path = 'C:/Users/kabir/IIITD/IR/CSE508_Winter2023_Dataset/CSE508_Winter2023_Dataset'

# function to perform all preprocessing steps
def preprocess_text(file):
    """Preprocess one document file in place and return the cleaned text.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, drop English stop words, drop tokens made up only
    of punctuation characters, and drop whitespace-only tokens. The surviving
    tokens are joined with single spaces, written back to ``file``
    (overwriting it) and returned.

    Args:
        file: Path of the text file to preprocess.

    Returns:
        str: The space-joined preprocessed tokens.
    """
    with open(file, 'r') as f:
        text = f.read()

    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # A set gives constant-time membership tests for the stop-word filter.
    stop_words = set(get_stop_words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # string.punctuation is a str, so `token in string.punctuation` would be a
    # substring test: multi-character punctuation tokens such as "''", "``"
    # or "--" are not substrings of it and would be kept. Checking every
    # character against a set removes any token made up solely of punctuation.
    punctuation_chars = set(string.punctuation)
    tokens = [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

    # Remove tokens that are empty once surrounding whitespace is stripped.
    tokens = [token for token in tokens if token.strip()]

    processed_text = ' '.join(tokens)

    # The file is written exactly once, after every step has succeeded, so an
    # error during tokenization leaves the file on disk untouched.
    with open(file, 'w') as f:
        f.write(processed_text)

    return processed_text

def _show_step(filename, step, before, after):
    """Print a value before and after one preprocessing step, then a separator."""
    print(f'Reading contents of file {filename} before {step}:')
    print(before)
    print(f'Reading contents of file {filename} after {step}:')
    print(after)
    print('---')


# Print contents of 5 sample files before and after each preprocessing step
sample_files = ['cranfield0001', 'cranfield0002', 'cranfield0003', 'cranfield0004', 'cranfield0005']

# Build the lookup sets once instead of once per token inside the loops below.
english_stop_words = set(get_stop_words('english'))
punctuation_chars = set(string.punctuation)

for filename in sample_files:
    filepath = os.path.join(dataset_path, filename)
    with open(filepath, 'r') as f:
        original_text = f.read()

    lowercase_text = original_text.lower()
    _show_step(filename, 'lowercase', original_text, lowercase_text)

    tokenized_text = nltk.word_tokenize(lowercase_text)
    _show_step(filename, 'tokenization', lowercase_text, tokenized_text)

    stopword_filtered_text = [token for token in tokenized_text if token not in english_stop_words]
    _show_step(filename, 'stopword removal', tokenized_text, stopword_filtered_text)

    # string.punctuation is a str, so `token in string.punctuation` would be a
    # substring test that keeps multi-character punctuation tokens such as
    # "''", "``" or "--". Testing each character against a set drops every
    # token that consists solely of punctuation.
    punctuation_filtered_text = [
        token for token in stopword_filtered_text
        if not all(char in punctuation_chars for char in token)
    ]
    _show_step(filename, 'punctuation removal', stopword_filtered_text, punctuation_filtered_text)

    blank_space_filtered_text = [token for token in punctuation_filtered_text if token.strip()]
    _show_step(filename, 'blank space removal', punctuation_filtered_text, blank_space_filtered_text)

# Preprocess all files in the dataset
for filename in os.listdir(dataset_path):
    if filename.startswith('cranfield') and not filename.endswith('.sgm'):
        filepath = os.path.join(dataset_path, filename)
        preprocess_text(filepath)

Reading contents of file cranfield0001 before lowercase:

experimental investigation of the aerodynamics of a
wing in a slipstream .
 
  an experimental study of a wing in a propeller slipstream was
made in order to determine the spanwise distribution of the lift
increase due to slipstream at different angles of attack of the wing
and at different free stream to slipstream velocity ratios .  the
results were intended in part as an evaluation basis for different
theoretical treatments of this problem .
  the comparative span loading curves, together with supporting
evidence, showed that a substantial part of the lift increment
produced by the slipstream was due to a /destalling/ or boundary-layer-control
effect .  the integrated remaining lift increment,
after subtracting this destalling lift, was found to agree
well with a potential flow theory .
  an empirical evaluation of the destalling effects was made for
the specific configuration of the experiment .

Reading contents of file cra

In [30]:
import os
import numpy as np
from collections import defaultdict
from math import log

# Folder that holds the preprocessed document files.
dataset_path = 'C:/Users/kabir/IIITD/IR/CSE508_Winter2023_Dataset/CSE508_Winter2023_Dataset'

# Step 2: Create a matrix of size no. of documents x vocab size.
vocab = set()
doc_freq = defaultdict(int)   # term -> number of documents that contain it
doc_term_freq = {}            # file name -> {term: occurrences in that file}
for filename in os.listdir(dataset_path):
    if not filename.startswith('cranfield') or filename.endswith('.sgm'):
        continue
    filepath = os.path.join(dataset_path, filename)
    with open(filepath, 'r') as f:
        text = f.read()
    tokens = text.split()

    # Count how often each whitespace-separated token occurs in this file.
    term_freq = defaultdict(int)
    for token in tokens:
        term_freq[token] += 1

    vocab.update(term_freq)
    doc_term_freq[filename] = term_freq

    # Each distinct term of the file contributes one to its document frequency.
    for token in term_freq:
        doc_freq[token] += 1

# Sorted lists fix the column (term) and row (document) order of the matrix.
vocab = sorted(vocab)
doc_ids = sorted(doc_term_freq)
num_docs = len(doc_ids)
vocab_size = len(vocab)
tf_idf_matrix = np.zeros((num_docs, vocab_size))

# Step 3: Fill in the tf-idf values for each term in the vocabulary in the matrix.
# Map every term to its column once, then visit only the terms each document
# actually contains. All other cells keep the 0 the matrix was created with,
# so the result matches a full vocabulary scan at a fraction of the work.
term_to_column = {term: column for column, term in enumerate(vocab)}
for i, doc_id in enumerate(doc_ids):
    term_freq = doc_term_freq[doc_id]
    for term, tf in term_freq.items():
        # Log-scaled term frequency; zero when the count is not positive.
        tf_weight = 1 + log(tf) if tf > 0 else 0
        # The +1 in the denominator smooths the inverse document frequency.
        idf = log(num_docs / (doc_freq[term] + 1))
        tf_idf_matrix[i, term_to_column[term]] = tf_weight * idf

# Step 4: Construct the query vector of size vocab.
query = input("Enter query: ")
# The documents are lowercased during preprocessing, so the query is lowercased
# as well; otherwise a capitalised query term could never match the vocabulary.
query_tokens = query.lower().split()
# Dictionary lookup replaces the linear `in` / `.index()` scans of the vocab list.
term_to_column = {term: column for column, term in enumerate(vocab)}
query_vector = np.zeros((1, vocab_size))
for term in query_tokens:
    j = term_to_column.get(term)
    if j is not None:
        # Repeated query terms accumulate, giving raw term counts.
        query_vector[0, j] += 1

# Step 5: Compute the TF-IDF score for the query using the TF-IDF matrix. Report the top 5 relevant documents based on the score.
# Broadcasting the (1, vocab) query row over the matrix weights every document
# row by the query counts; summing each row gives one score per document.
query_tf_idf = query_vector * tf_idf_matrix
scores = query_tf_idf.sum(axis=1)
# Ascending argsort reversed -> highest score first; keep the first five.
top_docs_indices = scores.argsort()[::-1][:5]
print("Top 5 relevant documents:")
for i, doc_index in enumerate(top_docs_indices):
    print(f"{i+1}. {doc_ids[doc_index]} (score: {scores[doc_index]})")

# Step 6: Use all 5 weighting schemes for term frequency calculation and report the TF-IDF score and results for each scheme separately.
tf_weights = ['binary', 'raw_count', 'term_frequency', 'log_normalization', 'double_normalization']
# Map every term to its matrix column once, so only the terms a document
# actually contains need to be visited.
term_to_column = {term: column for column, term in enumerate(vocab)}
for tf_weight in tf_weights:
    print(f"\nTF weighting scheme: {tf_weight}")
    tf_idf_matrix = np.zeros((num_docs, vocab_size))
    for i, doc_id in enumerate(doc_ids):
        term_freq = doc_term_freq[doc_id]
        if not term_freq:
            # An empty document has no terms to weight; its row stays all zeros.
            continue
        # Per-document totals are the same for every term, so compute them once.
        total_terms = sum(term_freq.values())
        max_freq = max(term_freq.values())
        for term, tf in term_freq.items():
            if tf_weight == 'binary':
                tf_weight_val = 1 if tf > 0 else 0
            elif tf_weight == 'raw_count':
                tf_weight_val = tf
            elif tf_weight == 'term_frequency':
                tf_weight_val = tf / total_terms
            elif tf_weight == 'log_normalization':
                tf_weight_val = log(1 + tf)
            elif tf_weight == 'double_normalization':
                tf_weight_val = 0.5 + 0.5 * (tf / max_freq)
            else:
                # Fail loudly rather than silently reusing a stale weight.
                raise ValueError(f"Unknown TF weighting scheme: {tf_weight}")
            idf = log(num_docs / (doc_freq[term] + 1))
            tf_idf_matrix[i, term_to_column[term]] = tf_weight_val * idf

    # Score every document against the query and list the five best.
    query_tf_idf = np.multiply(query_vector, tf_idf_matrix)
    scores = np.sum(query_tf_idf, axis=1)
    top_docs_indices = np.argsort(scores)[::-1][:5]
    print("Top 5 relevant documents:")
    for i, doc_index in enumerate(top_docs_indices):
        print(f"{i+1}. {doc_ids[doc_index]} (score: {scores[doc_index]})")

Top 5 relevant documents:
1. cranfield0185 (score: 8.545318795204752)
2. cranfield0033 (score: 6.703981795470231)
3. cranfield0225 (score: 6.703981795470231)
4. cranfield0216 (score: 5.297733526376235)
5. cranfield1054 (score: 5.047002938267131)

TF weighting scheme: binary
Top 5 relevant documents:
1. cranfield0033 (score: 6.703981795470231)
2. cranfield0225 (score: 6.703981795470231)
3. cranfield1313 (score: 5.047002938267131)
4. cranfield0244 (score: 5.047002938267131)
5. cranfield1054 (score: 5.047002938267131)

TF weighting scheme: raw_count
Top 5 relevant documents:
1. cranfield0216 (score: 14.912809714827905)
2. cranfield0185 (score: 10.094005876534261)
3. cranfield0124 (score: 9.941873143218604)
4. cranfield0426 (score: 9.941873143218604)
5. cranfield1271 (score: 8.284894286015502)

TF weighting scheme: term_frequency
Top 5 relevant documents:
1. cranfield0031 (score: 0.1440851180176609)
2. cranfield0920 (score: 0.1380815714335917)
3. cranfield0041 (score: 0.12427341429023253)


In [32]:
# Step 4: Create sets of the document and query tokens, compute intersection and union for each document and the query, and calculate Jaccard coefficient for each document.
query_set = set(query_tokens)
jaccard_scores = []
for i, doc_id in enumerate(doc_ids):
    doc_set = set(doc_term_freq[doc_id].keys())
    intersection = len(query_set & doc_set)
    union = len(query_set | doc_set)
    # When both the query and the document are empty the union is empty too;
    # score that case as 0.0 instead of raising ZeroDivisionError.
    jaccard_coefficient = intersection / union if union else 0.0
    jaccard_scores.append((doc_id, jaccard_coefficient))

# Step 5: Sort the documents by Jaccard coefficient and present the top 10 documents.
# sorted() is stable, so documents with equal scores keep their doc_ids order.
jaccard_scores = sorted(jaccard_scores, key=lambda x: x[1], reverse=True)
print("Top 10 relevant documents based on Jaccard coefficient:")
for i, (doc_id, jaccard_coefficient) in enumerate(jaccard_scores[:10]):
    print(f"{i+1}. {doc_id} (Jaccard coefficient: {jaccard_coefficient})")

Top 10 relevant documents based on Jaccard coefficient:
1. cranfield0031 (Jaccard coefficient: 0.04)
2. cranfield0920 (Jaccard coefficient: 0.03333333333333333)
3. cranfield0429 (Jaccard coefficient: 0.03225806451612903)
4. cranfield0774 (Jaccard coefficient: 0.03225806451612903)
5. cranfield1306 (Jaccard coefficient: 0.029411764705882353)
6. cranfield0430 (Jaccard coefficient: 0.027777777777777776)
7. cranfield0301 (Jaccard coefficient: 0.02702702702702703)
8. cranfield0512 (Jaccard coefficient: 0.02702702702702703)
9. cranfield1266 (Jaccard coefficient: 0.02702702702702703)
10. cranfield0175 (Jaccard coefficient: 0.02631578947368421)
