In [1]:
import os
import re
import numpy as np
import pandas as pd
import math

In [2]:
def open_file(directory):
  documents = []
  for filenames in os.listdir(directory):
    if filenames.endswith(".txt"):
      with open(os.path.join(directory, filenames), 'r') as f:
        documents.append(f.read())
  return documents

In [3]:
doc_files = open_file('/content/ebooks')

In [4]:
doc_files

 '\ufeffThe Project Gutenberg eBook of The Yellow Wallpaper\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: The Yellow Wallpaper\n\nAuthor: Charlotte Perkins Gilman\n\nRelease date: November 1, 1999 [eBook #1952]\n                Most recently updated: August 31, 2024\n\nLanguage: English\n\nCredits: An Anonymous Volunteer and David Widger\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK THE YELLOW WALLPAPER ***\n\n\n\n\nThe Yellow Wallpaper\n\nBy Charlotte Perkins Gilman\n\n\nIt is very seldom that mere ordinary people like John and myself secure\nancestral halls for the summer

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [7]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [8]:
preprocessed_docs = [preprocess(doc) for doc in doc_files]

In [9]:
[preprocessed_docs[0]]

[['project',
  'gutenberg',
  'ebook',
  'adventure',
  'tom',
  'sawyer',
  'complete',
  'ebook',
  'use',
  'anyone',
  'anywhere',
  'united',
  'state',
  'part',
  'world',
  'cost',
  'almost',
  'restriction',
  'whatsoever',
  'may',
  'copy',
  'give',
  'away',
  'reuse',
  'term',
  'project',
  'gutenberg',
  'license',
  'included',
  'ebook',
  'online',
  'wwwgutenbergorg',
  'located',
  'united',
  'state',
  'check',
  'law',
  'country',
  'located',
  'using',
  'ebook',
  'title',
  'adventure',
  'tom',
  'sawyer',
  'complete',
  'author',
  'mark',
  'twain',
  'release',
  'date',
  'july',
  'ebook',
  'recently',
  'updated',
  'august',
  'language',
  'english',
  'credit',
  'david',
  'widger',
  'start',
  'project',
  'gutenberg',
  'ebook',
  'adventure',
  'tom',
  'sawyer',
  'complete',
  'adventure',
  'tom',
  'sawyer',
  'mark',
  'twain',
  'samuel',
  'langhorne',
  'clemens',
  'content',
  'chapter',
  'youu',
  'tomaunt',
  'polly',
  'deci

In [10]:
for i, tokens in enumerate(preprocessed_docs):
    print(f"Tokenized Document {i + 1}: {tokens}")

Tokenized Document 2: ['project', 'gutenberg', 'ebook', 'yellow', 'wallpaper', 'ebook', 'use', 'anyone', 'anywhere', 'united', 'state', 'part', 'world', 'cost', 'almost', 'restriction', 'whatsoever', 'may', 'copy', 'give', 'away', 'reuse', 'term', 'project', 'gutenberg', 'license', 'included', 'ebook', 'online', 'wwwgutenbergorg', 'located', 'united', 'state', 'check', 'law', 'country', 'located', 'using', 'ebook', 'title', 'yellow', 'wallpaper', 'author', 'charlotte', 'perkins', 'gilman', 'release', 'date', 'november', 'ebook', 'recently', 'updated', 'august', 'language', 'english', 'credit', 'anonymous', 'volunteer', 'david', 'widger', 'start', 'project', 'gutenberg', 'ebook', 'yellow', 'wallpaper', 'yellow', 'wallpaper', 'charlotte', 'perkins', 'gilman', 'seldom', 'mere', 'ordinary', 'people', 'like', 'john', 'secure', 'ancestral', 'hall', 'summer', 'colonial', 'mansion', 'hereditary', 'estate', 'would', 'say', 'haunted', 'house', 'reach', 'height', 'romantic', 'felicitybut', 'would

In [11]:
def term_frequency(term, document):
    return document.count(term) / len(document)

In [12]:
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

In [13]:
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

In [14]:
vocab = set([word for doc in preprocessed_docs for word in doc])
vocab = sorted(vocab)

In [15]:
vocab

['aaa',
 'aadead',
 'abandoned',
 'abash',
 'abashed',
 'abate',
 'abbey',
 'abed',
 'abel',
 'abhorred',
 'abhors',
 'abide',
 'abject',
 'ablaze',
 'able',
 'ableeding',
 'ableto',
 'aboard',
 'abode',
 'abounding',
 'aboundst',
 'aboutthat',
 'aboutthen',
 'abraham',
 'abram',
 'abreast',
 'abroach',
 'abroad',
 'absent',
 'absently',
 'absentminded',
 'absentmindedness',
 'absolutely',
 'absolvd',
 'absorb',
 'absorbed',
 'absorbing',
 'abstain',
 'absurd',
 'absurdthe',
 'abundance',
 'abundant',
 'abusd',
 'abuse',
 'abused',
 'abusing',
 'abyss',
 'academy',
 'accent',
 'accept',
 'accepted',
 'accepting',
 'access',
 'accessed',
 'accessible',
 'accident',
 'accidental',
 'accidentally',
 'accommodate',
 'accompanied',
 'accompanying',
 'accomplish',
 'accomplished',
 'accomplishing',
 'accord',
 'accordance',
 'according',
 'accosted',
 'account',
 'accoutrement',
 'accuracy',
 'accursd',
 'accused',
 'accusing',
 'accustomd',
 'accustomed',
 'ache',
 'ached',
 'achieve',
 'ac

In [16]:
preprocessed_docs


[['project',
  'gutenberg',
  'ebook',
  'adventure',
  'tom',
  'sawyer',
  'complete',
  'ebook',
  'use',
  'anyone',
  'anywhere',
  'united',
  'state',
  'part',
  'world',
  'cost',
  'almost',
  'restriction',
  'whatsoever',
  'may',
  'copy',
  'give',
  'away',
  'reuse',
  'term',
  'project',
  'gutenberg',
  'license',
  'included',
  'ebook',
  'online',
  'wwwgutenbergorg',
  'located',
  'united',
  'state',
  'check',
  'law',
  'country',
  'located',
  'using',
  'ebook',
  'title',
  'adventure',
  'tom',
  'sawyer',
  'complete',
  'author',
  'mark',
  'twain',
  'release',
  'date',
  'july',
  'ebook',
  'recently',
  'updated',
  'august',
  'language',
  'english',
  'credit',
  'david',
  'widger',
  'start',
  'project',
  'gutenberg',
  'ebook',
  'adventure',
  'tom',
  'sawyer',
  'complete',
  'adventure',
  'tom',
  'sawyer',
  'mark',
  'twain',
  'samuel',
  'langhorne',
  'clemens',
  'content',
  'chapter',
  'youu',
  'tomaunt',
  'polly',
  'deci

In [17]:
doc_tfidf_vectors = [compute_tfidf(doc, preprocessed_docs, vocab) for doc in preprocessed_docs]


In [18]:
doc_tfidf_vectors

[array([0.00000000e+00, 2.44415890e-05, 9.77663562e-05, ...,
        2.44415890e-05, 0.00000000e+00, 0.00000000e+00]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00011388]),
 array([7.21716077e-05, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 2.16514823e-04, 0.00000000e+00])]

In [21]:
queries = ["winniethepooh went", "big sky","Enter Sampson and Gregory armed with swords and bucklers","There is a delicious garden!"]

tokenized_queries = [preprocess(query) for query in queries]

query_tfidf_vectors = [compute_tfidf(query, preprocessed_docs, vocab) for query in tokenized_queries]

# for i, vec in enumerate(query_tfidf_vectors):
#     print(f"TF-IDF Vector for Query '{queries[i]}': {vec}")

In [19]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)


In [22]:
cosine_similarities = []
for query_vector in query_tfidf_vectors:
    similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
    cosine_similarities.append(similarities)


In [23]:
cosine_similarities

[[0.004716332320102101,
  0.0012944850630437184,
  0.004603374290445432,
  7.779144281496628e-05,
  0.12637618059719216],
 [0.0035486074671462536,
  0.008393891419000333,
  0.0019383919940400278,
  0.0005044267739798153,
  0.009328321063254322],
 [0.0008394896061305753, 0.0, 7.659646362403322e-05, 0.04701385166774868, 0.0],
 [0.0010563782802165556,
  0.0040591971400530035,
  0.0011548069135959456,
  0.0,
  0.0]]

In [24]:
cosine_similarities = []
for query_vector in query_tfidf_vectors:
    similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
    cosine_similarities.append(similarities)

for i, query in enumerate(queries):
    print(f"\nCosine similarities for query '{query}':")
    if i < len(cosine_similarities):
        for j, doc in enumerate(doc_files):
            if j < len(cosine_similarities[i]):
                print(f"Document {j + 1}: {cosine_similarities[i][j]:.4f}")
            else:
                print(f"Document {j + 1}: No similarity calculated")
    else:
        print("No similarities calculated for this query.")


Cosine similarities for query 'winniethepooh went':
Document 1: 0.0047
Document 2: 0.0013
Document 3: 0.0046
Document 4: 0.0001
Document 5: 0.1264

Cosine similarities for query 'big sky':
Document 1: 0.0035
Document 2: 0.0084
Document 3: 0.0019
Document 4: 0.0005
Document 5: 0.0093

Cosine similarities for query 'Enter Sampson and Gregory armed with swords and bucklers':
Document 1: 0.0008
Document 2: 0.0000
Document 3: 0.0001
Document 4: 0.0470
Document 5: 0.0000

Cosine similarities for query 'There is a delicious garden!':
Document 1: 0.0011
Document 2: 0.0041
Document 3: 0.0012
Document 4: 0.0000
Document 5: 0.0000


In [25]:
for i, query in enumerate(queries):
    print(f"\nCosine similarities for query '{query}':")
    for j, doc in enumerate(doc_files):
        print(f"Document {j + 1}: {cosine_similarities[i][j]:.4f}")


Cosine similarities for query 'winniethepooh went':
Document 1: 0.0047
Document 2: 0.0013
Document 3: 0.0046
Document 4: 0.0001
Document 5: 0.1264

Cosine similarities for query 'big sky':
Document 1: 0.0035
Document 2: 0.0084
Document 3: 0.0019
Document 4: 0.0005
Document 5: 0.0093

Cosine similarities for query 'Enter Sampson and Gregory armed with swords and bucklers':
Document 1: 0.0008
Document 2: 0.0000
Document 3: 0.0001
Document 4: 0.0470
Document 5: 0.0000

Cosine similarities for query 'There is a delicious garden!':
Document 1: 0.0011
Document 2: 0.0041
Document 3: 0.0012
Document 4: 0.0000
Document 5: 0.0000


In [26]:
# output_file = 'Kshitiz_Shah'
# with open(output_file, 'w') as w:
#     w.write(f"\nCosine similarities for query '{query}':")
#     for j, doc in enumerate(doc_files):
#         w.write(f"Document {j + 1}: {cosine_similarities[i][j]:.4f}")
