In [4]:
from docx import Document

In [6]:
def read_word_files(file_paths):
    """Load the paragraph text of each Word document in ``file_paths``.

    Every document that opens successfully contributes one string (its
    paragraph texts joined by newlines) to the result, in input order.
    A file that raises while being read is reported on stdout and
    skipped, so the returned list can be shorter than ``file_paths``.
    """
    documents_text = []
    for file_path in file_paths:
        try:
            paragraphs = Document(file_path).paragraphs
            joined = '\n'.join(paragraph.text for paragraph in paragraphs)
        except Exception as e:
            # Best-effort load: report the failure and move on to the next file.
            print(f"Error reading the Word file '{file_path}': {e}")
        else:
            documents_text.append(joined)
    return documents_text

# Word documents to load, as absolute paths on a local Windows drive.
# NOTE(review): these only resolve on the original author's machine.
word_file_paths = [
    r"D:\code\NLP\doc\Doc1.docx",
    r"D:\code\NLP\doc\Doc 2.docx",
    r"D:\code\NLP\doc\Doc 3.docx",
    r"D:\code\NLP\doc\Doc 4.docx",
    r"D:\code\NLP\doc\Doc 5.docx",
    r"D:\code\NLP\doc\Doc 6.docx",
]
contents = read_word_files(word_file_paths)

In [8]:
# Print the text of each loaded document under a 1-based header.
# A falsy ``contents`` prints nothing.
for i, content in enumerate(contents or (), start=1):
    print(f"Content of file {i}:", content, sep="\n")

Content of file 1:
Formula One, commonly known as Formula 1 or F1, is the highest class of international racing for open-wheel single-seater formula racing cars sanctioned by the Fédération Internationale de l'Automobile (FIA). The FIA Formula One World Championship has been one of the premier forms of racing around the world since its inaugural running in 1950. The word formula in the name refers to the set of rules to which all participants' cars must conform. A Formula One season consists of a series of races, known as Grands Prix. Grands Prix take place in multiple countries and continents around the world on either purpose-built circuits or closed public roads.

A point-system is used at Grands Prix to determine two annual World Championships: one for the drivers, and one for the constructors (the teams). Each driver must hold a valid Super Licence, the highest class of racing licence issued by the FIA, and the races must be held on grade one tracks, the highest grade-rating issue

In [9]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [13]:
def tokenize_documents(contents):
    """Tokenize each document string into words and sentences.

    Returns a list with one ``(words, sentences)`` tuple per entry of
    ``contents``, in input order.  ``words`` comes from ``word_tokenize``
    and ``sentences`` from ``sent_tokenize``, both applied to the same
    document text.
    """
    return [
        (word_tokenize(doc_content), sent_tokenize(doc_content))
        for doc_content in contents
    ]

# Tokenize every loaded document; each entry is a (words, sentences) tuple.
tokenized_docs = tokenize_documents(contents)

In [14]:
# Print the word tokens and the sentence tokens of each document, numbering
# documents from 1 in the order they appear in tokenized_docs.
for i, (words, sentences) in enumerate(tokenized_docs, start=1):
    print(f"\nTokens for Document {i}:")
    print("Words:", words)
    print("Sentences:", sentences)


Tokens for Document 1:
Words: ['Formula', 'One', ',', 'commonly', 'known', 'as', 'Formula', '1', 'or', 'F1', ',', 'is', 'the', 'highest', 'class', 'of', 'international', 'racing', 'for', 'open-wheel', 'single-seater', 'formula', 'racing', 'cars', 'sanctioned', 'by', 'the', 'Fédération', 'Internationale', 'de', "l'Automobile", '(', 'FIA', ')', '.', 'The', 'FIA', 'Formula', 'One', 'World', 'Championship', 'has', 'been', 'one', 'of', 'the', 'premier', 'forms', 'of', 'racing', 'around', 'the', 'world', 'since', 'its', 'inaugural', 'running', 'in', '1950', '.', 'The', 'word', 'formula', 'in', 'the', 'name', 'refers', 'to', 'the', 'set', 'of', 'rules', 'to', 'which', 'all', 'participants', "'", 'cars', 'must', 'conform', '.', 'A', 'Formula', 'One', 'season', 'consists', 'of', 'a', 'series', 'of', 'races', ',', 'known', 'as', 'Grands', 'Prix', '.', 'Grands', 'Prix', 'take', 'place', 'in', 'multiple', 'countries', 'and', 'continents', 'around', 'the', 'world', 'on', 'either', 'purpose-built',

In [11]:
def unique_words_in_documents(tokenized_docs):
    """Return the distinct word tokens of each document.

    ``tokenized_docs`` holds ``(words, sentences)`` pairs; only the word
    list is used.  The result is a list of sets, one per document, in the
    same order as the input.
    """
    return [set(doc_words) for doc_words, _sentences in tokenized_docs]

# One set of distinct word tokens per document, in document order.
unique_words_per_doc = unique_words_in_documents(tokenized_docs)

In [12]:
# Show each document's distinct tokens under a 1-based header.
for i, unique_words in enumerate(unique_words_per_doc, start=1):
    print(f"\nUnique words in Document {i}:", unique_words, sep="\n")


Unique words in Document 1:
{'conform', 'formula', 'racing', 'F1', 'roads', 'teams', 'the', 'valid', 'running', 'consists', 'annual', 'forms', 'sanctioned', 'Internationale', 'premier', 'held', 'been', 'word', 'One', 'name', 'since', 'Each', 'its', 'constructors', "l'Automobile", '1950', 'place', 'continents', 'rules', 'purpose-built', 'is', 'or', 'in', 'multiple', 'open-wheel', 'must', 'used', 'and', 'known', 'on', 'highest', 'World', 'for', 'refers', 'be', 'all', 'Super', 'series', 'de', 'determine', 'driver', 'Championship', 'either', 'grade', 'international', 'closed', "'", '1', 'licence', 'by', ':', 'take', 'Grands', 'a', 'which', 'issued', 'public', 'two', 'has', 'single-seater', 'A', 'Prix', 'one', 'hold', 'tracks', 'circuits', 'at', 'The', 'FIA', 'around', 'class', 'races', 'commonly', 'Formula', 'grade-rating', ')', 'inaugural', 'cars', ',', 'drivers', 'set', 'point-system', 'participants', 'Championships', '.', '(', 'as', 'season', 'Fédération', 'to', 'countries', 'of', 'wor

In [15]:
def combine_unique_words(unique_words_per_doc):
    """Merge the per-document word collections into a single set.

    Returns a new set holding every word that appears in at least one
    entry of ``unique_words_per_doc``; an empty input gives an empty set.
    """
    return set().union(*unique_words_per_doc)

# Union of the per-document token sets: the vocabulary of the raw corpus.
combined_unique_words = combine_unique_words(unique_words_per_doc)

In [16]:
# Header and vocabulary are written on separate lines, as two prints would.
print("\nCombined unique words from all documents:", combined_unique_words, sep="\n")


Combined unique words from all documents:
{'1990s', 'conform', 'Mercedes-Benz', 'formula', 'Max', 'force', 'before', 'often', 'magazines', 'Oxford', 'start', '31', 'pit', 'them', 'racing', 'teams', 'valid', '2000', 'years', 'if', 'receives', 'action', 'it', 'part', 'latter', 'ability', 'both', 'Williams-Ford', 'until', 'forms', 'nine', 'order', 'sanctioned', 'traction', 'Lewis', 'car', 'wall', 'envisioned', 'premier', 'track', 'word', 'power', 'One', 'many', 'circuit', 'followed', 'former', 'sole', 'teammate', 'name', 'crashing', 'amateurs', 'grand', 'Fawkes', 'tyres', 'Day', 'again', 'qualifying', 'since', 'Each', 'its', "l'Automobile", '1950', 'designer', '97', 'place', 'he', 'shake-up', 'later', 'are', 'allows', 'complete', 'or', 'in', 'Alain', 'return', 'similar', 'Dictionary', '2.4-litre', 'between', 'multiple', 'open-wheel', 'that', 'York', 'must', 'used', 'winning', 'freely', 'out', '1.6-litre', 'claiming', 'may', 'collided', 'any', 'for', 'record', 'engine', '2001', 'combined'

In [22]:
import string

def preprocess_text(text):
    """Delete punctuation characters from ``text`` and return the result.

    Every character in ``string.punctuation`` and the en dash are removed
    in a single pass.  ``string.punctuation`` already contains ``[`` and
    ``]``, so no separate bracket handling is needed.

    Characters are deleted outright, not replaced by a space, so
    hyphenated or apostrophised words are fused: ``open-wheel`` becomes
    ``openwheel`` and ``l'Automobile`` becomes ``lAutomobile``.
    """
    # One translation table covers ASCII punctuation plus the en dash,
    # instead of a translate pass followed by three replace passes.
    removal_table = str.maketrans('', '', string.punctuation + '–')
    return text.translate(removal_table)

# Strip punctuation from the raw text first, then word-tokenize the cleaned
# text; both lists keep the document order of ``contents``.
preprocessed_contents = list(map(preprocess_text, contents))
preprocessed_tokenized_docs = list(map(word_tokenize, preprocessed_contents))

def get_unique_words(tokenized_docs):
    """Collect the distinct tokens found across all documents.

    ``tokenized_docs`` is an iterable of token sequences, one per
    document.  Returns a single set with every token that occurs in any
    of them.
    """
    return {token for doc_tokens in tokenized_docs for token in doc_tokens}

# Vocabulary of the punctuation-stripped corpus.
unique_words_after_preprocessing = get_unique_words(preprocessed_tokenized_docs)

print("Unique words after preprocessing:", unique_words_after_preprocessing, sep="\n")

Unique words after preprocessing:
{'1990s', 'conform', 'before', 'Max', 'often', 'Oxford', 'teams', '2000', 'if', 'it', 'latter', 'both', 'traction', 'envisioned', 'car', 'wall', 'premier', 'track', 'youngestever', 'power', 'many', 'circuit', 'former', 'name', 'grand', 'Fawkes', 'tyres', 'qualifying', 'since', '1950', 'designer', 'later', 'are', 'complete', 'in', 'Alain', 'return', 'between', 'multiple', 'that', 'York', 'must', 'used', 'winning', 'freely', 'out', 'may', 'collided', 'WilliamsHonda', 'any', 'openwheel', 'record', 'engine', '2001', 'from', 'months', 'lost', 'also', 'seasons', 'Ford', 'no', 'de', 'Grand', 'closely', 'driver', 'Championship', 'team', 'an', 'about', '2013', 'democratically', 'internet', 'aspirated', 'shakeup', 'lAutomobile', 'grip', 'starting', 'two', 'mistake', 'graderating', 'necessary', 'peruse', '1857', 'they', 'lap', 'share', 'purposebuilt', 'It', 'championship', 'taking', '24litre', 'Formula', 'weekend', 'rivalry', 'aquaplaning', 'their', 'probably', '

In [31]:
# sorted() accepts any iterable and always returns a new list, so the set
# can be passed directly to obtain the vocabulary in a stable order.
unique_words_list = sorted(unique_words_after_preprocessing)

In [43]:
import pandas as pd

In [56]:
# One row per vocabulary word; the DocN columns form a binary
# term-document incidence matrix (1 = the word occurs in document N).
df = pd.DataFrame(unique_words_list, columns=["Word"])

# Derive the number of columns from the documents actually present rather
# than a hard-coded 6: read_word_files skips unreadable files, so a fixed
# range would raise IndexError whenever fewer documents were loaded.
for doc in range(1, len(preprocessed_tokenized_docs) + 1):
    # Build the document's token set once; membership is then O(1) per
    # vocabulary word instead of a scan of the full token list.
    doc_vocabulary = set(preprocessed_tokenized_docs[doc - 1])
    # isin yields booleans; cast to int64 to keep the 0/1 integer columns.
    df[f"Doc{doc}"] = df["Word"].isin(doc_vocabulary).astype("int64")

In [57]:
# Show the sorted vocabulary that provides the rows of the incidence table.
print(unique_words_list)

['1', '16', '16litre', '18', '1857', '19', '1950', '1980s', '1983', '1988', '1990s', '1993', '1994', '20', '2000', '2001', '2013', '2014', '2015', '2016', '24litre', '5', '70', 'A', 'According', 'Alain', 'All', 'Australian', 'Ayrton', 'Bianchi', 'Brabham', 'Canadian', 'Championship', 'Championships', 'Chenevix', 'Day', 'Dictionary', 'Dr', 'Drivers', 'Each', 'English', 'F1', 'F1s', 'FIA', 'Fawkes', 'Ferrari', 'Ford', 'Formula', 'Fédération', 'Grand', 'Grands', 'Guy', 'Haas', 'Hamilton', 'Honda', 'However', 'If', 'Internationale', 'It', 'Italian', 'Japanese', 'Jules', 'Lewis', 'Library', 'Licence', 'London', 'Marino', 'Max', 'McLaren', 'McLarenMercedes', 'Mercedes', 'MercedesBenz', 'Nelson', 'New', 'Nico', 'No', 'November', 'On', 'One', 'Oxford', 'Piquet', 'Porsche', 'Powered', 'Prix', 'Prix31', 'Prost', 'Ratzenberger', 'Renault', 'Richard', 'Roland', 'Rosberg', 'San', 'Saturday', 'Senna', 'Simon', 'Since', 'Spain', 'Specifically', 'Super', 'Tamburello', 'The', 'This', 'To', 'Trench', 'V

In [58]:
# Render the term-document incidence table as the cell's output.
df

Unnamed: 0,Word,Doc1,Doc2,Doc3,Doc4,Doc5,Doc6
0,1,1,0,0,0,0,0
1,16,0,0,1,0,0,0
2,16litre,0,0,1,0,0,0
3,18,0,0,1,0,0,0
4,1857,0,0,0,0,0,1
...,...,...,...,...,...,...,...
437,world,1,0,0,0,0,0
438,would,0,0,0,0,0,1
439,year,0,0,1,0,0,0
440,years,0,1,0,0,0,1


In [64]:
def binary_search_query(tokenized_docs, query_word):
    """Return the 1-based numbers of documents whose words include ``query_word``.

    ``tokenized_docs`` holds ``(words, sentences)`` pairs; only ``words``
    is checked, using an exact, case-sensitive match against each token.
    Despite the name this is a linear membership scan (a boolean
    retrieval query), not a binary search.

    NOTE(review): later cells redefine this name with different
    signatures, so this version is only in effect until they run.
    """
    return [
        doc_number
        for doc_number, (doc_words, _sentences) in enumerate(tokenized_docs, start=1)
        if query_word in doc_words
    ]

# Single-term query: list the documents whose word tokens contain the term
# (exact, case-sensitive token match).
query_word = "race"
result = binary_search_query(tokenized_docs, query_word)
print(f"Documents containing the word '{query_word}': {result}")

Documents containing the word 'race': [3, 4]


In [67]:
def binary_search_query(tokenized_docs, query_words):
    """Return the 1-based numbers of documents containing every query word.

    Args:
        tokenized_docs: iterable of ``(words, sentences)`` pairs; only
            ``words`` is consulted.
        query_words: the terms that must all be present.  A bare string is
            treated as one term instead of being iterated character by
            character, so calls written for the earlier single-word
            version of this function still give a meaningful answer.

    Returns:
        Matching document numbers in ascending order.  An empty
        ``query_words`` matches every document.

    Matching is an exact, case-sensitive comparison against the tokens.
    Despite the name this is a boolean AND query, not a binary search.
    """
    if isinstance(query_words, str):
        query_words = [query_words]
    # Materialise once: a one-shot iterator would otherwise be exhausted
    # after the first document and match everything afterwards.
    required = set(query_words)
    matching_docs = []
    for i, (words, _) in enumerate(tokenized_docs, start=1):
        # A subset test on sets replaces one list scan per query word.
        if required <= set(words):
            matching_docs.append(i)
    return matching_docs

# AND query: a document matches only if it contains every listed term.
query_words = ["forms", "formula"]
result = binary_search_query(tokenized_docs, query_words)
print(f"Documents containing all the words '{', '.join(query_words)}': {result}")

Documents containing all the words 'forms, formula': [1]


In [68]:
def binary_search_query(tokenized_docs, query_words, not_word):
    """Return documents containing every query word and lacking ``not_word``.

    Args:
        tokenized_docs: iterable of ``(words, sentences)`` pairs; only
            ``words`` is consulted.
        query_words: the terms that must all be present.  A bare string is
            treated as one term instead of being iterated character by
            character.
        not_word: a single term that must not appear among the words.

    Returns:
        1-based document numbers in ascending order.  An empty
        ``query_words`` places no positive requirement on a document.

    Matching is an exact, case-sensitive comparison against the tokens.
    Despite the name this is a boolean AND/NOT query, not a binary search.
    """
    if isinstance(query_words, str):
        query_words = [query_words]
    # Materialise once so a one-shot iterator is not exhausted by the
    # first document.
    required = set(query_words)
    matching_docs = []
    for i, (words, _) in enumerate(tokenized_docs, start=1):
        # The exclusion stays a plain membership test on the token list,
        # exactly as before; only the positive terms use a set.
        if required <= set(words) and not_word not in words:
            matching_docs.append(i)
    return matching_docs

# AND/NOT query: every term in query_words must be present and the single
# not_word must be absent.
query_words = ["forms", "formula"]
not_word = "grand"
result = binary_search_query(tokenized_docs, query_words, not_word)
print(f"Documents containing all the words '{', '.join(query_words)}' but not '{not_word}': {result}")

Documents containing all the words 'forms, formula' but not 'grand': [1]


In [70]:
def binary_search_query(tokenized_docs, query_words, not_words=()):
    """Return documents containing all ``query_words`` and none of ``not_words``.

    Args:
        tokenized_docs: iterable of ``(words, sentences)`` pairs; only
            ``words`` is consulted.
        query_words: the terms that must all be present.
        not_words: the terms that must all be absent.  Defaults to no
            exclusions, so two-argument calls written for the earlier
            AND-only version keep working.

        For both term arguments a bare string is treated as a single term
        instead of being iterated character by character; without this, a
        call in the style of the earlier single ``not_word`` version would
        silently exclude on individual letters.

    Returns:
        1-based document numbers in ascending order.  An empty
        ``query_words`` places no positive requirement on a document.

    Matching is an exact, case-sensitive comparison against the tokens.
    Despite the name this is a boolean AND/NOT query, not a binary search.
    """
    if isinstance(query_words, str):
        query_words = [query_words]
    if isinstance(not_words, str):
        not_words = [not_words]
    # Materialise both term collections once so one-shot iterators are not
    # exhausted by the first document.
    required = set(query_words)
    excluded = set(not_words)
    matching_docs = []
    for i, (words, _) in enumerate(tokenized_docs, start=1):
        # One set per document replaces a list scan per query/exclusion term.
        vocabulary = set(words)
        if required <= vocabulary and excluded.isdisjoint(vocabulary):
            matching_docs.append(i)
    return matching_docs

# AND/NOT query with several exclusions: every term in query_words must be
# present and none of the terms in not_words may appear.
query_words = ["forms", "formula"]
not_words = ["track", "hundreds"]
result = binary_search_query(tokenized_docs, query_words, not_words)
print(f"Documents containing all the words '{', '.join(query_words)}' but not any of '{', '.join(not_words)}': {result}")

Documents containing all the words 'forms, formula' but not any of 'track, hundreds': [1]
