In [1]:
from docx import Document
import re
from tabulate import tabulate

from docx import Document: Imports the Document class from the python-docx library to read Word documents.

import re: Imports the re module for regular expression operations.

from tabulate import tabulate: Imports the tabulate function from the tabulate library to create nicely formatted tables.

In [2]:
def read_docx(file_path):
    """Return the text of a Word document as a single string.

    Parameters
    ----------
    file_path : str
        Path of the .docx file to open.

    Returns
    -------
    str
        The text of each paragraph in ``doc.paragraphs``, in order, joined
        with newline characters. An empty string if there are no paragraphs.
    """
    # Open the document the caller asked for instead of a fixed, hard-coded
    # location, so the function works for any file.
    doc = Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)

read_docx(file_path): This function reads the content of a .docx file specified by file_path.
It iterates through each paragraph in the document and appends the text to a list.
It then joins all the paragraphs into a single string with newline characters.

In [3]:
def classify_word(word):
    """Guess a coarse part-of-speech label for a single word.

    The word is compared case-insensitively against fixed lists of common
    function words; if none match, a few suffix rules are tried, and anything
    left over is labelled a noun.

    Parameters
    ----------
    word : str
        The word to classify. Case is ignored.

    Returns
    -------
    str
        One of 'article', 'preposition', 'conjunction', 'pronoun',
        'determiner', 'modal', 'auxiliary verb', 'adverb', 'verb',
        'adjective' or 'noun'.
    """
    # Normalise case so 'The', 'THE' and 'the' are all recognised.
    token = word.lower()

    # Checked in order; the first list containing the token wins. 'her' is in
    # both the pronoun and determiner lists, so it is reported as a pronoun.
    function_words = (
        ('article', {'the', 'a', 'an'}),
        ('preposition', {'in', 'on', 'at', 'by', 'with', 'about', 'against', 'among', 'between', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'under', 'over', 'again', 'further', 'then', 'once'}),
        ('conjunction', {'and', 'but', 'or', 'nor', 'so', 'for', 'yet'}),
        # 'i' is stored lowercase because the token has been lowercased.
        ('pronoun', {'he', 'she', 'it', 'they', 'we', 'you', 'i', 'me', 'him', 'her', 'us', 'them'}),
        ('determiner', {'this', 'that', 'these', 'those', 'my', 'your', 'his', 'her', 'its', 'our', 'their'}),
        ('modal', {'can', 'could', 'may', 'might', 'will', 'would', 'shall', 'should', 'must'}),
        ('auxiliary verb', {'am', 'is', 'are', 'was', 'were', 'being', 'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did'}),
    )
    for label, members in function_words:
        if token in members:
            return label

    # Suffix heuristics, also tried in order; these are rough guesses only.
    suffix_rules = (
        ('adverb', ('ly',)),
        ('verb', ('ing', 'ed')),
        ('adjective', ('ous', 'ful', 'ic')),
    )
    for label, suffixes in suffix_rules:
        if token.endswith(suffixes):
            return label

    return 'noun'  # Default assumption

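classify_word(word): This function classifies a word using simple heuristics. It first looks the word up in fixed lists of function words (articles, prepositions, conjunctions, pronouns, determiners, modals, and auxiliary verbs), then checks common suffixes to guess adverbs, verbs, and adjectives, and finally falls back to noun.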

In [4]:
def count_vowels_consonants(word):
    """Count the vowels and consonants in a word.

    Vowels are the letters a, e, i, o, u in either case. Every other
    alphabetic character counts as a consonant. Characters that are not
    letters (digits, underscores, punctuation) are counted as neither.

    Parameters
    ----------
    word : str
        The word to inspect.

    Returns
    -------
    tuple of (int, int)
        ``(num_vowels, num_consonants)``.
    """
    vowels = 'aeiou'
    num_vowels = 0
    num_consonants = 0
    # Lowercase first so 'A' and 'a' are both recognised as vowels.
    for char in word.lower():
        if char in vowels:
            num_vowels += 1
        elif char.isalpha():
            # Only letters may be consonants; tokens matched by \w can also
            # contain digits and underscores, which must not be counted.
            num_consonants += 1
    return num_vowels, num_consonants

count_vowels_consonants(word): This function counts the number of vowels and consonants in a word and returns these counts.

In [5]:
def process_text_to_dict(text):
    """Build per-word statistics for a piece of text.

    The text is lowercased and split into tokens with the pattern
    ``\\b\\w+\\b`` (runs of letters, digits and underscores), so an
    apostrophe splits a word: "don't" yields "don" and "t".

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    dict
        Maps each distinct lowercase token to a dict with the keys
        'frequency' (number of occurrences), 'length' (``len`` of the token),
        'type' (result of ``classify_word``), and 'vowels' / 'consonants'
        (result of ``count_vowels_consonants``). Keys appear in order of
        first occurrence.
    """
    word_dict = {}

    for word in re.findall(r'\b\w+\b', text.lower()):
        entry = word_dict.get(word)
        if entry is not None:
            # Seen before: only the count changes; the other attributes
            # depend solely on the token and were stored the first time.
            entry['frequency'] += 1
            continue

        # First occurrence: compute the per-token attributes once.
        word_type = classify_word(word)
        num_vowels, num_consonants = count_vowels_consonants(word)
        word_dict[word] = {
            'frequency': 1,
            'length': len(word),
            'type': word_type,
            'vowels': num_vowels,
            'consonants': num_consonants
        }
    return word_dict

process_text_to_dict(text): This function processes the input text and creates a dictionary where each key is a word, and the value is another dictionary containing its frequency, length, type, number of vowels, and number of consonants.

words = re.findall(r'\b\w+\b', text.lower()): This line converts the text to lowercase and then extracts every word, where a word is a run of letters, digits, or underscores.

The loop then populates the word_dict with information about each word.

In [6]:
def get_word_attributes(word_dict):
    """Flatten the word-statistics dictionary into rows for a table.

    Parameters
    ----------
    word_dict : dict
        Maps each word to a dict holding the keys 'frequency', 'length',
        'type', 'vowels' and 'consonants'.

    Returns
    -------
    list of list
        One row per word, in the dictionary's iteration order:
        ``[word, frequency, length, type, vowels, consonants]``.
    """
    # Attribute keys in the order their values appear after the word itself.
    columns = ('frequency', 'length', 'type', 'vowels', 'consonants')
    return [
        [word, *(attrs[key] for key in columns)]
        for word, attrs in word_dict.items()
    ]

get_word_attributes(word_dict): This function converts the dictionary into a list of lists suitable for creating a table. Each sublist contains the word and its attributes.

In [7]:
# Location of the Word document to analyse.
# NOTE(review): this is an absolute, machine-specific path; change it before
# running the notebook on another computer.
file_path = "C:/Users/Y Archana/Downloads/New Microsoft Word Document.docx"
# Pipeline: document text -> per-word statistics dict -> list of table rows.
doc_string = read_docx(file_path)
word_dict = process_text_to_dict(doc_string)
word_attributes_table = get_word_attributes(word_dict)

# Column titles, in the same order as the values in each row produced by
# get_word_attributes.
headers = ["Word", "Frequency", "Length", "Type", "Vowels", "Consonants"]
# Format the rows as a grid-style text table and print it.
print(tabulate(word_attributes_table, headers, tablefmt="grid"))

+----------------+-------------+----------+----------------+----------+--------------+
| Word           |   Frequency |   Length | Type           |   Vowels |   Consonants |
| thereby        |           1 |        7 | noun           |        2 |            5 |
+----------------+-------------+----------+----------------+----------+--------------+
| mini           |           1 |        4 | noun           |        2 |            2 |
+----------------+-------------+----------+----------------+----------+--------------+
| sizing         |           1 |        6 | verb           |        2 |            4 |
+----------------+-------------+----------+----------------+----------+--------------+
| subjectivity   |           1 |       12 | noun           |        4 |            8 |
+----------------+-------------+----------+----------------+----------+--------------+
| in             |          18 |        2 | preposition    |        1 |            1 |
+----------------+-------------+----------+

Reading the document: doc_string contains the text read from the specified Word document.

Processing the text: word_dict is a dictionary with each word's details.

Creating the table: word_attributes_table is a list of lists with each word and its attributes.

Printing the table: The tabulate function formats the word attributes as a grid-style table with the specified headers, and print displays the result.