## Importing libraries

In [60]:
import numpy as np
import re
import math

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

import openai
from docx import Document

from tabulate import tabulate

In [2]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Data

In [3]:
# Set up your OpenAI API key

def generate_document_from_phrase(phrase, output_file):
    document = Document()

    prompt = f"Generate a document about: {phrase}. \n"

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-1106",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": ""},
        ],
        temperature=0.7,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    generated_text = response.choices[0].message["content"]

    document.add_heading(phrase, level=1)
    document.add_paragraph(generated_text)

    document.save(output_file)
    print(f"Document saved to {output_file}")
    return output_file

In [4]:
# Example usage
phrases = ["The importance of renewable energy",
           "The future of artificial intelligence",
           "The impact of climate change on biodiversity"]

generated_documents = []
for index, phrase in enumerate(phrases):
    output_file = f"generated_document_{index + 1}.docx"
    generate_document_from_phrase(phrase, output_file)
    generated_documents.append(output_file)

Document saved to generated_document_1.docx
Document saved to generated_document_2.docx
Document saved to generated_document_3.docx


In [27]:
generated_documents

['generated_document_1.docx',
 'generated_document_2.docx',
 'generated_document_3.docx']

# Processing on Data

In [29]:
# Define the function to read a document and extract text
def read_document(file_path):
    doc = Document(file_path)
    # Extract text from the document
    doc_text = '\n'.join([para.text for para in doc.paragraphs])
    print("Document Text:", doc_text)
    return doc_text

In [43]:
def cleaned_and_normalized_text(text_content):
    # Step 0: Preprocessing steps
    text_content = re.sub(r'\[[^\]]*\]', '', text_content)  # Remove square brackets and anything inside them
    text_content = re.sub(r'\b[IVXLCDM]+\b', '', text_content)  # Remove Roman numerals
    text_content = re.sub(r'\d+', '', text_content)  # Remove digits

    # Step 1: Clean the text in each paragraph
    cleaned_text = ''.join(e for e in text_content if e.isalnum() or e.isspace())

    # Step 2: Normalize the cleaned text
    normalized_text = cleaned_text.lower()
    
    return normalized_text

In [38]:
def preprocess_text(normalized_text):  
    # Step 3: Tokenization
    tokens = word_tokenize(normalized_text)
    print("Step 3: Tokens:", tokens)
    
    # Step 4: Lemmatization with specified part of speech
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for word in tokens:
        # Get the part of speech for each word
        pos_tag = nltk.pos_tag([word])[0][1][0].upper()
        # Map the part of speech to WordNet constant
        pos_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
        wordnet_pos = pos_map.get(pos_tag, wordnet.NOUN)  # Default to noun if not recognized
        # Lemmatize the word with the specified part of speech
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        lemmatized_tokens.append(lemma)
    print("Step 4: Lemmatized Tokens:", lemmatized_tokens)
    return lemmatized_tokens


In [39]:
def stop_word_removel_and_get_unique_words(lemmatized_tokens):
    # Step 5: Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]
    print("Step 5: Filtered Tokens (Stopwords Removed):", filtered_tokens)
    
    # Step 6: Unique words
    unique_words = set(filtered_tokens)
    print("Step 6: Unique Words:", unique_words)
    return unique_words

In [40]:
def get_final_meaningful_words(unique_words):
    all_meaningful_words = []
    # Step 7: Remove non-meaningful words
    meaningful_words = []
    for word in unique_words:
        # add (and len(word) > 3) if you want to remove any word less than or equal to 3
        if wordnet.synsets(word):
            meaningful_words.append(word)
    print("Step 7: Meaningful Words:", meaningful_words)
    
    # Print short words (length <= 3)
    short_words = [word for word in meaningful_words if len(word) <= 3]
    if short_words:
        print("Short words (length <= 3):", ', '.join(short_words))
    else:
        print("No short words found.")
    
    # Appending meaningful words to respective lists
    all_meaningful_words.append(" ".join(meaningful_words))
    return all_meaningful_words

# All Processing on document 1 

In [41]:
text_content_doc_1 = read_document(generated_documents[0])

Document Text: The importance of renewable energy
Title: The Importance of Renewable Energy

Introduction
Renewable energy has become an increasingly important topic as the world confronts the challenges of climate change, environmental degradation, and the limited availability of fossil fuels. The shift towards renewable energy sources is crucial for a sustainable and secure energy future. This document will delve into the significance of renewable energy and its impact on the environment, economy, and energy security.

Environmental Impact
One of the most compelling reasons for the importance of renewable energy is its positive impact on the environment. Unlike fossil fuels, renewable energy sources such as solar, wind, hydro, and biomass do not produce harmful greenhouse gas emissions or air pollutants. By harnessing these clean energy sources, we can significantly reduce our carbon footprint, mitigate climate change, and improve air quality. The use of renewable energy also helps i

In [44]:
cleaned_text_doc_1 = cleaned_and_normalized_text(text_content_doc_1)  # Step 0 Preprocessing, Step 1 cleaning and step 2  normalizing 
lemmatized_tokens_doc_1 = preprocess_text(cleaned_text_doc_1)  # Passing the cleaned and normalized text to the function (Step 3 and 4)
unique_words_doc_1  = stop_word_removel_and_get_unique_words(lemmatized_tokens_doc_1)  # Step 5 and 6
all_meaningful_words_doc_1  = get_final_meaningful_words(unique_words_doc_1)  # step 7 

Step 3: Tokens: ['the', 'importance', 'of', 'renewable', 'energy', 'title', 'the', 'importance', 'of', 'renewable', 'energy', 'introduction', 'renewable', 'energy', 'has', 'become', 'an', 'increasingly', 'important', 'topic', 'as', 'the', 'world', 'confronts', 'the', 'challenges', 'of', 'climate', 'change', 'environmental', 'degradation', 'and', 'the', 'limited', 'availability', 'of', 'fossil', 'fuels', 'the', 'shift', 'towards', 'renewable', 'energy', 'sources', 'is', 'crucial', 'for', 'a', 'sustainable', 'and', 'secure', 'energy', 'future', 'this', 'document', 'will', 'delve', 'into', 'the', 'significance', 'of', 'renewable', 'energy', 'and', 'its', 'impact', 'on', 'the', 'environment', 'economy', 'and', 'energy', 'security', 'environmental', 'impact', 'one', 'of', 'the', 'most', 'compelling', 'reasons', 'for', 'the', 'importance', 'of', 'renewable', 'energy', 'is', 'its', 'positive', 'impact', 'on', 'the', 'environment', 'unlike', 'fossil', 'fuels', 'renewable', 'energy', 'sources',

# All Processing on document 2 

In [45]:
text_content_doc_2 = read_document(generated_documents[1])

Document Text: The future of artificial intelligence
Title: The Future of Artificial Intelligence

Introduction
Artificial intelligence (AI) has made significant advancements in recent years, and its impact on various industries and aspects of our daily lives is profound. As we look to the future, the potential for AI to continue transforming the way we work, communicate, and innovate is immense. This document explores the exciting possibilities and potential challenges of the future of artificial intelligence.

AI in Healthcare
One of the most promising areas for the future of AI is in healthcare. AI-powered technologies have the potential to revolutionize medical diagnosis, treatment, and patient care. From predictive analytics to personalized medicine, AI is expected to play a crucial role in improving patient outcomes and reducing healthcare costs.

AI in Transportation
The future of transportation is also expected to be heavily influenced by AI. Autonomous vehicles, powered by AI 

In [46]:
cleaned_text_doc_2 = cleaned_and_normalized_text(text_content_doc_2)  # Step 0 Preprocessing, Step 1 cleaning and step 2  normalizing 
lemmatized_tokens_doc_2 = preprocess_text(cleaned_text_doc_2)  # Passing the cleaned and normalized text to the function (Step 3 and 4)
unique_words_doc_2  = stop_word_removel_and_get_unique_words(lemmatized_tokens_doc_2) # Step 5 and 6
all_meaningful_words_doc_2  = get_final_meaningful_words(unique_words_doc_2)  # step 7 

Step 3: Tokens: ['the', 'future', 'of', 'artificial', 'intelligence', 'title', 'the', 'future', 'of', 'artificial', 'intelligence', 'introduction', 'artificial', 'intelligence', 'ai', 'has', 'made', 'significant', 'advancements', 'in', 'recent', 'years', 'and', 'its', 'impact', 'on', 'various', 'industries', 'and', 'aspects', 'of', 'our', 'daily', 'lives', 'is', 'profound', 'as', 'we', 'look', 'to', 'the', 'future', 'the', 'potential', 'for', 'ai', 'to', 'continue', 'transforming', 'the', 'way', 'we', 'work', 'communicate', 'and', 'innovate', 'is', 'immense', 'this', 'document', 'explores', 'the', 'exciting', 'possibilities', 'and', 'potential', 'challenges', 'of', 'the', 'future', 'of', 'artificial', 'intelligence', 'ai', 'in', 'healthcare', 'one', 'of', 'the', 'most', 'promising', 'areas', 'for', 'the', 'future', 'of', 'ai', 'is', 'in', 'healthcare', 'aipowered', 'technologies', 'have', 'the', 'potential', 'to', 'revolutionize', 'medical', 'diagnosis', 'treatment', 'and', 'patient', 

# All Processing on document 3

In [47]:
text_content_doc_3 = read_document(generated_documents[2])

Document Text: The impact of climate change on biodiversity
The Impact of Climate Change on Biodiversity

Introduction
Climate change has become a pressing issue in recent years, with significant impacts on various aspects of the environment. One of the most affected areas is biodiversity. Biodiversity refers to the variety of life on Earth, including the different species of plants, animals, and microorganisms, as well as the ecosystems in which they live. Climate change has had a profound impact on biodiversity, leading to widespread changes in ecosystems and threatening the survival of many species. This document explores the various ways in which climate change is affecting biodiversity and the potential consequences for the planet.

Habitat Loss and Fragmentation
One of the most direct impacts of climate change on biodiversity is the loss and fragmentation of habitats. As temperatures rise and weather patterns become more erratic, many species are finding it increasingly difficult

In [48]:
cleaned_text_doc_3 = cleaned_and_normalized_text(text_content_doc_3)  # Step 0 Preprocessing, Step 1 cleaning and step 2  normalizing 
lemmatized_tokens_doc_3 = preprocess_text(cleaned_text_doc_3)  # Passing the cleaned and normalized text to the function (Step 3 and 4)
unique_words_doc_3  = stop_word_removel_and_get_unique_words(lemmatized_tokens_doc_3) # Step 5 and 6
all_meaningful_words_doc_3  = get_final_meaningful_words(unique_words_doc_3)  # step 7 

Step 3: Tokens: ['the', 'impact', 'of', 'climate', 'change', 'on', 'biodiversity', 'the', 'impact', 'of', 'climate', 'change', 'on', 'biodiversity', 'introduction', 'climate', 'change', 'has', 'become', 'a', 'pressing', 'issue', 'in', 'recent', 'years', 'with', 'significant', 'impacts', 'on', 'various', 'aspects', 'of', 'the', 'environment', 'one', 'of', 'the', 'most', 'affected', 'areas', 'is', 'biodiversity', 'biodiversity', 'refers', 'to', 'the', 'variety', 'of', 'life', 'on', 'earth', 'including', 'the', 'different', 'species', 'of', 'plants', 'animals', 'and', 'microorganisms', 'as', 'well', 'as', 'the', 'ecosystems', 'in', 'which', 'they', 'live', 'climate', 'change', 'has', 'had', 'a', 'profound', 'impact', 'on', 'biodiversity', 'leading', 'to', 'widespread', 'changes', 'in', 'ecosystems', 'and', 'threatening', 'the', 'survival', 'of', 'many', 'species', 'this', 'document', 'explores', 'the', 'various', 'ways', 'in', 'which', 'climate', 'change', 'is', 'affecting', 'biodiversity

In [49]:
all_meaningful_words_of_all_docs = []
all_meaningful_words_of_all_docs.extend(all_meaningful_words_doc_1)
all_meaningful_words_of_all_docs.extend(all_meaningful_words_doc_2)
all_meaningful_words_of_all_docs.extend(all_meaningful_words_doc_3)

In [51]:
all_meaningful_words_of_all_docs

['sustainable future reason system moreover environmental reduce benefit offer strategic creation become overstate replenish rooftop risk stimulate topic air natural unlike industry continue additionally global availability security virtually mix significance generate planet mitigate impact advancement research role growth import one investment stability deployment supply moral available pollutant substantial compelling positive minimizes embrace source biomass sustainability numerous emission achieve renewable obligation production introduction important improve imperative dependency resilient number decentralize transition degradation challenge harness fossil title opportunity reliable household cleaner pollution potential prioritize ecosystem quality provide job fuel solution country technology reduces foster gas demand furthermore panel market generation preserve becomes increasingly drive widely safeguard use deplete naturally help individual greenhouse produce option shift harmfu

# TFIDF (Built-In)

In [52]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Compute TF-IDF features
tfidf_features_builtin = tfidf_vectorizer.fit_transform(all_meaningful_words_of_all_docs)

# Print TF-IDF features
print("\nTF-IDF Features:")
feature_names_builtin = tfidf_vectorizer.get_feature_names_out()
for doc_idx, term_idx in zip(*tfidf_features_builtin.nonzero()):
    print(f"({doc_idx}, {feature_names_builtin[term_idx]})\t{tfidf_features_builtin[doc_idx, term_idx]}")

# Print all meaningful words
print("\nAll Meaningful Words:")
print(all_meaningful_words_of_all_docs)


TF-IDF Features:
(0, build)	0.08186199898248973
(0, crucial)	0.048349015326249974
(0, delve)	0.08186199898248973
(0, pave)	0.08186199898248973
(0, also)	0.048349015326249974
(0, footprint)	0.08186199898248973
(0, environment)	0.06225816025630848
(0, operation)	0.06225816025630848
(0, world)	0.06225816025630848
(0, resource)	0.06225816025630848
(0, price)	0.08186199898248973
(0, innovation)	0.08186199898248973
(0, change)	0.06225816025630848
(0, infrastructure)	0.08186199898248973
(0, enhances)	0.06225816025630848
(0, habitat)	0.06225816025630848
(0, make)	0.048349015326249974
(0, climate)	0.06225816025630848
(0, development)	0.06225816025630848
(0, storage)	0.08186199898248973
(0, fluctuation)	0.08186199898248973
(0, play)	0.048349015326249974
(0, conclusion)	0.048349015326249974
(0, far)	0.06225816025630848
(0, viable)	0.08186199898248973
(0, limited)	0.06225816025630848
(0, sensitive)	0.08186199898248973
(0, economic)	0.08186199898248973
(0, mining)	0.08186199898248973
(0, wind)	0.0

# TFIDF (From Scratch)

In [53]:
def calculate_tf(term, document):
    term_frequency = document.count(term)
    return term_frequency

In [54]:
def calculate_idf(term, documents):
    num_documents = len(documents)
    num_documents_with_term = sum(1 for doc in documents if term in doc)
    return math.log((1 + num_documents) / (1 + num_documents_with_term)) + 1

In [55]:
def calculate_tfidf(term, document, documents):
    tf = calculate_tf(term, document)
    idf = calculate_idf(term, documents)
    return tf * idf

In [56]:
def calculate_tfidf_for_corpus(documents):
    tfidf_scores = defaultdict(dict)
    for i, document in enumerate(documents):
        for term in set(document.split()):  # Consider unique words only
            tfidf_scores[i][term] = calculate_tfidf(term, document, documents)
    
    # Second normalization
    for doc_id, scores in tfidf_scores.items():
        sum_of_squares = sum(score ** 2 for score in scores.values())
        norm = np.sqrt(sum_of_squares)
        for term in scores:
            tfidf_scores[doc_id][term] /= norm
    
    return tfidf_scores

In [58]:
# Calculate TF-IDF scores with normalization
tfidf_scores_custom = calculate_tfidf_for_corpus(all_meaningful_words_of_all_docs)

# Print TF-IDF Features
print("TF-IDF Features:")
for doc_idx, scores in tfidf_scores_custom.items():
    for term, score in scores.items():
        print(f"({doc_idx}, {term})\t{score}")

TF-IDF Features:
(0, sustainable)	0.07508156344755704
(0, future)	0.04434438087226806
(0, reason)	0.07508156344755704
(0, system)	0.11420292852638649
(0, moreover)	0.07508156344755704
(0, environmental)	0.04434438087226806
(0, reduce)	0.11420292852638649
(0, benefit)	0.05710146426319324
(0, offer)	0.07508156344755704
(0, strategic)	0.07508156344755704
(0, creation)	0.07508156344755704
(0, become)	0.11420292852638649
(0, overstate)	0.07508156344755704
(0, replenish)	0.07508156344755704
(0, rooftop)	0.07508156344755704
(0, risk)	0.05710146426319324
(0, stimulate)	0.07508156344755704
(0, topic)	0.07508156344755704
(0, air)	0.07508156344755704
(0, natural)	0.11420292852638649
(0, unlike)	0.07508156344755704
(0, industry)	0.05710146426319324
(0, continue)	0.05710146426319324
(0, additionally)	0.07508156344755704
(0, global)	0.07508156344755704
(0, availability)	0.05710146426319324
(0, security)	0.07508156344755704
(0, virtually)	0.07508156344755704
(0, mix)	0.07508156344755704
(0, significa

In [61]:
# Initialize a list to store differences
differences = []

# Print the differences
print("Differences in TF-IDF Scores:")
for doc_idx, scores in tfidf_scores_custom.items():
    for term, score in scores.items():
        # Check if the term is in built-in TF-IDF scores
        term_idx = np.where(feature_names_builtin == term)[0]
        if len(term_idx) > 0:
            term_idx = term_idx[0]
            builtin_score = tfidf_features_builtin[doc_idx, term_idx]
            diff = abs(score - builtin_score)
            differences.append([doc_idx, term, diff])

# Print the table
print(tabulate(differences, headers=["Document Index", "Term", "Difference"], tablefmt="grid", colalign=("center", "center", "center")))


Differences in TF-IDF Scores:
+------------------+-----------------+--------------+
|  Document Index  |      Term       |  Difference  |
|        0         |   sustainable   |  0.00678044  |
+------------------+-----------------+--------------+
|        0         |     future      |  0.00400463  |
+------------------+-----------------+--------------+
|        0         |     reason      |  0.00678044  |
+------------------+-----------------+--------------+
|        0         |     system      |  0.0323409   |
+------------------+-----------------+--------------+
|        0         |    moreover     |  0.00678044  |
+------------------+-----------------+--------------+
|        0         |  environmental  |  0.00400463  |
+------------------+-----------------+--------------+
|        0         |     reduce      |  0.0519448   |
+------------------+-----------------+--------------+
|        0         |     benefit     |  0.0051567   |
+------------------+-----------------+--------------