In [None]:
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
#from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import spacy

def ensure_nltk_resources():
    """Download the NLTK data packages this notebook needs, if they are missing.

    Each package is looked up with ``nltk.data.find`` and downloaded with
    ``nltk.download`` only when the lookup raises ``LookupError``.
    """
    # nltk.data.find() takes a path of the form "<category>/<package>", not a
    # bare package id. Looking up the bare id never succeeds, which would make
    # every run fall through to the download branch.
    required = {
        'punkt': 'tokenizers',
        'punkt_tab': 'tokenizers',
        'averaged_perceptron_tagger': 'taggers',
        'averaged_perceptron_tagger_eng': 'taggers',
        'words': 'corpora',
        'wordnet': 'corpora',
        'stopwords': 'corpora',
        'omw-1.4': 'corpora',
        'maxent_ne_chunker': 'chunkers',
        'maxent_ne_chunker_tab': 'chunkers',
    }
    for resource, category in required.items():
        try:
            nltk.data.find(f'{category}/{resource}')
        except LookupError:
            print(f"Downloading NLTK resource: {resource}...")
            # nltk.download() is keyed by the package id, without the category.
            nltk.download(resource)
ensure_nltk_resources()

In [2]:
# Sample corpus used by every later cell: each string is one document and may
# contain several sentences.
docs = [
    "Natural language processing with TF-IDF is powerful. It extracts important keywords from text.",
    "TF-IDF helps in identifying significant words. Keyword extraction using NLP techniques is common.",
    "Apple Inc. is planning to open a new office in London. Tim Cook will attend the opening ceremony in July 2025."
]

In [3]:
# English stop words held in a set; used for membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
# Words are normalised by lemmatization rather than stemming (PorterStemmer is not used).
lemmatizer = WordNetLemmatizer()
# spaCy pipeline "en_core_web_sm", used for the named-entity cell.
nlpNER = spacy.load("en_core_web_sm")

In [5]:

def get_wordnet_pos(treebank_tag):
    """Convert a POS tag into the WordNet POS constant the lemmatizer expects.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag is treated
    as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags fall back to noun.
    return wordnet.NOUN
def preprocess_Seg_Token_POS(doc):
    """Segment, tokenize and POS-tag one document.

    Returns a list with one entry per sentence; each entry is the list of
    ``(token, tag)`` pairs produced by ``pos_tag`` for that sentence.
    """
    def _tag_sentence(sentence):
        # Lower-case and strip every "." before tokenizing, so tokens such as
        # "inc." become purely alphabetic. Note the tagger therefore only ever
        # sees lower-cased text.
        cleaned = sentence.lower().replace(".", "")
        return pos_tag(word_tokenize(cleaned))

    return [_tag_sentence(sentence) for sentence in sent_tokenize(doc)]
def preprocess_stop_words_lemmatizer(doc):
    """Filter and lemmatize the tagged sentences of one document.

    ``doc`` is a list of sentences, each a list of ``(token, tag)`` pairs.
    Tokens that are not purely alphabetic or that are stop words are dropped;
    the rest are lemmatized using the WordNet POS derived from their tag.
    Returns one flat list of lemmas in document order.
    """
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(pos))
        for tagged_sentence in doc
        for token, pos in tagged_sentence
        if token.isalpha() and token not in stop_words
    ]
def preprocess_doc(doc):
    """Run the full two-stage pipeline on one raw document.

    Tags the document with ``preprocess_Seg_Token_POS``, filters and
    lemmatizes it with ``preprocess_stop_words_lemmatizer``, and returns the
    lemmas joined by single spaces.
    """
    lemmas = preprocess_stop_words_lemmatizer(preprocess_Seg_Token_POS(doc))
    return ' '.join(lemmas)
def preprocess(doc):
    """Return the space-joined lemmas of one raw document.

    Kept as a single-call entry point. The steps (sentence segmentation,
    lower-casing and "." removal, tokenization, POS tagging, stop-word
    filtering, lemmatization) are exactly those of ``preprocess_doc``, so this
    delegates to it instead of carrying a second copy of the logic.
    """
    return preprocess_doc(doc)

In [None]:
# Preprocess documents in two explicit stages so the POS-tagged tokens stay
# available for building the POS column below.
seg_Token_POSs = [preprocess_Seg_Token_POS(doc) for doc in docs]

stop_words_lemmatizers = [preprocess_stop_words_lemmatizer(doc) for doc in seg_Token_POSs]
processed_docs = [' '.join(doc) for doc in stop_words_lemmatizers]

# TF-IDF: after the transpose, rows are terms and columns are documents.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_docs)
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(tfidf_matrix.T.toarray(), index=feature_names, columns=[f'Doc{i+1}' for i in range(len(docs))])

# Build a term -> POS tag lookup in one pass over the tagged tokens.
# The TF-IDF features are lemmas, so each tag is recorded under both the
# surface token and its lemma; matching on the surface token alone leaves
# terms such as "extract" (from "extracts") without a tag.
pos_by_word = {}
pos_by_lemma = {}
for tagged_sentences in seg_Token_POSs:
    for tagged_sentence in tagged_sentences:
        for word, tag in tagged_sentence:
            pos_by_word[word] = tag
            pos_by_lemma[lemmatizer.lemmatize(word, get_wordnet_pos(tag))] = tag
# An exact surface-form match wins over a lemma match; within each map the
# last occurrence in the corpus wins.
pos_by_term = {**pos_by_lemma, **pos_by_word}
# Terms with no tag get an explicit empty string, and the string width is
# chosen by numpy from the data, so tags are never truncated.
posArr = np.array([pos_by_term.get(term, '') for term in feature_names], dtype=str)

df['POS'] = posArr
for i in range(len(docs)):
    print(f"\nTop keywords in Document {i+1}:")
    top_keywords = df[f'Doc{i+1}'].sort_values(ascending=False).head(5)
    print(top_keywords)

# Last expression of the cell so the notebook actually renders the preview.
df.head()


Top keywords in Document 1:
extract       0.353553
language      0.353553
important     0.353553
keywords      0.353553
processing    0.353553
Name: Doc1, dtype: float64

Top keywords in Document 2:
common        0.316228
extraction    0.316228
help          0.316228
identify      0.316228
keyword       0.316228
Name: Doc2, dtype: float64

Top keywords in Document 3:
apple       0.27735
attend      0.27735
ceremony    0.27735
cook        0.27735
inc         0.27735
Name: Doc3, dtype: float64


In [7]:
# List every term with a non-zero TF-IDF weight, per document, highest first.
print("🔑 TF-IDF Keywords:")
for doc_idx in range(len(processed_docs)):
    print(f"\nSentence {doc_idx+1}:")
    # Dense weight vector of this document (one row of the sparse matrix).
    weights = tfidf_matrix[doc_idx].toarray()[0]
    # argsort is ascending; reverse it to rank by descending weight.
    ranked = np.argsort(weights)[::-1]
    for term_idx in ranked:
        if weights[term_idx] > 0:
            print(f"{feature_names[term_idx]}: {weights[term_idx]:.4f}")

🔑 TF-IDF Keywords:

Sentence 1:
text: 0.3536
processing: 0.3536
powerful: 0.3536
natural: 0.3536
important: 0.3536
keywords: 0.3536
language: 0.3536
extract: 0.3536

Sentence 2:
word: 0.3162
use: 0.3162
technique: 0.3162
significant: 0.3162
nlp: 0.3162
identify: 0.3162
keyword: 0.3162
help: 0.3162
extraction: 0.3162
common: 0.3162

Sentence 3:
tim: 0.2774
opening: 0.2774
new: 0.2774
office: 0.2774
open: 0.2774
plan: 0.2774
london: 0.2774
cook: 0.2774
apple: 0.2774
attend: 0.2774
ceremony: 0.2774
july: 0.2774
inc: 0.2774


In [8]:
print("\n🏷 Named Entities:")
# Run the spaCy pipeline on the raw documents (original casing and punctuation).
named = [nlpNER(text) for text in docs]
for parsed in named:
    # Iterating the parsed document yields tokens: print each with its POS.
    for token in parsed:
        print(f"{token.text}: {token.pos_}")
    # Then print the recognised entity spans with their labels.
    for entity in parsed.ents:
        print(f"{entity.text}: {entity.label_}")


🏷 Named Entities:
Natural: ADJ
language: NOUN
processing: NOUN
with: ADP
TF: PROPN
-: PUNCT
IDF: PROPN
is: AUX
powerful: ADJ
.: PUNCT
It: PRON
extracts: VERB
important: ADJ
keywords: NOUN
from: ADP
text: NOUN
.: PUNCT
TF: PROPN
-: PUNCT
IDF: PROPN
helps: VERB
in: ADP
identifying: VERB
significant: ADJ
words: NOUN
.: PUNCT
Keyword: PROPN
extraction: NOUN
using: VERB
NLP: PROPN
techniques: NOUN
is: AUX
common: ADJ
.: PUNCT
Keyword: PERSON
NLP: ORG
Apple: PROPN
Inc.: PROPN
is: AUX
planning: VERB
to: PART
open: VERB
a: DET
new: ADJ
office: NOUN
in: ADP
London: PROPN
.: PUNCT
Tim: PROPN
Cook: PROPN
will: AUX
attend: VERB
the: DET
opening: NOUN
ceremony: NOUN
in: ADP
July: PROPN
2025: NUM
.: PUNCT
Apple Inc.: ORG
London: GPE
Tim Cook: PERSON
July 2025: DATE


In [14]:
import tkinter as tk
from tkinter import filedialog, scrolledtext, messagebox
import PyPDF2

# Main window: create the Tk root, then set its title and initial size.
root = tk.Tk()
root.title("Multi PDF Reader")
root.geometry("800x600")

# Scrollable, word-wrapping text area that open_pdfs() writes into. It is only
# created here; it is packed into the window in a later cell.
text_display = scrolledtext.ScrolledText(root, wrap=tk.WORD)

In [15]:
def open_pdfs():
    """Ask for one or more PDF files and show their extracted text.

    Opens a file-selection dialog restricted to ``*.pdf``. If the dialog is
    cancelled, nothing changes. Otherwise ``text_display`` is cleared and, for
    each selected file, a ``--- path ---`` header followed by the text of every
    page is appended. A file that cannot be read is reported in an error
    dialog and skipped; the remaining files are still processed.
    """
    file_paths = filedialog.askopenfilenames(
        title="Select PDF files",
        filetypes=[("PDF Files", "*.pdf")]
    )

    # Dialog cancelled: keep whatever is currently displayed.
    if not file_paths:
        return

    # "1.0" (line 1, column 0) is the canonical Text index for the start.
    text_display.delete("1.0", tk.END)  # Clear existing text

    for file_path in file_paths:
        try:
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # Extract all pages while the file is still open. A page that
                # yields no text contributes an empty string instead of being
                # passed on to the widget as a non-string value.
                page_texts = [
                    (page.extract_text() or "") + "\n\n"
                    for page in reader.pages
                ]
        except Exception as e:
            messagebox.showerror("Error", f"Could not read {file_path}\n{e}")
            continue

        # Insert only after the whole file was read, so a failure midway does
        # not leave a header and partial pages in the display.
        text_display.insert(tk.END, f"\n--- {file_path} ---\n\n" + "".join(page_texts))

In [16]:


# Button to open PDFs: clicking it calls open_pdfs().
open_button = tk.Button(root, text="Open PDF Files", command=open_pdfs)
open_button.pack(pady=10)

# Scrollable text box: packed after the button so it fills the remaining space.
text_display.pack(fill=tk.BOTH, expand=True)

# Start the Tk event loop; this call returns only once the window is closed.
root.mainloop()