In [28]:
!pip install transformers sentence-transformers PyPDF2 nltk --quiet

In [27]:
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from google.colab import files

In [35]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **Load Models**

In [31]:
print("Loading models... please wait.")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Models loaded successfully!\n")

Loading models... please wait.


Device set to use cpu


✅ Models loaded successfully!



# **Upload and Extract**

In [42]:
print("Upload a document (PDF)...")
uploaded = files.upload()
pdf_path = list(uploaded.keys())[0]

def extract_text_from_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

document_text = extract_text_from_pdf(pdf_path)
print("\n✅ Document uploaded and extracted successfully!\n")

Upload a document (PDF)...


Saving HR_POLICY_XYZ.pdf to HR_POLICY_XYZ.pdf

✅ Document uploaded and extracted successfully!



# **Summarize the Document**

In [43]:
print("Generating summary... please wait.\n")
chunk = document_text[:3000] if len(document_text) > 3000 else document_text
summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)
print("📄 Summary:\n")
print(summary[0]['summary_text'])

Generating summary... please wait.

📄 Summary:

This document outlines the key policies, procedures, and benefits for all Company XYZ employees. It is your responsibility to read and understand this manual. Policies are subject tochange, and updates will be communicated by the HR department. For any questions, please contact your manager or the Human Resources team.


# **Keyword Extraction**

In [44]:
def extract_keywords(text, num_keywords=10):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum()]
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens if w not in stop_words and w not in string.punctuation]
    freq = {}
    for word in filtered:
        freq[word] = freq.get(word, 0) + 1
    sorted_words = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    return [word for word, count in sorted_words[:num_keywords]]

keywords = extract_keywords(document_text)
print("\n🔑 Keywords:", ", ".join(keywords))


🔑 Keywords: 20, page, section, employees, company, may, policy, must, leave, employee


# **Simple Chatbot Loop (Document QA)**

In [60]:
import random

user_name = input("Enter your name 😊 :")

emojis = ["🙂", "😊", "👍", "🙌", "💡", "🤓", "✨", "😄", "🎯", "📝"]
thank_emojis = ["🙏", "💖", "😊", "🌟", "😄"]

def chatbot(query):
    greetings = ["hello", "hi", "hey", "good morning", "good afternoon", "good evening"]
    greeting_responses = ["Hello", "Hi there", "Hey", "Greetings"]

    thanks = ["thanks", "thank you", "good job", "well done", "appreciate"]
    thanks_responses = [
        f"You're welcome {user_name}! {random.choice(thank_emojis)}",
        f"My pleasure, {user_name}! {random.choice(thank_emojis)}",
        f"Happy to help you {user_name}! {random.choice(thank_emojis)}"
    ]

    if any(word in query.lower() for word in greetings):
        return f"{random.choice(greeting_responses)} {user_name}! 👋 How can I help you today? {random.choice(emojis)}"

    if any(word in query.lower() for word in thanks):
        return random.choice(thanks_responses)

    if sentence_embeddings.nelement() == 0:
        return f"Sorry, I couldn't process the document properly. No content found. 😔"

    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]

    top_result = torch.topk(cos_scores, k=1)
    best_sentences = [sentences[idx].strip() for idx in top_result[1]]
    answer = " ".join(best_sentences)

    return f"{answer} {random.choice(emojis)}"

while True:
  q = input(f"\n{user_name}: ")
  if q.lower() in ["exit", "quit", "bye bot", "bye"]:
        print(f"Chatbot: Goodbye {user_name}! 👋 Have a great day! {random.choice(emojis)}")
        break
  ans = chatbot(q)
  print(f"Chatbot: {ans}")


Enter your name 😊 :Krithick 

Krithick : Hello Bot
Chatbot: Hi there Krithick ! 👋 How can I help you today? 😊

Krithick : What is the dress code?
Chatbot: The standard office dress code is business casual. 😊

Krithick : What is leave policy?
Chatbot: We also offer a parental leave policy, which includes 12
weeks of paid leave for the primary caregiver. 🎯

Krithick : How many vacation days do i get?
Chatbot: All full-time employees are
entitled to 20 paid vacation days per year, which are accrued on a monthly basis. 🤓

Krithick : When will i get paid?
Chatbot: All employees are paid bi-weekly on
Fridays. ✨

Krithick : Thanks for the Information ! 
Chatbot: Happy to help you Krithick ! 💖

Krithick : Bye
Chatbot: Goodbye Krithick ! 👋 Have a great day! 😊
