In [None]:
!pip install -q transformers sentencepiece wikipedia-api faiss-cpu sentence-transformers pymupdf

import re
import torch
import wikipediaapi
import fitz
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from google.colab import files


In [None]:
# Load the LongT5 tokenizer and seq2seq model that long_summarize_chunked() uses.
# NOTE(review): the load warning recorded in this notebook's output reports the embedding
# and lm_head weights as newly initialized and recommends training on a downstream task,
# so summaries generated from this checkpoint may not be meaningful — confirm the checkpoint.
lt5_tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
lt5_model = AutoModelForSeq2SeqLM.from_pretrained("google/long-t5-local-base")

def long_summarize_chunked(text, max_len=200, chunk_size=1000):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    The text is split on whitespace into groups of ``chunk_size`` words. Each
    group is tokenized with ``lt5_tokenizer`` and summarized with ``lt5_model``;
    non-blank summaries are joined with single spaces.

    Args:
        text: The text to summarize.
        max_len: ``max_length`` passed to ``generate`` for every chunk.
        chunk_size: Number of whitespace-separated words per chunk. Must be positive.

    Returns:
        The joined partial summaries. If no chunk produced a non-blank summary
        (including when ``text`` contains no words), the first 200 words of
        ``text`` are returned instead, followed by ``"..."`` when text was cut.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        # A negative step would silently produce no chunks and skip summarization.
        raise ValueError("chunk_size must be a positive integer")

    words = text.split()
    chunks = [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

    # Keep the minimum length feasible when a caller asks for a max_len below 50.
    min_len = min(50, max_len)

    partial_summaries = []
    for chunk in chunks:
        inputs = lt5_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=16384)
        # Pass the attention mask explicitly instead of leaving generate() to infer it.
        summary_ids = lt5_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_len,
            min_length=min_len,
        )
        summary = lt5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        if summary.strip():
            partial_summaries.append(summary)

    if partial_summaries:
        return " ".join(partial_summaries)

    # Nothing usable was generated: fall back to a plain truncation of the input.
    return " ".join(words[:200]) + ("..." if len(words) > 200 else "")


Some weights of LongT5ForConditionalGeneration were not initialized from the model checkpoint at google/long-t5-local-base and are newly initialized: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def extract_pdf_text(pdf_path):
    """Return the plain text of a PDF with blank lines and edge whitespace removed.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        All non-blank lines from every page, in page order, each stripped of
        leading/trailing whitespace and joined with ``"\\n"``.
    """
    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    stripped_lines = (
        line.strip()
        for page_text in page_texts
        for line in page_text.splitlines()
    )
    return "\n".join(line for line in stripped_lines if line)


In [None]:
def clean_cv_text_keep_links(text):
    """Remove lines that contain an e-mail-like token or a run of 7+ digits.

    Every other line is kept unchanged and in its original order; the kept
    lines are joined with ``"\\n"``.
    """
    def _should_drop(candidate):
        # Either pattern anywhere on the line is enough to discard the whole line.
        has_email = re.search(r'\S+@\S+\.\S+', candidate) is not None
        has_long_number = re.search(r'\b\d{7,}\b', candidate) is not None
        return has_email or has_long_number

    return "\n".join(
        candidate for candidate in text.splitlines() if not _should_drop(candidate)
    )


In [None]:
def extract_links(text):
    """Return the distinct http(s) URLs found in ``text``, in first-seen order.

    A URL is ``http://`` or ``https://`` followed by every character up to the
    next whitespace. Duplicates are removed while keeping the order in which
    each URL first appears, so the result is the same on every run.
    """
    urls = re.findall(r'(https?://[^\s]+)', text)
    # dict keys keep insertion order, giving stable de-duplication (a set does not).
    return list(dict.fromkeys(urls))


In [None]:
# Shared English-language Wikipedia client; executor_agent() uses it for the "research" task.
# NOTE(review): the contact address in user_agent looks like a placeholder — confirm before real use.
wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='AgenticAI-Demo/1.0 (Contact: example@email.com)'
)


In [None]:
# Vector width of the FAISS index.
# NOTE(review): presumably the hidden size of the embedding model loaded below — confirm;
# vectors returned by embed() must have this width to be added to the index.
dimension = 384
# In-memory L2 index used as the notebook's "memory" store.
index = faiss.IndexFlatL2(dimension)
# Texts stored in memory; position i here is looked up by the index row number returned
# from index.search(), so this list and the index must be appended to together.
memory_texts = []

# Tokenizer and encoder used by embed() to turn text into vectors.
embed_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
embed_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def embed(text):
    """Encode ``text`` into one vector per input using masked mean pooling.

    Args:
        text: A string, or a list of strings, passed to ``embed_tokenizer``.

    Returns:
        A NumPy array with one row per input text, obtained by averaging the
        encoder's last hidden states over the real (non-padding) tokens.
    """
    inputs = embed_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        token_states = embed_model(**inputs).last_hidden_state

    # Weight each token by the attention mask so padding added for batching does not
    # dilute the average; for a single text the mask is all ones and this is a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()


In [None]:
def planner_agent(command):
    """Turn a free-text command into an ordered list of task names.

    Each keyword is looked up as a case-insensitive substring of ``command``;
    tasks come back in the fixed order of the table below, not in the order
    the keywords appear in the command.
    """
    lowered = command.lower()
    keyword_to_task = (
        ("pdf", "pdf_extract"),
        ("summarize", "summarize"),
        ("research", "research"),
        ("memory", "recall_memory"),
        ("bio", "generate_bio"),
    )
    return [task for keyword, task in keyword_to_task if keyword in lowered]


In [None]:
def reflection_agent(text):
    """Adjust ``text`` based on its whitespace-separated word count.

    Fewer than 20 words: the text is returned with " (Expanded for clarity.)"
    appended. More than 100 words: the first 100 words joined by single spaces
    are returned, followed by " ...". Otherwise the text is returned unchanged.
    """
    tokens = text.split()
    word_count = len(tokens)

    if word_count > 100:
        return " ".join(tokens[:100]) + " ..."
    if word_count < 20:
        return text + " (Expanded for clarity.)"
    return text


In [None]:
def cv_summary_generator(cv_text, links=None):
    """Build a bullet-style professional summary from sectioned CV text.

    A line whose stripped, lower-cased text exactly equals a known header
    switches the current section; every following non-blank line is collected
    (stripped) into that section until the next header. Lines seen before the
    first header are ignored.

    Args:
        cv_text: CV text, one entry per line.
        links: Optional iterable of links appended under "Contact & Links:".

    Returns:
        The summary text with surrounding whitespace stripped.
    """
    header_aliases = {
        "skills": ["skills", "technical skills", "skills and abilities"],
        "education": ["education", "bachelor", "master", "degree"],
        "projects": ["projects", "research projects", "work done"],
        "experience": ["experience", "internship", "work experience", "professional experience"]
    }
    # Flatten to alias -> section; setdefault keeps the first section listed for an alias.
    section_for_header = {}
    for section_name, aliases in header_aliases.items():
        for alias in aliases:
            section_for_header.setdefault(alias, section_name)

    collected = {name: [] for name in ("skills", "education", "projects", "experience")}
    active = None
    for raw_line in cv_text.splitlines():
        entry = raw_line.strip()
        header_hit = section_for_header.get(entry.lower())
        if header_hit is not None:
            active = header_hit
            continue
        if active and entry:
            collected[active].append(entry)

    # (section key, bullet label, max entries shown) in output order.
    layout = (
        ("experience", "- Experience: ", 3),
        ("projects", "- Key Projects: ", 3),
        ("skills", "- Skills: ", 7),
        ("education", "- Education: ", 2),
    )
    pieces = ["Professional Summary:\n"]
    for key, label, limit in layout:
        if collected[key]:
            pieces.append(label + ", ".join(collected[key][:limit]) + "\n")

    if links:
        pieces.append("\nContact & Links:\n")
        pieces.extend(f"- {link}\n" for link in links)

    return "".join(pieces).strip()


In [None]:
def summary_variants_generator(cv_text, links):
    """Produce three alternative summaries of a CV.

    Returns:
        A ``(concise, structured, linkedin)`` tuple: a model summary of the
        full text (``max_len=80``), the section-based summary from
        ``cv_summary_generator``, and a model summary (``max_len=120``) of a
        LinkedIn-style prompt built from the first 3000 characters of the CV.
    """
    short_version = long_summarize_chunked(cv_text, max_len=80)
    sectioned_version = cv_summary_generator(cv_text, links)

    prompt_text = (
        "Create a LinkedIn-style professional summary from this CV text: "
        + cv_text[:3000]
    )
    linkedin_version = long_summarize_chunked(prompt_text, max_len=120)

    return short_version, sectioned_version, linkedin_version


In [None]:
def executor_agent(task, content=None, pdf_path=None, topic=None):
    """Run a single planned task and return its text result.

    Args:
        task: One of "pdf_extract", "research", "recall_memory", "generate_bio".
        content: Query text for "recall_memory".
        pdf_path: PDF path for "pdf_extract".
        topic: Wikipedia page title for "research".

    Returns:
        The task's result string, or "Unknown task" for any other ``task``.
    """
    if task == "pdf_extract":
        return extract_pdf_text(pdf_path)

    if task == "research":
        article = wiki.page(topic)
        if article.exists():
            return article.summary
        return "No information found."

    if task == "recall_memory":
        if not memory_texts:
            return "No memory stored yet."
        # Nearest stored vector; its row number indexes the parallel memory_texts list.
        _distances, neighbours = index.search(embed(content), k=1)
        return memory_texts[neighbours[0][0]]

    if task == "generate_bio":
        return "AI & Data Science professional skilled in NLP, ML, and Agentic AI systems with expertise in automation, research, and intelligent solutions."

    return "Unknown task"


In [None]:
def save_text_to_file(text, filename):
    """Write ``text`` to ``filename`` as UTF-8 (overwriting it) and print a confirmation."""
    out_handle = open(filename, "w", encoding="utf-8")
    with out_handle:
        out_handle.write(text)
    print(f"Saved output to {filename}")


In [None]:
def agentic_ai(command, pdf_path=None):
    """Plan and run the tasks implied by a free-text command, printing progress.

    Steps, each run only when the planner selected it:
      * pdf_extract (needs ``pdf_path``): extract the PDF text, clean it, collect links.
      * research: look up the text after the last "about" in the command.
      * summarize: build three summary variants, ask the user to choose one via
        ``input()``, store the choice in memory and save it to "CV_Summary.txt".
      * recall_memory: fetch the stored text closest to the command.
      * generate_bio: produce the bio text and save it to "CV_Bio.txt".

    Args:
        command: The user's instruction.
        pdf_path: Optional path of a PDF to process.

    Returns:
        None. Results are printed, and some are written to files.
    """
    print("USER COMMAND:")
    print(command)

    tasks = planner_agent(command)
    print("Planned Tasks:", tasks)

    content = ""
    links = []
    final_outputs = {}

    if "pdf_extract" in tasks and pdf_path:
        print("Extracting PDF text...")
        raw_text = executor_agent("pdf_extract", pdf_path=pdf_path)
        content = clean_cv_text_keep_links(raw_text)
        # Links are taken from the raw text so that cleaning cannot remove them.
        links = extract_links(raw_text)
        print("Cleaned CV Text Preview:", content[:500])
        print("Extracted Links:", links)

    if "research" in tasks:
        # The planner matches keywords case-insensitively, so split on "about" the same
        # way; otherwise "Research About X" would use the whole command as the topic.
        topic = re.split("about", command, flags=re.IGNORECASE)[-1].strip()
        print("Researching topic:", topic)
        content = executor_agent("research", topic=topic)

    if "summarize" in tasks:
        if not content or len(content.split()) < 10:
            print("PDF extraction returned insufficient text.")
        else:
            print("Generating multiple CV summary options...")
            concise, structured, linkedin = summary_variants_generator(content, links)

            print("Option 1: Concise Summary:\n", concise)
            print("Option 2: Structured Summary:\n", structured)
            print("Option 3: LinkedIn-style Summary:\n", linkedin)

            choice = input("Enter choice (1=Concise, 2=Structured, 3=LinkedIn): ").strip()
            # Any unrecognised answer falls back to the structured summary.
            final_choice = {"1": concise, "2": structured, "3": linkedin}.get(choice, structured)

            final_outputs["summary"] = final_choice
            # Embed first: if embedding fails, neither store is touched, so memory_texts
            # positions stay aligned with index rows (recall depends on that alignment).
            summary_vector = embed(final_choice)
            memory_texts.append(final_choice)
            index.add(summary_vector)

            save_text_to_file(final_choice, "CV_Summary.txt")

    if "recall_memory" in tasks:
        print("Recalling from memory...")
        recalled = executor_agent("recall_memory", content=command)
        final_outputs["memory"] = recalled

    if "generate_bio" in tasks:
        print("Generating professional bio...")
        bio = executor_agent("generate_bio")
        final_outputs["bio"] = bio
        save_text_to_file(bio, "CV_Bio.txt")

    print("Final Output:")
    for k, v in final_outputs.items():
        print(f"{k.upper()}:\n{v}\n")


In [None]:
# Driver cell (Colab only): ask the user to upload a CV PDF, then run three demo commands.
print("Upload your CV PDF file:")
uploaded = files.upload()
# Use the first uploaded file name.
# NOTE(review): this raises IndexError if `uploaded` is empty (e.g. nothing was selected) — confirm desired handling.
cv_file = list(uploaded.keys())[0]

# 1) Summarize the CV; this call prompts for a choice via input() and stores the result in memory.
agentic_ai("Summarize my CV from pdf", pdf_path=cv_file)
# 2) Generate the bio text and save it to a file.
agentic_ai("Generate bio from my CV pdf", pdf_path=cv_file)
# 3) Recall from memory; depends on the summary stored by the first call.
agentic_ai("Recall previous memory summary")


Upload your CV PDF file:


Saving Md__Jamil_Khan_CV_ (1).pdf to Md__Jamil_Khan_CV_ (1) (8).pdf
USER COMMAND:
Summarize my CV from pdf
Planned Tasks: ['pdf_extract', 'summarize']
Extracting PDF text...
Cleaned CV Text Preview: MD. JAMIL KHAN
Junior Data Scientist & AI Developer
¯ mdjamilkhan
 jamilEmon
 Portfolio
 Dhaka, Bangladesh
Career Objective
Motivated and results-driven Junior Data Scientist and AI Developer with a strong academic background and
published research in artificial intelligence. Committed to leveraging advanced analytical techniques, machine
learning, and deep learning methodologies to solve complex real-world challenges. Seeking a dynamic and
growth-oriented position where I can contribute to i
Extracted Links: []
Generating multiple CV summary options...
Option 1: Concise Summary:
 MD. JAMIL KHAN Junior Data Scientist & AI Developer ¯ mdjamilkhan  jamilEmon  Portfolio  Dhaka, Bangladesh Career Objective Motivated and results-driven Junior Data Scientist and AI Developer with a strong a