<a href="https://colab.research.google.com/github/harshith-munakala/PDF-_SUMMARY/blob/main/PDF_SUMMARY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import required libraries
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources (only needed once; later runs reuse the cache).
# 'punkt_tab' is the tokenizer data that recent NLTK releases look up for
# word_tokenize; 'punkt' is kept so older releases keep working. Without
# 'punkt_tab' a fresh "Run All" can fail with LookupError on recent NLTK.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        One string holding the text extracted from each page, in page order,
        with no separator inserted between pages. Pages for which the reader
        yields no text (``None`` or an empty string) contribute nothing.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)  # Initialize PDF reader
        # Guard against a page yielding None/empty text (e.g. a page without a
        # text layer): `or ''` keeps the join from raising TypeError.
        return ''.join(page.extract_text() or '' for page in reader.pages)

# Function to clean and tokenize text
def clean_text(text):
    """Reduce a text to its set of lowercase keyword tokens.

    The text is lowercased and split with ``word_tokenize``; a token is kept
    only if it is purely alphabetic and is not an English stopword.

    Args:
        text: The string to process.

    Returns:
        A ``set`` of the surviving tokens (duplicates collapse, order is lost),
        ready for set comparison against another document.
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    keywords = set()
    for token in lowered_tokens:
        if not token.isalpha():
            continue  # drops numbers, punctuation and mixed tokens
        if token in english_stopwords:
            continue
        keywords.add(token)
    return keywords

# Pull the raw text out of the resume PDF and the job-description (JD) PDF
resume_text = extract_text_from_pdf('//22981A42B0_HARSHITH MUNAKALA (1) (2).pdf')
jd_text = extract_text_from_pdf('//JD.pdf')

# Reduce each document to its set of keywords
resume_words = clean_text(resume_text)
jd_words = clean_text(jd_text)

# Keywords present in both documents, and JD keywords the resume lacks
matched_skills = resume_words.intersection(jd_words)
missing_skills = jd_words.difference(resume_words)

# Share of JD keywords that also appear in the resume, as a percentage
# rounded to two decimals; an empty JD keyword set yields 0 instead of
# dividing by zero.
if jd_words:
    match_percentage = round(len(matched_skills) / len(jd_words) * 100, 2)
else:
    match_percentage = 0

# Assemble the skill match report and print it in one go
report_lines = [
    "📄 Resume-JD Skill Match Report",
    "=" * 40,
    f"✅ Matched Skills: {matched_skills}",
    f"❌ Missing Skills: {missing_skills}",
    f"📊 Match Percentage: {match_percentage}%",
]
print("\n".join(report_lines))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


📄 Resume-JD Skill Match Report
✅ Matched Skills: {'python', 'solutions', 'pvt', 'databases', 'developer', 'github', 'strong', 'developers', 'development', 'basic', 'using', 'code', 'communication', 'like', 'data', 'skills', 'applications'}
❌ Missing Skills: {'django', 'india', 'mysql', 'relational', 'written', 'implement', 'hyderabad', 'aws', 'elements', 'design', 'git', 'docker', 'scalability', 'scalable', 'working', 'qualifications', 'flask', 'cloud', 'use', 'company', 'responsibilities', 'good', 'apis', 'backend', 'deployments', 'responsible', 'gcp', 'integrate', 'speed', 'proficiency', 'roles', 'knowledge', 'security', 'platforms', 'description', 'ensure', 'postgresql', 'interchange', 'excellent', 'familiarity', 'experience', 'location', 'users', 'restful', 'job', 'required', 'verbal', 'reusable', 'understanding', 'looking', 'title', 'agile', 'collaborate', 'future', 'optimize', 'technova', 'web', 'ltd', 'preferred', 'build', 'libraries', 'maximum', 'processes', 'protection', 'serv

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import PyPDF2
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK sentence-tokenizer data (cached after the first run).
# 'punkt_tab' is the resource recent NLTK releases look up for sent_tokenize;
# 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Read a PDF and return the text of all its pages as one string.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The page texts concatenated in page order with no separator. Pages
        whose extracted text is empty or ``None`` are skipped.
    """
    with open(pdf_path, 'rb') as pdf_file:
        # Extract every page while the file handle is still open.
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    return "".join(chunk for chunk in page_texts if chunk)

# Function to summarize text using TF-IDF + cosine similarity
def summarize_text(text, num_sentences=5):
    """Build an extractive summary using TF-IDF and cosine similarity.

    Each sentence is scored by the cosine similarity of its TF-IDF vector to
    that of the first sentence, and the ``num_sentences`` highest-scoring
    sentences are returned in their original order. The first sentence is not
    excluded: it is compared with itself, so it normally ranks at the top and
    appears in the summary.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        ``""`` if ``num_sentences`` is zero or negative; ``text`` unchanged if
        it has at most ``num_sentences`` sentences; otherwise the selected
        sentences joined by single spaces.
    """
    # A non-positive count would make the `[-num_sentences:]` slice below
    # select every sentence (for 0) or an arbitrary tail (for negatives).
    if num_sentences <= 0:
        return ""

    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text

    # Create TF-IDF matrix (one row per sentence)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)

    # Compute similarity scores with the first sentence
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix).flatten()

    # Indices of the top-ranked sentences (argsort is ascending, so take the tail)
    top_indices = similarity_scores.argsort()[-num_sentences:]
    top_indices.sort()  # Preserve original order

    # Combine top sentences
    return ' '.join(sentences[i] for i in top_indices)

# --- Main execution ---
# Path of the PDF to summarize; replace this with the path to your PDF.
pdf_path = "//22981A42B0_HARSHITH MUNAKALA (1) (2).pdf"

# Step 1: extract the raw text from the PDF.
text = extract_text_from_pdf(pdf_path)

# Step 2: reduce the text to at most five sentences.
summary = summarize_text(text, num_sentences=5)

# Step 3: display the heading followed by the summary.
print("\n=== PDF Summary ===\n", summary, sep="\n")


=== PDF Summary ===

HARSHITH  MUNAKALA  
 
Visakhapatnam, Andhra Pradesh | harshaasharshith@gmail.com  |+91-
9494444365|  www.linkedin.com/in/harshith -munakala -65149b31a  
 
PROFESSIONAL SUMMARY  
 
Motivated  undergraduate  pursuing  a Bachelor's  in Computer  Science  and Engineering  (CSE  
- AI & ML). EDUCATION  
Raghu  Engineering  College  2022 -Present  
B.Tech  in Computer  Science  (AI & ML) 
Visakhapatnam, Andhra Pradesh  
CGPA:  9.42 
Sri Chaitanya  Junior  College  2020 -2022  
Intermediate MPC 
Visakhapatnam,  Andhra Pradesh 
98% 
SKILLS  
TECHNICAL  OTHERS  
Front -End:  HTML, CSS. NPTEL  Certification  Course  in Programming  in C – October  2023. GitHub: https://github.com/harshith -munakala/password_generator  
2. GitHub: https://github.com/harshith -munakala/qr -code -generator


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Setup: install dependencies and download NLTK data

In [None]:
pip install PyPDF2 nltk


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# One-time NLTK data setup: fetch the tokenizer tables ('punkt_tab') and the
# stopword lists ('stopwords') used by the tokenizing/cleaning cells.
import nltk
nltk.download('punkt_tab')
# Left as the cell's final bare expression, so its return value is what the
# notebook displays as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True