In [32]:
import os
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import PyPDF2
from datetime import datetime
print("Library imported successfully")

Library imported successfully


In [3]:
# CV Parser class
class CVParser:
    """Parse CV content into structured sections"""
    
    def __init__(self):
        self.sections = {}
        self.raw_text = ""
        
    def parse_text(self, text: str):
        """Parse CV text into sections"""
        self.raw_text = text
        self.sections = self._identify_sections(text)
        return self.sections
    
    def _identify_sections(self, text: str) -> Dict[str, str]:
        """Identify and extract CV sections"""
        sections = {
            'contact': '',
            'summary': '',
            'experience': '',
            'education': '',
            'skills': '',
            'projects': '',
            'certifications': '',
            'languages': '',
            'achievements': ''
        }
        
        lines = text.split('\n')
        current_section = None
        buffer = []
        
        # Common section headers
        section_patterns = {
            'contact': ['contact', 'personal', 'details', 'information', 'address', 'phone', 'email'],
            'summary': ['summary', 'objective', 'profile', 'about'],
            'experience': ['experience', 'work', 'employment', 'professional'],
            'education': ['education', 'academic', 'qualifications', 'university', 'college'],
            'skills': ['skills', 'technical', 'competencies', 'expertise'],
            'projects': ['projects', 'portfolio', 'work samples'],
            'certifications': ['certifications', 'certificates', 'licenses', 'courses'],
            'languages': ['languages', 'language'],
            'achievements': ['achievements', 'awards', 'honors', 'publications']
        }
        
        for line in lines:
            line_stripped = line.strip()
            if not line_stripped:
                continue
                
            line_lower = line_stripped.lower()
            
            # Check if this line starts a new section
            section_found = False
            for section, keywords in section_patterns.items():
                if any(keyword in line_lower for keyword in keywords):
                    # Save previous section
                    if current_section and buffer:
                        sections[current_section] = '\n'.join(buffer).strip()
                    
                    # Start new section
                    current_section = section
                    buffer = [line_stripped]
                    section_found = True
                    break
            
            if not section_found and current_section:
                # Continue adding to current section
                buffer.append(line_stripped)
        
        # Save the last section
        if current_section and buffer:
            sections[current_section] = '\n'.join(buffer).strip()
        
        return sections
    
    def display_sections(self):
        """Display parsed sections nicely"""
        display(Markdown("## üìä Parsed CV Sections"))
        
        for section, content in self.sections.items():
            if content:
                display(Markdown(f"### üîπ {section.upper()}"))
                display(Markdown(f"```\n{content[:500]}{'...' if len(content) > 500 else ''}\n```"))

In [4]:
# CV Search engine class

class CVSearchEngine:
    """Search engine for CV content"""
    
    def __init__(self, cv_parser: CVParser):
        self.parser = cv_parser
        self.keyword_index = self._build_index()
        
    def _build_index(self) -> Dict[str, List[str]]:
        """Build keyword index for fast searching"""
        index = {}
        
        for section, content in self.parser.sections.items():
            if content:
                # Extract words (simple tokenization)
                words = re.findall(r'\b\w+\b', content.lower())
                for word in words:
                    if len(word) > 3:  # Only index meaningful words
                        if word not in index:
                            index[word] = []
                        if section not in index[word]:
                            index[word].append(section)
        
        return index
    
    def search(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """Search CV content"""
        query_lower = query.lower()
        results = {}
        
        # Extract keywords from query
        query_keywords = re.findall(r'\b\w+\b', query_lower)
        
        # Find sections containing query keywords
        relevant_sections = set()
        
        for keyword in query_keywords:
            if keyword in self.keyword_index:
                relevant_sections.update(self.keyword_index[keyword])
        
        # Also check section names
        for section_name in self.parser.sections.keys():
            if any(keyword in section_name for keyword in query_keywords):
                relevant_sections.add(section_name)
        
        # Collect results
        for section in relevant_sections:
            content = self.parser.sections.get(section, '')
            if content:
                # Find matching lines
                lines = content.split('\n')
                matching_lines = []
                
                for line in lines:
                    line_lower = line.lower()
                    if any(keyword in line_lower for keyword in query_keywords):
                        matching_lines.append(line.strip())
                
                if matching_lines:
                    results[section] = {
                        'content': content,
                        'matches': matching_lines[:top_k],
                        'match_count': len(matching_lines)
                    }
        
        return results
    
    def display_search_results(self, query: str):
        """Display search results nicely"""
        results = self.search(query)
        
        if not results:
            display(Markdown(f"### ‚ùå No results found for: '{query}'"))
            return
        
        display(Markdown(f"### üîç Search Results for: '{query}'"))
        display(Markdown(f"**Found {len(results)} relevant section(s)**\n"))
        
        for section, data in results.items():
            display(Markdown(f"#### üìå {section.upper()}"))
            display(Markdown(f"**Matches found:** {data['match_count']}"))
            
            for match in data['matches']:
                display(Markdown(f"- {match}"))
            
            display(Markdown("---"))

In [24]:
# Cv Q&A System

class CVQASystem:
    """Question answering system for CV"""
    
    def __init__(self, cv_parser: CVParser):
        self.qa_system = self
        self.parser = cv_parser
        self.search_engine = CVSearchEngine(cv_parser)
        
        # Question patterns mapping
        self.question_patterns = {
            # Skills questions
            r'what.*skill': 'skills',
            r'list.*skill': 'skills',
            r'what.*technology': 'skills',
            r'what.*programming': 'skills',
            r'technical.*skill': 'skills',
            
            # Experience questions
            r'what.*experience': 'experience',
            r'work.*experience': 'experience',
            r'job.*history': 'experience',
            r'where.*work': 'experience',
            r'which.*company': 'experience',
            r'current.*role': 'experience',
            r'previous.*role': 'experience',
            
            # Education questions
            r'what.*education': 'education',
            r'what.*degree': 'education',
            r'where.*study': 'education',
            r'which.*university': 'education',
            r'graduat': 'education',
            
            # Project questions
            r'what.*project': 'projects',
            r'describe.*project': 'projects',
            r'portfolio': 'projects',
            
            # Certification questions
            r'what.*certif': 'certifications',
            r'certificate': 'certifications',
            r'license': 'certifications',
            
            # Contact questions
            r'what.*contact': 'contact',
            r'email': 'contact',
            r'phone': 'contact',
            r'how.*contact': 'contact',
            
            # Summary questions
            r'what.*summary': 'summary',
            r'professional.*summary': 'summary',
            r'about.*me': 'summary',
            
            # Language questions
            r'what.*language': 'languages',
            r'speak.*language': 'languages',
            
            # Achievement questions
            r'what.*achievement': 'achievements',
            r'award': 'achievements',
            r'accomplishment': 'achievements',
        }
    
    def answer_question(self, question: str) -> str:
        """Answer questions about the CV"""
        # First try direct search
        search_results = self.search_engine.search(question)
        
        if search_results:
            return self._format_answer_from_search(question, search_results)
        
        # Try pattern matching
        question_lower = question.lower()
        for pattern, section in self.question_patterns.items():
            if re.search(pattern, question_lower):
                content = self.parser.sections.get(section, '')
                if content:
                    return self._format_section_answer(section, content)
        
        # Return summary if no specific answer found
        return self.get_cv_summary()
    
    def _format_answer_from_search(self, question: str, search_results: Dict[str, Any]) -> str:
        """Format answer from search results"""
        answer_parts = [f"**Based on your CV:**\n"]
        
        for section, data in search_results.items():
            answer_parts.append(f"**üìå {section.upper()}**")
            
            if data['matches']:
                answer_parts.append("**Relevant information:**")
                for match in data['matches']:
                    answer_parts.append(f"- {match}")
            else:
                # Show preview of section
                preview = data['content']
                if len(preview) > 300:
                    preview = preview[:300] + "..."
                answer_parts.append(preview)
            
            answer_parts.append("")
        
        return "\n".join(answer_parts)
    
    def _format_section_answer(self, section: str, content: str) -> str:
        """Format answer for a specific section"""
        answer = f"**üìÑ {section.upper()}**\n\n"
        
        # Format content nicely
        lines = content.split('\n')
        bullet_points = []
        
        for line in lines:
            if line.strip():
                # Check if line looks like a bullet point
                if any(line.strip().startswith(bullet) for bullet in ['-', '‚Ä¢', '*', '‚Üí']):
                    bullet_points.append(line)
                elif len(line) < 100:  # Short lines might be headers
                    bullet_points.append(f"**{line}**")
                else:
                    bullet_points.append(line)
        
        answer += "\n".join(bullet_points[:15])  # Limit to 15 items
        if len(bullet_points) > 15:
            answer += f"\n\n... and {len(bullet_points) - 15} more items"
        
        return answer
    
    def get_cv_summary(self) -> str:
        """Get comprehensive CV summary"""
        summary_parts = ["## üìä CV SUMMARY\n"]
        
        for section, content in self.parser.sections.items():
            if content:
                summary_parts.append(f"### ‚úÖ {section.upper()}")
                
                # Get first few lines
                lines = content.split('\n')
                for line in lines[:3]:
                    if line.strip():
                        summary_parts.append(f"- {line.strip()}")
                
                if len(lines) > 3:
                    summary_parts.append(f"  ... ({len(lines) - 3} more items)")
                
                summary_parts.append("")
        
        # Add statistics
        filled_sections = sum(1 for content in self.parser.sections.values() if content)
        total_lines = sum(len(content.split('\n')) for content in self.parser.sections.values() if content)
        
        summary_parts.append(f"**üìà Statistics:** {filled_sections} sections, {total_lines} total items")
        
        return "\n".join(summary_parts)
    
    def display_answer(self, question: str):
        """Display answer in a formatted way"""
        answer = self.answer_question(question)
        display(Markdown(f"### ‚ùì Question: {question}"))
        display(Markdown("---"))
        display(Markdown(answer))

In [27]:
# Main CV RAG System Class

class CVRAGSystem:
    """Main CV RAG System"""
    
    def __init__(self):
        self.qa_system = self
        self.parser = CVParser()
        self.qa_system = None
        
    def load_cv(self, cv_text: str):
        """Load and parse CV text"""
        self.parser.parse_text(cv_text)
        self.qa_system = CVQASystem(self.parser)
        print(f"‚úÖ CV loaded successfully!")
        print(f"üìä Sections found: {[s for s, c in self.parser.sections.items() if c]}")
    
    def interactive_mode(self):
        """Run interactive mode"""
        display(Markdown("# üéØ CV RAG SYSTEM"))
        display(Markdown("### Interactive Analysis Mode"))
        
        while True:
            display(Markdown("---"))
            
            print("\nOptions:")
            print("1. Ask a question about CV")
            print("2. Search for keywords")
            print("3. View CV summary")
            print("4. View parsed sections")
            print("5. Exit")
            
            choice = input("\nEnter choice (1-5): ").strip()
            
            if choice == "1":
                question = input("\n‚ùì Enter your question: ").strip()
                if question:
                    self.qa_system.display_answer(question)
                    
            elif choice == "2":
                query = input("\nüîç Enter search query: ").strip()
                if query:
                    self.qa_system.search_engine.display_search_results(query)
                    
            elif choice == "3":
                display(Markdown(self.qa_system.get_cv_summary()))
                
            elif choice == "4":
                self.parser.display_sections()
                
            elif choice == "5":
                print("\nüëã Goodbye!")
                break
                
            else:
                print("‚ùå Invalid choice. Please try again.")

In [28]:
# Sample CV Data


# Sample CV for testing (you can replace this with your own CV)
SAMPLE_CV = """Halimat Sadiat Ibrahim - Akinoso DEVELOPER
Ogun, Nigeria | +234 8056159448  +2348027106988  | halimahakin2@gmail.com
LinkedIn: linkedin.com/in/halimah-akinoso/ | GitHub: github.com/halimahAkinoso/

PROFESSIONAL SUMMARY
Detail-oriented and innovative Software Engineer with 3+ years of experience designing and implementing web-based  applications using modern technologies such as JavaScript, Python, React, and Node.js. Recognized for writing clean, maintainable code and delivering features that enhance user experience and system efficiency. Seeking to learn and contribute to high-impact software projects in a collaborative team environment.

WORK EXPERIENCE
Internship
Edu Tam Ltd
October 2024 ‚Äì Present
Built and maintained scalable web applications using React and Node.js for over 10 enterprise clients.
Led migration of legacy codebase to modern JavaScript frameworks, resulting in a 30% performance improvement.
Developed secure RESTful APIs integrated with third-party payment platforms (e.g., Paystack, Flutterwave).
Participated in code reviews, sprint planning, and continuous integration using Jenkins and Docker.

EDUCATION
B.Sc. Computer Science and Education
Olabisi Onabanjo University Ago-iwoye,Ogun state, Nigeria
2004 ‚Äì 2008
ÔÇ∑
Final Year Project: ‚ÄúA Predictive Model for University Admissions Using Machine Learning‚Äù
ÔÇ∑


TECHNICAL SKILLS
Languages: JavaScript, Python, Java
Frontend: React.js, Redux, HTML5, CSS3, Tailwind CSS
Backend: Node.js, , Django, REST APIs
Databases: MySQL
DevOps/Tools: Git, GitHub, Docker, Jenkins, CI/CD
Methodologies: Agile (Scrum), TDD, MVC architecture

PROJECTS
E-Commerce Platform (2022)
- Full-stack development of an online shopping platform with 10,000+ products
- Integrated Stripe payment gateway and inventory management system
- Implemented Redis caching that reduced page load time by 60%
- Technologies: Python, Django, React, PostgreSQL, Redis, AWS

Task Management Application (2021)
- React-based application for team task tracking and project management
- Real-time updates using WebSockets and notifications system
- Used by 50+ team members across multiple departments
- Technologies: React, Node.js, Socket.io, MongoDB

CERTIFICATIONS
AWS Certified Developer - Associate | Amazon Web Services | 2022
Google Cloud Associate Cloud Engineer | Google Cloud | 2021
Scrum Master Certified (SMC) | Scrum.org | 2020

LANGUAGES
Yoruba (Native)
English(Professional Working Proficiency)
Sign language (Limited Working Proficiency)

ACHIEVEMENTS
Employee of the Year 2021 | Tech Solutions Inc.
Best Project Award 2020 | Startup Labs
Published paper on "Optimizing Web Applications" in Tech Journal 2019
"""

print("‚úÖ Sample CV loaded (you can replace with your own)")
print(f"Length: {len(SAMPLE_CV)} characters")

‚úÖ Sample CV loaded (you can replace with your own)
Length: 2664 characters


In [29]:
# Initialize and Test

# Initialize the system
from IPython.display import display, Markdown

rag_system = CVRAGSystem()

# Load sample CV
# rag_system.load_cv(halimah_CV.pdf)
rag_system.load_cv("halimah_CV.pdf")


# Test the system
display(Markdown("## üöÄ System Initialized Successfully!"))
display(Markdown("### Try these example questions:"))
display(Markdown("1. 'What are my skills?'"))
display(Markdown("2. 'What is my work experience?'"))
display(Markdown("3. 'What projects have I worked on?'"))
display(Markdown("4. 'What education do I have?'"))
display(Markdown("5. 'Search for Python'"))

‚úÖ CV loaded successfully!
üìä Sections found: []


## üöÄ System Initialized Successfully!

### Try these example questions:

1. 'What are my skills?'

2. 'What is my work experience?'

3. 'What projects have I worked on?'

4. 'What education do I have?'

5. 'Search for Python'

In [33]:
# Quick Test

# Quick test with sample questions
test_questions = [
    "What are my technical skills?",
    "Tell me about my work experience",
    "What education do I have?",
    "What projects have I completed?"
]

for question in test_questions:
    display(Markdown(f"### Testing: {question}"))
    rag_system.qa_system.display_answer(question)
    print("\n" + "="*60 + "\n")

### Testing: What are my technical skills?

### ‚ùì Question: What are my technical skills?

---

## üìä CV SUMMARY

**üìà Statistics:** 0 sections, 0 total items





### Testing: Tell me about my work experience

### ‚ùì Question: Tell me about my work experience

---

## üìä CV SUMMARY

**üìà Statistics:** 0 sections, 0 total items





### Testing: What education do I have?

### ‚ùì Question: What education do I have?

---

## üìä CV SUMMARY

**üìà Statistics:** 0 sections, 0 total items





### Testing: What projects have I completed?

### ‚ùì Question: What projects have I completed?

---

## üìä CV SUMMARY

**üìà Statistics:** 0 sections, 0 total items





In [None]:
# Start interactive mode
# Start interactive mode
rag_system.interactive_mode()

# üéØ CV RAG SYSTEM

### Interactive Analysis Mode

---


Options:
1. Ask a question about CV
2. Search for keywords
3. View CV summary
4. View parsed sections
5. Exit


In [None]:
# Load CV

def load_your_cv():
    """
    Load your own CV into the system
    Options:
    1. Paste text directly
    2. Load from file
    3. Use the sample CV
    """
    print("üìÅ Load Your CV")
    print("1. Paste CV text directly")
    print("2. Load from .txt file")
    print("3. Use sample CV")
    
    choice = input("\nEnter choice (1-3): ").strip()
    
    cv_text = ""
    
    if choice == "1":
        print("\nüìù Paste your CV text below (press Ctrl+D when finished):")
        lines = []
        try:
            while True:
                line = input()
                lines.append(line)
        except EOFError:
            pass
        cv_text = "\n".join(lines)
        
    elif choice == "2":
        filepath = input("Enter file path: ").strip()
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                cv_text = f.read()
        except Exception as e:
            print(f"‚ùå Error reading file: {e}")
            return None
            
    elif choice == "3":
        cv_text = SAMPLE_CV
        print("‚úÖ Using sample CV")
    
    else:
        print("‚ùå Invalid choice")
        return None
    
    if cv_text:
        rag_system.load_cv(cv_text)
        return cv_text
    
    return None

# Uncomment to use:
your_cv = load_your_cv()

In [12]:
# Export Function

def export_cv_analysis():
    """Export CV analysis to file"""
    if not rag_system.parser.sections:
        print("‚ùå No CV loaded")
        return
    
    filename = input("Enter filename (default: cv_analysis.txt): ").strip()
    if not filename:
        filename = "cv_analysis.txt"
    
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("="*60 + "\n")
            f.write("CV ANALYSIS REPORT\n")
            f.write("="*60 + "\n\n")
            
            f.write("PARSED SECTIONS:\n")
            f.write("="*60 + "\n")
            
            for section, content in rag_system.parser.sections.items():
                if content:
                    f.write(f"\n[{section.upper()}]\n")
                    f.write("-"*40 + "\n")
                    f.write(content + "\n")
            
            f.write("\n" + "="*60 + "\n")
            f.write("CV SUMMARY\n")
            f.write("="*60 + "\n\n")
            f.write(rag_system.qa_system.get_cv_summary())
        
        print(f"‚úÖ Analysis exported to {filename}")
        
    except Exception as e:
        print(f"‚ùå Error exporting: {e}")

# Uncomment to use:
# export_cv_analysis()

In [13]:
# Batch Question Test

def batch_question_test(questions_file: str = None):
    """Test multiple questions at once"""
    if questions_file:
        try:
            with open(questions_file, 'r') as f:
                questions = [line.strip() for line in f if line.strip()]
        except:
            questions = []
    else:
        # Default test questions
        questions = [
            "What programming languages do I know?",
            "Where did I work?",
            "What is my highest education?",
            "What cloud platforms am I certified in?",
            "What languages can I speak?",
            "What frameworks have I used?",
            "Tell me about my achievements"
        ]
    
    display(Markdown("## üìã Batch Question Test"))
    
    for i, question in enumerate(questions, 1):
        display(Markdown(f"### Q{i}: {question}"))
        rag_system.qa_system.display_answer(question)
        display(Markdown("---"))

# Uncomment to use:
# batch_question_test()

In [14]:
# Visualization

# Optional: Simple visualization of CV structure
import matplotlib.pyplot as plt

def visualize_cv_structure():
    """Visualize CV section lengths"""
    if not rag_system.parser.sections:
        print("‚ùå No CV loaded")
        return
    
    sections = []
    lengths = []
    
    for section, content in rag_system.parser.sections.items():
        if content:
            sections.append(section)
            lengths.append(len(content))
    
    if not sections:
        return
    
    plt.figure(figsize=(10, 6))
    bars = plt.barh(sections, lengths, color='skyblue')
    plt.xlabel('Text Length (characters)')
    plt.title('CV Section Sizes')
    plt.grid(axis='x', alpha=0.3)
    
    # Add value labels
    for bar, length in zip(bars, lengths):
        plt.text(bar.get_width() + 10, bar.get_y() + bar.get_height()/2, 
                f'{length}', va='center')
    
    plt.tight_layout()
    plt.show()

# Uncomment to use:
# visualize_cv_structure()

ModuleNotFoundError: No module named 'matplotlib'