In [2]:
import markdown
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
import matplotlib.pyplot as plt
import numpy as np
import io
from reportlab.lib.units import inch

# Define the content
name = "Hevardhan"
md_content = f"""
<html>
<head>
    <style>
        body {{
            font-family: Arial, sans-serif;
            color: #333;
        }}
        h1 {{
            color: #0056b3;
        }}
        h2 {{
            color: #0066cc;
        }}
        p {{
            font-size: 14px;
            line-height: 1.6;
        }}
        ul {{
            margin-left: 20px;
        }}
        li {{
            margin-bottom: 5px;
        }}
        .section-title {{
            font-weight: bold;
            margin-top: 20px;
        }}
        .chart {{
            margin-top: 20px;
            text-align: center;
        }}
    </style>
</head>
<body>
    <h1>Report Title</h1>
    <p>Name : {name}</p>
    <p>This is an example report generated with Python and ReportLab.</p>

    <h2>Section 1</h2>
    <p>Here is some sample content for section 1.</p>

    <h2>Section 2</h2>
    <p>More content goes here in section 2.</p>
    <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        <li>Item 3</li>
    </ul>

    <p>Thank you for reading this report!</p>

    <div class="chart">
        <h2 class="section-title">Bar Chart</h2>
        <img src="bar_chart.png" alt="Bar Chart" />
    </div>

    <div class="chart">
        <h2 class="section-title">Pie Chart</h2>
        <img src="pie_chart.png" alt="Pie Chart" />
    </div>
</body>
</html>
"""

# Convert Markdown to HTML
html_content = markdown.markdown(md_content)

# Generate a Bar Chart
def create_bar_chart():
    categories = ['A', 'B', 'C', 'D']
    values = [10, 20, 30, 40]
    
    plt.figure(figsize=(6, 4))
    plt.bar(categories, values, color='blue')
    plt.title('Sample Bar Chart')
    plt.xlabel('Category')
    plt.ylabel('Values')
    
    # Save to a byte stream
    img_stream = io.BytesIO()
    plt.savefig(img_stream, format='png')
    img_stream.seek(0)
    plt.close()
    return img_stream

# Generate a Pie Chart
def create_pie_chart():
    labels = ['Category A', 'Category B', 'Category C']
    sizes = [15, 30, 55]
    
    plt.figure(figsize=(6, 6))
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=['red', 'green', 'blue'])
    plt.title('Sample Pie Chart')
    
    # Save to a byte stream
    img_stream = io.BytesIO()
    plt.savefig(img_stream, format='png')
    img_stream.seek(0)
    plt.close()
    return img_stream

# Create a PDF document
pdf_filename = 'report_with_charts.pdf'
doc = SimpleDocTemplate(pdf_filename, pagesize=letter)

# Add custom styles
styles = getSampleStyleSheet()
style_normal = styles['Normal']
style_heading1 = styles['Heading1']
style_heading2 = styles['Heading2']

# Add background color to the PDF by modifying the page template
def add_background(canvas, doc):
    canvas.setFillColor(colors.lightblue)
    canvas.rect(0, 0, letter[0], letter[1], fill=1)

# Create a Story
story = []

# Add background before the content starts
doc.build(story, onFirstPage=add_background, onLaterPages=add_background)

# Manually process the HTML content into styled paragraphs
lines = html_content.splitlines()

# Process content
for line in lines:
    line = line.strip()
    
    # Skip empty lines
    if not line:
        continue
    
    # Process <h1> and <h2> as headings
    if line.startswith('<h1>'):
        heading = line.replace('<h1>', '').replace('</h1>', '')
        para = Paragraph(heading, style_heading1)
        story.append(para)
        story.append(Spacer(1, 12))  # Spacer after the heading
    elif line.startswith('<h2>'):
        heading = line.replace('<h2>', '').replace('</h2>', '')
        para = Paragraph(heading, style_heading2)
        story.append(para)
        story.append(Spacer(1, 12))
    elif line.startswith('<p>'):
        paragraph = line.replace('<p>', '').replace('</p>', '')
        para = Paragraph(paragraph, style_normal)
        story.append(para)
        story.append(Spacer(1, 12))
    elif line.startswith('<ul>'):
        # Handle list items
        list_items = []
        list_started = False
        while line.startswith('<li>'):
            list_item = line.replace('<li>', '').replace('</li>', '')
            list_items.append(list_item)
            list_started = True
            line = lines.pop(0)  # Get the next line
        if list_started:
            para = Paragraph("<bullet>%s</bullet>" % ', '.join(list_items), style_normal)
            story.append(para)
            story.append(Spacer(1, 12))

# Generate and add the Bar Chart to the PDF
bar_chart_stream = create_bar_chart()
story.append(Spacer(1, 12))  # Add space before the image
story.append(Paragraph("Bar Chart:", style_heading2))

# Convert image to a format suitable for reportlab
bar_chart_image = Image(bar_chart_stream)
bar_chart_image.drawHeight = 3*inch
bar_chart_image.drawWidth = 4*inch
story.append(bar_chart_image)

# Generate and add the Pie Chart to the PDF
pie_chart_stream = create_pie_chart()
story.append(Spacer(1, 12))  # Add space before the image
story.append(Paragraph("Pie Chart:", style_heading2))

# Convert image to a format suitable for reportlab
pie_chart_image = Image(pie_chart_stream)
pie_chart_image.drawHeight = 3*inch
pie_chart_image.drawWidth = 4*inch
story.append(pie_chart_image)

# Build the PDF
doc.build(story)

print(f"PDF report created successfully as '{pdf_filename}'.")


PDF report created successfully as 'report_with_charts.pdf'.


In [13]:
# %%
import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from IPython.display import display, Markdown

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Set environment variable for protobuf
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# %%
excel_path = "job.csv"  # Path to your Excel file
if excel_path:
    df = pd.read_csv(excel_path)
    print(f"Excel file loaded successfully: {excel_path}")
else:
    print("Upload an Excel file")


Excel file loaded successfully: job.csv


In [4]:
df

Unnamed: 0,idx,label,description,skills
0,0,Engineer,Responsible for designing and developing softw...,"programming languages, algorithms, systems arc..."
1,1,Project Manager,"Oversees project planning, execution, and deli...","leadership, communication, risk management, pr..."
2,2,Data Scientist,"Analyzes large datasets to identify trends, pa...","statistical analysis, machine learning, data v..."
3,3,Marketing Specialist,Promotes products and services to customers. R...,"creativity, market research, digital marketing..."
4,4,HR Manager,"Manages employee relations, recruitment, and s...","interpersonal communication, human resources p..."
...,...,...,...,...
95,95,Software Developer,Develops and maintains software applications f...,"programming languages, software development to..."
96,96,Real Estate Project Manager,"Leads real estate development projects, from p...","real estate development, market analysis, zoni..."
97,97,Statistical Researcher,Conducts statistical research to support scien...,"statistical methods, research design, R, Pytho..."
98,98,Marketing Specialist - Trade Show Marketing Co...,Manages marketing activities for trade shows a...,"trade show planning, lead generation, vendor r..."


In [6]:
from langchain.schema import Document  # Import Document
# %%
# Convert rows to text format for processing
# data = [{"text": f"Job Role: {row['label']}\nDescription: {row['description']}\nRequirements: {row['skills']}"} 
#         for index, row in df.iterrows()]

data = [
    Document(page_content=f"Job Role: {row['label']}\nDescription: {row['description']}\nRequirements: {row['skills']}")
    for index, row in df.iterrows()
]

# %%
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(data)
print(f"Text split into {len(chunks)} chunks")

# %%
# Create vector database
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name="local-rag"
)
print("Vector database created successfully")

# %%
local_model = "ally"  # Your preferred model
llm = ChatOllama(model=local_model)

# %%
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""Consider Yourself as Ally, a Assistance Chatbot, Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

# Set up retriever
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

# %%
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# %%
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# %%
def chat_with_excel(question):
    """
    Chat with the Excel data using the RAG chain.
    """
    return display(Markdown(chain.invoke(question)))

# %%
chat_with_excel("Which job role suits me best given my skills?")


Text split into 100 chunks
Vector database created successfully


Based on the provided information, I would recommend the Data Scientist role to you. Although your skills are not explicitly mentioned as being relevant to Natural Language Processing (NLP) or Machine Learning Engineering, your educational background and required skills align well with these areas.

Your experience in statistical analysis, machine learning, data visualization, SQL, and data analytics suggests that you could be proficient in developing predictive models and algorithms to solve business problems. Additionally, your critical thinking skills and ability to work with data would likely serve you well in a Data Scientist role.

While it's worth noting that your background does not explicitly mention NLP or Machine Learning Engineering, many of the required skills for these roles are also applicable to other areas, such as Business Analytics Consulting or Data Science. However, based on the available information, I believe that Data Scientist is the most suitable fit for you.

In [10]:
chat_with_excel("I am looking for a job as an NLP Engineer at Dassault Systemes. Generate a quiz on the skills required for this role.")

Based on the provided job descriptions, here's a quiz to help you assess your skills and qualifications for an NLP (Natural Language Processing) Engineering role at Dassault Systemes:

**Section 1: Language Skills**

1. What is the primary focus of language processing in computer science?
a) Machine learning
b) Natural Language Processing (NLP)
c) Data mining
d) Text analysis

Answer: b) Natural Language Processing (NLP)

2. Which of the following programming languages is commonly used for NLP tasks?
a) Python
b) Java
c) C++
d) JavaScript

Answer: a) Python

3. What is the purpose of tokenization in NLP processing?
a) To remove punctuation and special characters from text
b) To convert text into a standard format for analysis
c) To identify keywords and phrases within text
d) To generate machine learning models

Answer: c) To identify keywords and phrases within text

**Section 2: Computational Skills**

1. What is the primary function of an NLP algorithm?
a) To train machine learning models
b) To perform data analysis
c) To generate predictions based on input data
d) To optimize system performance

Answer: c) To generate predictions based on input data

2. Which of the following is a key challenge in training deep learning models for NLP tasks?
a) Handling noisy or incomplete data
b) Managing large datasets
c) Identifying and mitigating bias
d) Adapting to new languages and dialects

Answer: d) Adapting to new languages and dialects

**Section 3: Domain Knowledge**

1. What is a common application of NLP in industries beyond computer science?
a) Finance and banking
b) Healthcare and medicine
c) Manufacturing and logistics
d) All of the above

Answer: d) All of the above

2. Which of the following domains benefits from the use of NLP techniques?
a) Social media analytics
b) Sentiment analysis for customer service
c) Language translation services
d) All of the above

Answer: d) All of the above

In [11]:
chat_with_excel("what skill i need for, Agile Project Manager")

Based on the job descriptions provided, I'd recommend the following skills to enhance your chances of becoming an Agile Project Manager:

1. **Agile Methodologies**: Familiarity with Agile frameworks such as Scrum or Kanban would be beneficial.
2. **Project Management Tools**: Proficiency in tools like Jira, Trello, Asana, or MS Project would be advantageous.
3. **Leadership and Communication Skills**: Effective leadership and communication skills are essential for managing project teams and stakeholders.
4. **Risk Management**: Experience with risk management techniques, such as identifying, assessing, and mitigating risks, would be valuable.
5. **Time Management and Organization**: The ability to prioritize tasks, manage multiple projects, and meet deadlines is crucial in an Agile environment.

Additionally, having experience with software development tools like Git, or knowledge of systems engineering principles could also be beneficial for a career as an Agile Project Manager.

In [12]:
chat_with_excel("give me python code to print hello world")

As Ally, I can help you with that.

Here's a simple Python code snippet that prints "Hello World":
```python
print("Hello World")
```
When you run this code in your Python environment, it will output:
```
Hello World
```

In [4]:
import pdfplumber
import re
import spacy

# Load spaCy's pre-trained model for NER
nlp = spacy.load("en_core_web_sm")

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Function to extract contact info using regex
def extract_contact_info(text):
    email = re.search(r'\b[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone = re.search(r'\+?\d[\d -]{8,}\d', text)
    linkedin = re.search(r'https?://(www\.)?linkedin\.com/[^\s]+', text)
    github = re.search(r'https?://(www\.)?github\.com/[^\s]+', text)
    website = re.search(r'https?://(www\.)?[^\s]+', text)
    return {
        "email": email.group(0) if email else None,
        "phone": phone.group(0) if phone else None,
        "linkedin": linkedin.group(0) if linkedin else None,
        "github": github.group(0) if github else None,
        "website": website.group(0) if website else None
    }

# Function to extract sections based on keywords
def extract_section(text, section_keyword):
    lines = text.splitlines()
    section_content = []
    capture = False
    for line in lines:
        if section_keyword.lower() in line.lower():
            capture = True
            continue
        if capture:
            # Stop capturing at the next section or empty line
            if any(keyword in line.lower() for keyword in ["experience", "projects", "education", "achievements", "technical skills", "languages", "frameworks"]) or line.strip() == "":
                break
            section_content.append(line.strip())
    return " ".join(section_content)

# Function to extract named entities like name, organization, and locations using NER
def extract_entities(text):
    doc = nlp(text)
    entities = {
        "names": set(),
        "organizations": set(),
        "locations": set()
    }
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities["names"].add(ent.text)
        elif ent.label_ == "ORG":
            entities["organizations"].add(ent.text)
        elif ent.label_ == "GPE":
            entities["locations"].add(ent.text)
    return entities

# Main parsing function
def parse_resume(file_path):
    # Extract the text from the PDF
    text = extract_text_from_pdf(file_path)
    
    # Extract contact info
    contact_info = extract_contact_info(text)
    
    # Extract sections
    education = extract_section(text, "education")
    experience = extract_section(text, "experience")
    projects = extract_section(text, "projects")
    achievements = extract_section(text, "achievements")
    technical_skills = extract_section(text, "technical skills")
    
    # Extract named entities
    entities = extract_entities(text)

    # Print out the parsed information
    print("Contact Info:", contact_info)
    print("\nNames:", entities["names"])
    print("Organizations:", entities["organizations"])
    print("Locations:", entities["locations"])
    print("\nEducation:", education)
    print("\nExperience:", experience)
    print("\nProjects:", projects)
    print("\nAchievements:", achievements)
    print("\nTechnical Skills:", technical_skills)

# Example usage
file_path = "Resume-1.pdf"
parse_resume(file_path)


Contact Info: {'email': 'hevardhan2004@gmail.com', 'phone': '+91 9384565379', 'linkedin': 'https://www.linkedin.com/in/hevardhan-saravanan-33642024a/', 'github': 'https://github.com/hevardhan', 'website': 'https://hevardhan.me/'}

Names: {'Pandas', 'Sklearn', 'Campus', 'Git Jun', 'Django', 'Bollinger Bands', 'Cloud Platform', 'IBM Watson', 'IBM Cloud', 'Vannila', 'Matplotlib', 'Long Short-Term Memory', 'Visual Studio', 'Java', 'PyTorch', 'Cloud Fundamentals', 'Hevardhan Saravanan\n+91', 'Standard Arrival Routes'}
Organizations: {'Support Vector Machine', 'Amazon ML Summer School', 'Oil', 'IBM', 'CNN', 'Sklearn, Tensorflow, Pytorch', 'Google Developer Student Club', 'Education\nSymbiosis Institute of Technology Pune', 'Artificial Intelligence and', 'Sklearn, Tensorflow', '3rd Place', 'Random Forest Regressor', 'SIT', 'SQL', 'HTML', 'HTML/CSS', '• Implemented', 'IEEE Education Society', 'TFT', 'Random Forest Classifier', 'Convolutional Neural\nNetworks', '• Working', 'Logistic Regression

In [7]:
import spacy
import pdfplumber
import docx2txt
import re
from spacy.matcher import Matcher

# Load spaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    return docx2txt.process(file_path)

# Function to extract email and phone number using regex
def extract_contact_info(text):
    email = re.findall(r'\b[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone = re.findall(r'\+?\d[\d -]{8,}\d', text)
    return email[0] if email else None, phone[0] if phone else None

# Define a comprehensive skill keywords list and pattern matcher for multi-word skills
skill_keywords = [
    "Python", "Java", "Machine Learning", "Data Analysis", "SQL",
    "Deep Learning", "Artificial Intelligence", "Natural Language Processing",
    "NLP", "SVM", "Random Forest", "KNN", "Google Colab", "Data Analytics",
    "Excel", "TensorFlow", "PyTorch", "AWS", "Azure", "Docker", "Kubernetes",
    "React", "JavaScript", "CSS", "HTML", "Git", "Flask", "Django",
    "Communication", "Leadership", "Project Management", "Cloud Computing",
    "API Development", "Data Engineering", "Big Data", "Scrum"
]

# Define patterns for Matcher to capture multi-word skills
patterns = [
    [{"LOWER": "machine"}, {"LOWER": "learning"}],
    [{"LOWER": "data"}, {"LOWER": "analysis"}],
    [{"LOWER": "deep"}, {"LOWER": "learning"}],
    [{"LOWER": "artificial"}, {"LOWER": "intelligence"}],
    [{"LOWER": "natural"}, {"LOWER": "language"}, {"LOWER": "processing"}]
]

# Add patterns to the matcher
for pattern in patterns:
    matcher.add("SKILL", [pattern])

# Main function to parse resume and extract specific details
def parse_resume(file_path):
    # Extract text from the resume based on file type
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format")

    # Process text with spaCy's NLP model
    doc = nlp(text)

    # Initialize fields
    person_name = None
    skills = set()
    experience = set()

    # Extract Name and Experience using NER
    for ent in doc.ents:
        if ent.label_ == "PERSON" and not person_name:
            person_name = ent.text  # Assuming the first PERSON entity is the name
        elif ent.label_ == "ORG" or ent.label_ == "GPE":
            experience.add(ent.text)  # ORG represents companies, GPE for geographic locations often part of experience

    # Extract skills using keywords and Matcher patterns
    for token in doc:
        skill = token.text.lower()  # Standardize to lowercase
        if skill in map(str.lower, skill_keywords):
            skills.add(skill)

    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        skills.add(span.text.lower())  # Add multi-word skills in lowercase

    # Extract email and phone number using regex
    email, phone = extract_contact_info(text)

    # Output the extracted information
    print("Name:", person_name)
    print("Email:", email)
    print("Phone:", phone)
    print("Skills:", skills)
    print("Experience:", experience)

# Example usage
file_path = "Resume-1.pdf"  # or "path_to_resume.docx"
parse_resume(file_path)

Name: Hevardhan Saravanan
+91
Email: hevardhan2004@gmail.com
Phone: +91 9384565379
Skills: {'css', 'flask', 'machine learning', 'pytorch', 'tensorflow', 'svm', 'azure', 'java', 'sql', 'artificial intelligence', 'aws', 'deep learning', 'django', 'javascript', 'python', 'html', 'git'}
Experience: {'Support Vector Machine', 'Amazon ML Summer School', 'Flask', 'Oil', 'IBM', 'CNN', 'Sklearn, Tensorflow, Pytorch', 'Portugal', 'Google Developer Student Club', 'Education\nSymbiosis Institute of Technology Pune', 'Artificial Intelligence and', 'Sklearn, Tensorflow', '3rd Place', 'Random Forest Regressor', 'SIT', 'SQL', 'HTML', 'Porto', 'HTML/CSS', '• Implemented', 'IEEE Education Society', 'TFT', 'Node.js', 'Random Forest Classifier', 'Convolutional Neural\nNetworks', '• Working', 'Logistic Regression', 'NumPy', 'SMA', 'Temporal Fusion Transformer', 'Predictive Maintenance', 'Google Solution Challenge', 'JavaScript', '• Gained', 'CSS', 'GDSC', 'India', '• Applied'}


In [5]:
import spacy
import pdfplumber
import re
import docx2txt

# Load spaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    return docx2txt.process(file_path)

# Function to extract email and phone number using regex
def extract_contact_info(text):
    email = re.findall(r'\b[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone = re.findall(r'\+?\d[\d -]{8,}\d', text)
    return email[0] if email else None, phone[0] if phone else None

# Define skill keywords list to capture technical skills
skill_keywords = [
    "Python", "Java", "Machine Learning", "Data Analysis", "SQL", "Data Analytics", "Critical Thinking",
    "Problem Solving", "Leadership", "Communication", "Teamwork", "Risk Management", "Project Management Tools",
    "Statistical Analysis", "Data Visualization", "R", "Tableau", "Big Data", "Agile Methodologies", "Digital Marketing",
    "Marketing Software", "Customer Service", "Interpersonal Communication", "Human Resources Policies", "Recruitment",
    "Software Development Methodologies", "Coding", "Debugging", "Software Testing", "Version Control", "Software Architecture",
    "Agile", "Scrum", "Kanban", "Predictive Modeling", "Social Media Strategy", "Content Creation", "SEO", "Branding",
    "Influencer Marketing", "HR Policies", "Employee Training", "Employee Wellness", "Budget Management", "Financial Analysis",
    "Cost Control", "Strategic Planning", "Data Manipulation", "Excel", "Google Analytics", "Search Engine Rankings",
    "Keyword Research", "Backlinking", "Website Audits", "Content Optimization", "Employee Engagement", "Workplace Safety",
    "Materials Science", "Circuit Design", "Electrical Testing", "Cloud Platforms", "AWS", "Azure", "GCP", "Cloud Security",
    "Project Planning", "Construction Management", "Geotechnical Engineering", "Healthcare Industry Knowledge", "Regulatory Compliance",
    "Biotechnology", "Genetic Engineering", "Bioinformatics", "Cell Culture", "Data Analysis", "Electrical Engineering",
    "Risk Assessment", "Technology Trends", "Mechanical Design Principles", "CAD Software", "Machine Design", "Quality Control",
    "Event Planning", "Team Coordination", "Logistics Management", "Social Media Management", "Content Marketing",
    "Stakeholder Management", "Supply Chain Management", "Lean Manufacturing", "Six Sigma", "Environmental Impact Assessment",
    "Data Cleaning", "Feature Engineering", "Predictive Analytics", "Influencer Selection", "Content Collaboration",
    "Financial Regulations", "Employee Morale", "Supplier Relationship Management", "Project Timelines", "Team Collaboration",
    "IT Project Planning", "ETL Processes", "Big Data Technologies", "Hadoop", "Apache Spark", "Data Mining", "Data Preprocessing",
    "Database Management", "Coding", "Debugging", "Version Control", "Software Testing", "Marketing Strategies", "Data Science",
    "Software Design", "Problem-Solving", "Digital Advertising", "UX/UI Design", "B2B Marketing", "Lead Generation",
    "Public Relations", "Compliance", "Regulatory Policies", "Budget Management", "Business Strategy", "Strategic HR Planning",
    "Employee Retention", "Career Planning", "Mentorship", "Recruitment", "Diversity Management", "Customer Engagement"
]
# Improved function to capture sections based on headings, capturing bullet points for projects
def extract_section(text, keywords):
    section_content = []
    capture = False
    for line in text.splitlines():
        line_lower = line.lower()
        if any(keyword in line_lower for keyword in keywords):
            capture = True  # Start capturing when a section heading is found
        elif capture and (line.strip() == "" or any(keyword in line_lower for keyword in section_keywords)):
            break  # Stop capturing at the next empty line or a new section heading
        elif capture:
            section_content.append(line.strip())
    return "\n".join(section_content)

# Extract skills from text by searching for keywords in the text
def extract_skills(text):
    found_skills = set()
    for skill in skill_keywords:
        if re.search(rf"\b{skill}\b", text, re.IGNORECASE):
            found_skills.add(skill)
    return found_skills

# Define keywords for different sections in ATS-friendly resumes
section_keywords = {
    "education": ["education", "academic"],
    "experience": ["experience", "work history", "employment"],
    "projects": ["projects", "responsibilities"],
    "achievements": ["achievements", "awards", "honors"],
    "skills": ["technical skills", "skills", "expertise","programming language",'languages','framework']
}

# Main function to parse resume and extract details
def parse_resume(file_path):
    # Extract text from the resume based on file type
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format")

    # Process text with spaCy's NLP model
    doc = nlp(text)

    # Extract Name (using the first 'PERSON' entity found)
    person_name = next((ent.text for ent in doc.ents if ent.label_ == "PERSON"), None)

    # Extract email and phone number
    email, phone = extract_contact_info(text)

    # Extract different sections
    education = extract_section(text, section_keywords["education"])
    experience = extract_section(text, section_keywords["experience"])
    projects = extract_section(text, section_keywords["projects"])
    achievements = extract_section(text, section_keywords["achievements"])
    skills = extract_skills(text)

    # Output the extracted information
    print("Name:", person_name)
    print("Email:", email)
    print("Phone:", phone)
    print("Education:", education)
    print("Experience:", experience)
    print("Projects:", projects)
    print("Achievements:", achievements)
    print("Technical Skills:", skills)

# Example usage
file_path = "rohan resume.pdf"  # or "path/to/Resume.docx"
parse_resume(file_path)


Name: Rohan Ingle
Student
Email: rohaningle911@gmail.com
Phone: 9422184621
Education: rohaningle911@gmail.com
B.Tech in Artificial Intelligence and Machine Learning
Symbiosis Institute of Technology, Pune.
9422184621
08/2022 - Present, CGPA: 7.45
Pune, India
Experience: rohan-ingle.github.io/ Research Intern
Symbiosis Center for Behavioral Studies
linkedin.com/in/rohan-ingle- 05/2024 - Present,
b1b457249 Tasks
Using Machine Learning to analyze social media content.
Gathering insights on variety of social media posts.
github.com/Rohan-ingle
Helping in generating a Machine Learning model to understand sponsored posts.
Projects: Matplotlib Seaborn
Road Anomaly and Accident Detection
Sklearn Pandas Realtime detection of Accidents and Potholes.
Cloud Storage
Numpy Linux A Java-based application designed to host a storage system on any computer, complete with a client-side
interface for easy access and management.
AWS Plotly Uses AES encryption to enhance security.
Uses Hashing to store cred

In [17]:
import spacy
import pdfplumber
import docx2txt
import re
from spacy.matcher import Matcher

# Load spaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    return docx2txt.process(file_path)

# Function to extract email and phone number using regex
def extract_contact_info(text):
    email = re.findall(r'\b[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone = re.findall(r'\+?\d[\d -]{8,}\d', text)
    return email[0] if email else None, phone[0] if phone else None

# Helper function to convert string to camel case
def to_camel_case(text):
    words = text.split()
    return words[0].lower() + ''.join(word.capitalize() for word in words[1:])

# Define a comprehensive skill keywords list for various fields
skill_keywords = [
    "Python", "Java", "Machine Learning", "Data Analysis", "SQL", "Data Analytics", "Critical Thinking",
    "Problem Solving", "Leadership", "Communication", "Teamwork", "Risk Management", "Project Management Tools",
    "Statistical Analysis", "Data Visualization", "R", "Tableau", "Big Data", "Agile Methodologies", "Digital Marketing",
    "Marketing Software", "Customer Service", "Interpersonal Communication", "Human Resources Policies", "Recruitment",
    "Software Development Methodologies", "Coding", "Debugging", "Software Testing", "Version Control", "Software Architecture",
    "Agile", "Scrum", "Kanban", "Predictive Modeling", "Social Media Strategy", "Content Creation", "SEO", "Branding",
    "Influencer Marketing", "HR Policies", "Employee Training", "Employee Wellness", "Budget Management", "Financial Analysis",
    "Cost Control", "Strategic Planning", "Data Manipulation", "Excel", "Google Analytics", "Search Engine Rankings",
    "Keyword Research", "Backlinking", "Website Audits", "Content Optimization", "Employee Engagement", "Workplace Safety",
    "Materials Science", "Circuit Design", "Electrical Testing", "Cloud Platforms", "AWS", "Azure", "GCP", "Cloud Security",
    "Project Planning", "Construction Management", "Geotechnical Engineering", "Healthcare Industry Knowledge", "Regulatory Compliance",
    "Biotechnology", "Genetic Engineering", "Bioinformatics", "Cell Culture", "Data Analysis", "Electrical Engineering",
    "Risk Assessment", "Technology Trends", "Mechanical Design Principles", "CAD Software", "Machine Design", "Quality Control",
    "Event Planning", "Team Coordination", "Logistics Management", "Social Media Management", "Content Marketing",
    "Stakeholder Management", "Supply Chain Management", "Lean Manufacturing", "Six Sigma", "Environmental Impact Assessment",
    "Data Cleaning", "Feature Engineering", "Predictive Analytics", "Influencer Selection", "Content Collaboration",
    "Financial Regulations", "Employee Morale", "Supplier Relationship Management", "Project Timelines", "Team Collaboration",
    "IT Project Planning", "ETL Processes", "Big Data Technologies", "Hadoop", "Apache Spark", "Data Mining", "Data Preprocessing",
    "Database Management", "Coding", "Debugging", "Version Control", "Software Testing", "Marketing Strategies", "Data Science",
    "Software Design", "Problem-Solving", "Digital Advertising", "UX/UI Design", "B2B Marketing", "Lead Generation",
    "Public Relations", "Compliance", "Regulatory Policies", "Budget Management", "Business Strategy", "Strategic HR Planning",
    "Employee Retention", "Career Planning", "Mentorship", "Recruitment", "Diversity Management", "Customer Engagement"
]

# Define patterns for Matcher to capture multi-word skills
patterns = [
    [{"lower": "machine"}, {"lower": "learning"}],
    [{"lower": "data"}, {"lower": "analysis"}],
    [{"lower": "data"}, {"lower": "visualization"}],
    [{"lower": "predictive"}, {"lower": "modeling"}],
    [{"lower": "social"}, {"lower": "media"}, {"lower": "strategy"}],
    [{"lower": "content"}, {"lower": "creation"}],
    [{"lower": "search"}, {"lower": "engine"}, {"lower": "optimization"}],
    [{"lower": "project"}, {"lower": "management"}],
    [{"lower": "cloud"}, {"lower": "security"}],
    [{"lower": "big"}, {"lower": "data"}],
    [{"lower": "agile"}, {"lower": "methodologies"}],
    [{"lower": "data"}, {"lower": "preprocessing"}],
    [{"lower": "employee"}, {"lower": "engagement"}],
    [{"lower": "employee"}, {"lower": "training"}],
    [{"lower": "business"}, {"lower": "strategy"}],
    [{"lower": "data"}, {"lower": "mining"}],
    [{"lower": "cloud"}, {"lower": "platforms"}],
    [{"lower": "data"}, {"lower": "cleaning"}],
    [{"lower": "employee"}, {"lower": "morale"}],
    [{"lower": "customer"}, {"lower": "engagement"}],
    [{"lower": "event"}, {"lower": "planning"}],
    [{"lower": "stakeholder"}, {"lower": "management"}],
    [{"lower": "strategic"}, {"lower": "planning"}],
    [{"lower": "employee"}, {"lower": "wellness"}],
    [{"lower": "human"}, {"lower": "resources"}]
]

# Add patterns to the matcher
for pattern in patterns:
    matcher.add("SKILL", [pattern])

# Main function to parse resume and extract specific details
def parse_resume(file_path):
    # Extract text from the resume based on file type
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format")

    # Process text with spaCy's NLP model
    doc = nlp(text)

    # Initialize fields
    person_name = None
    skills = set()
    experience = set()

    # Extract Name and Experience using NER
    for ent in doc.ents:
        if ent.label_ == "PERSON" and not person_name:
            person_name = ent.text  # Assuming the first PERSON entity is the name
        elif ent.label_ == "ORG" or ent.label_ == "GPE":
            experience.add(ent.text)  # ORG represents companies, GPE for geographic locations often part of experience

    # Extract skills using keywords and Matcher patterns
    for token in doc:
        skill = token.text.lower()  # Standardize to lowercase
        if skill in map(str.lower, skill_keywords):
            skills.add(to_camel_case(skill))  # Convert skill to camel case

    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        skills.add(to_camel_case(span.text.lower()))  # Convert multi-word skills to camel case

    # Extract email and phone number using regex
    email, phone = extract_contact_info(text)

    # Output the extracted information
    print("Name:", person_name)
    print("Email:", email)
    print("Phone:", phone)
    
    # Format the skills output in the same form as skill_keywords
    if skills:
        formatted_skills = sorted([skill.capitalize() for skill in skills])
        print("Skills:", formatted_skills)
    else:
        print("No skills matched")

    # Optionally, display experience
    if experience:
        print("Experience:", experience)
    else:
        print("No experience matched")

# Example usage
file_path = "Resume-1.pdf"  # or "path_to_resume.docx"
parse_resume(file_path)

Name: Hevardhan Saravanan
+91
Email: hevardhan2004@gmail.com
Phone: +91 9384565379
Skills: ['Aws', 'Azure', 'Cloudplatforms', 'Java', 'Machinelearning', 'Python', 'R', 'Sql']
Experience: {'Porto', 'Node.js', 'Random Forest Regressor', 'CSS', 'India', 'HTML/CSS', 'Random Forest Classifier', 'SMA', 'HTML', 'Oil', 'CNN', 'Support Vector Machine', 'GDSC', 'Sklearn, Tensorflow', 'TFT', 'Education\nSymbiosis Institute of Technology Pune', 'IEEE Education Society', 'JavaScript', 'NumPy', 'Artificial Intelligence and', '• Implemented', 'Flask', 'Sklearn, Tensorflow, Pytorch', 'IBM', 'Google Developer Student Club', '• Gained', 'Convolutional Neural\nNetworks', '3rd Place', 'Portugal', '• Working', 'SQL', 'Google Solution Challenge', 'Predictive Maintenance', 'Logistic Regression', 'SIT', 'Temporal Fusion Transformer', '• Applied', 'Amazon ML Summer School'}


In [2]:
import spacy
import pdfplumber
import re
import docx2txt

# Load spaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Ensure text exists on the page
                text += page_text + "\n"
    return text

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    return docx2txt.process(file_path)

# Function to extract email and phone number using regex
def extract_contact_info(text):
    email = re.findall(r'\b[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone = re.findall(r'\+?\d[\d -]{8,}\d', text)
    return email[0] if email else None, phone[0] if phone else None

# Improved skill extraction using noun chunks and skill keywords
def extract_skills(doc):
    found_skills = set()
    for chunk in doc.noun_chunks:
        chunk_text = chunk.text.lower()
        for skill in skill_keywords:
            if skill.lower() in chunk_text:
                found_skills.add(skill)
    return found_skills

# Improved section extraction by using patterns and looking for bullet points or phrases
def extract_section(text, keywords):
    section_content = []
    capture = False
    for line in text.splitlines():
        line_lower = line.lower()
        if any(keyword in line_lower for keyword in keywords):
            capture = True
            section_content = []  # Start fresh for each section
            continue
        elif capture and (line.strip() == "" or any(keyword in line_lower for keyword in section_keywords)):
            break
        elif capture:
            section_content.append(line.strip())
    return "\n".join(section_content)

# Extract name, organization, and other entities using refined NER patterns
def extract_entities(doc):
    entities = {
        "names": set(),
        "organizations": set()
    }
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities["names"].add(ent.text)
        elif ent.label_ == "ORG":
            entities["organizations"].add(ent.text)
    # Return only the first name as the primary name
    return list(entities["names"])[0] if entities["names"] else None, entities["organizations"]

# Define keywords for different sections in ATS-friendly resumes
section_keywords = {
    "education": ["education", "academic"],
    "experience": ["experience", "work history", "employment"],
    "projects": ["projects", "responsibilities"],
    "achievements": ["achievements", "awards", "honors"],
    "skills": ["technical skills", "skills", "expertise", "programming languages", "languages", "frameworks"]
}

# Define a refined list of skill keywords to capture technical and soft skills
skill_keywords = [
    "Python", "Java", "Machine Learning", "Data Analysis", "SQL", "Data Analytics", "Critical Thinking", "Problem Solving",
    "Leadership", "Communication", "Teamwork", "Risk Management", "Project Management", "Statistical Analysis", 
    "Data Visualization", "R", "Tableau", "Big Data", "Agile", "Scrum", "Predictive Modeling", "Digital Marketing",
    "Marketing Strategy", "SEO", "Financial Analysis", "Content Marketing", "Cloud Computing", "AWS", "Azure", 
    "Machine Design", "CAD Software", "Quality Control", "UX/UI Design", "Data Science", "Software Testing", "Software Design"
]

# Main function to parse resume and extract details
def parse_resume(file_path):
    # Extract text from the resume based on file type
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format")

    # Process text with spaCy's NLP model
    doc = nlp(text)

    # Extract contact info
    email, phone = extract_contact_info(text)

    # Extract name and organizations
    person_name, organizations = extract_entities(doc)

    # Extract skills
    skills = extract_skills(doc)

    # Extract sections
    education = extract_section(text, section_keywords["education"])
    experience = extract_section(text, section_keywords["experience"])
    projects = extract_section(text, section_keywords["projects"])
    achievements = extract_section(text, section_keywords["achievements"])

    # Output the extracted information
    print("Name:", person_name)
    print("Email:", email)
    print("Phone:", phone)
    print("Organizations:", organizations)
    print("\nEducation:", education)
    print("\nExperience:", experience)
    print("\nProjects:", projects)
    print("\nAchievements:", achievements)
    print("\nTechnical Skills:", skills)

# Example usage
file_path = "Resume-1.pdf"  # or "path/to/Resume.docx"
parse_resume(file_path)


Name: Pandas
Email: hevardhan2004@gmail.com
Phone: +91 9384565379
Organizations: {'Support Vector Machine', 'Amazon ML Summer School', 'Oil', 'IBM', 'CNN', 'Sklearn, Tensorflow, Pytorch', 'Google Developer Student Club', 'Education\nSymbiosis Institute of Technology Pune', 'Artificial Intelligence and', 'Sklearn, Tensorflow', '3rd Place', 'Random Forest Regressor', 'SIT', 'SQL', 'HTML', 'HTML/CSS', '• Implemented', 'IEEE Education Society', 'TFT', 'Random Forest Classifier', 'Convolutional Neural\nNetworks', '• Working', 'Logistic Regression', 'SMA', 'Temporal Fusion Transformer', 'Predictive Maintenance', 'Google Solution Challenge', 'JavaScript', '• Gained', 'CSS', 'GDSC', '• Applied'}

Education: Symbiosis Institute of Technology Pune, India
B.Tech in Artificial Intelligence and Machine Learning Aug. 2022 – June 2026

Experience: Research Intern June 2024 – Present
Symbiosis Centre for Applied AI Pune, India
• Responsible for conducting a research on Predictive Maintenance of urban 

In [37]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load a question generation-specific T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-small-qg-prepend")
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-prepend")

def generate_quiz_question(passage):
    # Preprocess input with a clear instruction for question generation
    input_text = f"question: {passage}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate question with beam search and other parameters for better quality
    outputs = model.generate(input_ids, max_length=50, num_beams=5, temperature=1.0, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return question

# Example passage
passage = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll."
print(generate_quiz_question(passage))


config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



Photosynthesis is the process by which green plants and other organisms use sunlight to synthesize foods with the help of chlorophyll?


In [38]:
passage = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll."
print(generate_quiz_question(passage))

Photosynthesis is the process by which green plants and other organisms use sunlight to synthesize foods with the help of chlorophyll?


In [40]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load pre-trained T5 model and tokenizer (e.g., T5 for question generation)
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-small-qg-prepend")
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-prepend")

def generate_mcq(passage, skill="General Knowledge", difficulty="Medium"):
    """
    Generates an MCQ based on a given passage.
    
    Parameters:
    - passage: The input text for question generation
    - skill: The skill or topic (optional)
    - difficulty: Difficulty level of the question (optional)
    
    Returns:
    - A dictionary containing the question, choices, and answer.
    """
    # Step 1: Generate the question from the passage
    input_text = f"question: {passage}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate question text
    question_output = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
    question = tokenizer.decode(question_output[0], skip_special_tokens=True)
    
    # Step 2: Simulate answer options (in reality, a more complex setup is needed to get diverse options)
    choices = [
        "a. Option 1 related to the topic",
        "b. Option 2 related to the topic",
        "c. Option 3 (correct answer, extracted from passage)",
        "d. Option 4 related to the topic"
    ]
    correct_answer = "c"  # This is set as a placeholder for demonstration

    # Step 3: Output the MCQ in the desired format
    mcq = {
        "Skill": skill,
        "Difficulty Level": difficulty,
        "Question": question,
        "Choices": ", ".join(choices),
        "Answer": correct_answer
    }
    return mcq

# Example passage
passage = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll."
print(generate_mcq(passage, skill="Biology", difficulty="Simple"))


{'Skill': 'Biology', 'Difficulty Level': 'Simple', 'Question': 'Photosynthesis is the process by which green plants and other organisms use sunlight to synthesize foods with the help of chlorophyll?', 'Choices': 'a. Option 1 related to the topic, b. Option 2 related to the topic, c. Option 3 (correct answer, extracted from passage), d. Option 4 related to the topic', 'Answer': 'c'}


In [52]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
import random

# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-small-qg-prepend")
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-prepend")

# Load questions from the CSV file
file_path = 'Skill_MCQs_with_Answers_updated.csv'
mcq_data = pd.read_csv(file_path)

def rephrase_question(passage):
    """Generates a rephrased question from a passage using a Seq2Seq model."""
    input_text = f"question: {passage}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    question_output = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
    question = tokenizer.decode(question_output[0], skip_special_tokens=True)
    return question

def run_quiz_with_seq2seq(num_questions=5):
    """Run a quiz that mixes pre-existing and Seq2Seq-rephrased questions."""
    # Randomly sample questions from the dataset
    questions = mcq_data.sample(n=num_questions).reset_index(drop=True)
    
    for i in range(num_questions):
        # Randomly decide to either use a pre-existing question or rephrase it with Seq2Seq
        use_generated_question = random.choice([True, False])
        
        # if use_generated_question:
            # Rephrase the question using Seq2Seq model
        passage = questions.loc[i, 'Question']
        question = rephrase_question(passage)
        # else:
            # Use the original question from the dataset
            # question = questions.loc[i, 'Question']
        
        # Retrieve choices and correct answer directly from the dataset
        choices = questions.loc[i, 'Choices']
        correct_answer = questions.loc[i, 'Answer'].strip().lower()
        
        # Display the question and choices
        print(f"Question {i+1}: {question}")
        print(f"Choices: {choices}")
        
        # Get the user's answer
        user_answer = input("Enter the correct option (e.g., 'a', 'b', etc.): ").strip().lower()
        
        # Check if the answer is correct
        if user_answer != correct_answer:
            raise ValueError(f"Incorrect! The correct answer was '{correct_answer}'. Quiz ended.")
        else:
            print("Correct!\n")

    print("Congratulations! You answered all questions correctly.")

# Run the quiz
run_quiz_with_seq2seq()




Question 1: Explain standard deviation?
Choices: a. Variation measure, b. Sum, c. Percentage, d. Average


ValueError: Incorrect! The correct answer was 'a'. Quiz ended.

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-small-qg-prepend")
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-prepend")

# Load your data from CSV
file_path = 'Skill_MCQs_with_Answers_updated.csv'
mcq_data = pd.read_csv(file_path)

# Prepare the data for training
def preprocess_data(data):
    """Prepares the data for training"""
    inputs = [f"question: {q}" for q in data['Question']]
    outputs = data['Answer']  # Use the answer column as the output (could be rephrased question)
    return {"input_text": inputs, "output_text": outputs}

# Preprocess data and create dataset
train_data = preprocess_data(mcq_data)
train_dataset = Dataset.from_dict(train_data)

# Tokenize the inputs and outputs
def tokenize_function(examples):
    # Tokenize the input and output pairs
    inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=128)
    outputs = tokenizer(examples["output_text"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Apply tokenization to the dataset
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory to save the model
    evaluation_strategy="epoch",     # Evaluate after each epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=8,   # Batch size per device
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for regularization
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_t5_model")
tokenizer.save_pretrained("./fine_tuned_t5_model")


RuntimeError: Failed to import transformers.models.t5.modeling_t5 because of the following error (look up to see its traceback):
Unable to convert function return value to a Python type! The signature was
	() -> handle

In [3]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import DataLoader, Dataset

# Define your custom dataset
class QGDataset(Dataset):
    def __init__(self, input_texts, output_texts, tokenizer, max_length=128):
        self.input_texts = input_texts
        self.output_texts = output_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        output_text = self.output_texts[idx]

        # Tokenize inputs and outputs
        inputs = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True)
        outputs = self.tokenizer(output_text, max_length=self.max_length, padding="max_length", truncation=True)

        # Prepare input and target tensors
        input_ids = torch.tensor(inputs['input_ids'])
        attention_mask = torch.tensor(inputs['attention_mask'])
        labels = torch.tensor(outputs['input_ids'])

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Initialize model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-small-qg-prepend")
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-prepend")

# Example data
input_texts = ["question: What is Python's GIL?"]
output_texts = ["Explain the Global Interpreter Lock in Python."]

# Prepare dataset and dataloader
dataset = QGDataset(input_texts, output_texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=2)

# Set up optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Training loop
model.train()
for epoch in range(3):  # Number of epochs
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Epoch {epoch}, Loss: {loss.item()}")


RuntimeError: Failed to import transformers.models.t5.modeling_t5 because of the following error (look up to see its traceback):
Unable to convert function return value to a Python type! The signature was
	() -> handle

In [3]:
import pandas as pd
import random

# Load the CSV file
df = pd.read_csv('Skill_MCQs_with_Answers_updated.csv')  # Replace 'questions.csv' with your CSV file path

# List of skills you want to filter by
skills_list = ['{Python', 'Java', 'html']  # Replace with your actual skills

# Filter questions based on the list of skills
filtered_questions = df[df['Skill'].isin(skills_list)]

# Select 3 random questions
random_questions = filtered_questions.sample(n=3)

# Convert the selected questions to the desired format
quiz_data = [
    {
        "question": row['Question'],
        "options": row['Choices'].split(", "),  # Assumes choices are comma-separated in the CSV
        "correct_answer": row['Answer']
    }
    for _, row in random_questions.iterrows()
]

# Display the quiz data
print(quiz_data)


[{'question': "What is the use of the 'final' keyword?", 'options': ['a. To make a variable constant', 'b. Define loop', 'c. Create thread', 'd. None'], 'correct_answer': 'a'}, {'question': 'Explain the concept of JVM.', 'options': ['a. Hardware component', 'b. Memory manager', 'c. Java bytecode interpreter', 'd. None'], 'correct_answer': 'c'}, {'question': 'What is Java?', 'options': ['a. A script', 'b. Operating System', 'c. Programming Language', 'd. Database'], 'correct_answer': 'c'}]
