In [3]:
!pip install PyMuPDF # for reading PDFs with Python
!pip install torch # for deep learning
!pip install tqdm # for progress bar
!pip install yake # for keyword extraction
!pip install sentence-transformers # for embedding models
!pip install accelerate # for quantization model loading
!pip install bitsandbytes # for quantizing models (less storage space)
!pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference!pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference
!pip install flask # for web application
!pip install language_tool_python

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylin

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
import fitz
import os
import shutil
from tqdm.auto import tqdm # for progress bar
import yake
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
import language_tool_python
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available


def extract_text_from_pdf(directory: str)-> list[dict]:
  """
  Extracts text from all PDF files in a directory.
  """
  resumes = []
  for filename in os.listdir(directory):
      if filename.endswith(".pdf"):
          file_path = os.path.join(directory, filename)
          pdf_document = fitz.open(file_path)
          text = ""
          for page_num, page in tqdm(enumerate(pdf_document)):
              text += page.get_text()
          resumes.append({"filename": filename,
                            "page_count": page_num + 1,
                            "page_char_count": len(text),
                            "page_word_count": len(text.split(" ")),
                            "page_sentence_count_raw": len(text.split(". ")),
                            "page_token_count": len(text) / 4,  # 1 token
                            "text": text})
          pdf_document.close()
  return resumes


def extract_keywords(text : str, numOfKeywords = 100,language = "en",max_ngram_size = 3,deduplication_threshold = 0.9) -> list:
  """
  Extracts keywords from a given text using YAKE.
  """
  custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
  keywords = custom_kw_extractor.extract_keywords(text)
  keywords_only = [kw[0] for kw in keywords]
  return keywords_only

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def match_resumes_to_keywords(resumes, keywords, top_k=5):
  model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
  model.to("cuda") # requires a GPU installed
  # Create embeddings for the keywords
  keyword_embeddings = model.encode(keywords)

  ranked_resumes = []
  for resume in tqdm(resumes):
      # Create embedding for the resume text
      resume_embedding = model.encode(resume['text'])
      total_score = 0

      for keyword_embedding in keyword_embeddings:
          # Calculate the cosine similarity
          score = cosine_similarity([keyword_embedding], [resume_embedding]).flatten()[0]
          total_score += score

      # Normalize the score
      normalized_score = total_score / len(keywords)

      ranked_resumes.append((resume, normalized_score))

  # Sort the resumes by the normalized cosine similarity score in descending order
  ranked_resumes.sort(key=lambda x: x[1], reverse=True)

  return [resume for resume, score in ranked_resumes[:top_k]]

model_id = "google/gemma-2b-it"
use_quantization_config = False

# 1. Create quantization config for smaller model loading.
# For models that require 4-bit quantization (use this if low GPU memory is available)
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

model_id = model_id # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
  llm_model.to("cuda")

def prompt_formatter(context_items) -> str:
  """
  Augments query with text-based context from context_items.

  """
  # Create a base prompt with examples to help the model
  base_prompt = "Summarize the following resume:\n"
  base_prompt += context_items

  # Create prompt template for instruction-tuned model
  dialogue_template = [
      {"role": "user",
      "content": base_prompt}
  ]

  # Apply the chat template
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                        tokenize=False,
                                        add_generation_prompt=True)
  return prompt


def summarize_resume(resume):
  text = resume['text']
  prompt = prompt_formatter(text)
  input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

  # Generate an output of tokens
  outputs = llm_model.generate(**input_ids,
                              temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                              do_sample=True,
                              max_new_tokens=512) # how many new tokens to generate from prompt

  # Turn the output tokens into text
  output_text = tokenizer.decode(outputs[0])
  output_text = output_text.replace(prompt, '')
  return output_text

def format_summary(summary_text):
    lines = summary_text.split('\n')
    html_summary = ''
    for line in lines:
        if line.startswith('*'):
            html_summary += f'<li>{line[1:].strip()}</li>'
        else:
            html_summary += f'<p>{line.strip()}</p>'
    return f'<ul>{html_summary}</ul>'

def analyze_resume(text):
  tool = language_tool_python.LanguageTool('en-US')
  matches = tool.check(text)
  feedback = []
  for match in matches:
      feedback.append({
          "error": match.message,
          "suggestion": match.replacements,
          "context": match.context,
      })
  return feedback



cuda
[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [22]:
def highlights(resume,job_description):
  base_prompt = "Highlight areas where the resume matches or does not match the job description. This is the Resume: "
  base_prompt += resume['text']
  base_prompt += "And this is the job description:"
  base_prompt += job_description
  dialogue_template = [
      {"role": "user",
      "content": base_prompt}
  ]

  # Apply the chat template
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                        tokenize=False,
                                        add_generation_prompt=True)

  input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
  outputs = llm_model.generate(**input_ids,
                            temperature=0.5, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                            do_sample=True,
                            max_new_tokens=512) # how many new tokens to generate from prompt

  # Turn the output tokens into text
  output_text = tokenizer.decode(outputs[0])
  output_text = output_text.replace(prompt, '')
  return output_text

In [27]:
def highlights_keywords(resume,keywords):
  base_prompt = "Highlight areas where the resume matches or does not match the keywords extracted from job description. This is the Resume: "
  base_prompt += resume['text']
  base_prompt += "And this is the keywords from job description: "
  base_prompt += keywords
  dialogue_template = [
      {"role": "user",
      "content": base_prompt}
  ]

  # Apply the chat template
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                        tokenize=False,
                                        add_generation_prompt=True)

  input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
  outputs = llm_model.generate(**input_ids,
                            temperature=0.5, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                            do_sample=True,
                            max_new_tokens=512) # how many new tokens to generate from prompt

  # Turn the output tokens into text
  output_text = tokenizer.decode(outputs[0])
  output_text = output_text.replace(prompt, '')
  return output_text

In [7]:
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(5000)"))

https://7lyzq0kltbj-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [33]:
from flask import Flask, request, render_template
from flask import send_from_directory
import os
import shutil


app = Flask(__name__,template_folder='/content/drive/MyDrive/templates')

@app.route('/')
def home():
  return render_template('index.html')

@app.route('/upload', methods=['POST'])
def upload_resume():
  resume_files = request.files.getlist('resumes')
  job_description = request.form['job_description']
  keywords = request.form.get('keywords')
  top_n = request.form.get('top_n')
  top_k = request.form.get('top_k')

  if not resume_files or (not job_description and not keywords) or not top_n:
    flash('Please upload at least one resume, provide a job description or keywords, and specify the number of top matched resumes.')
    return redirect(url_for('home'))
  top_n = int(top_n)
  if top_n < 1:
      flash('The number of top matched resumes must be at least 1')
      return redirect(url_for('home'))

  resume_directory = '/content/drive/MyDrive/resumes'
  if os.path.exists(resume_directory):
    shutil.rmtree(resume_directory)
  os.makedirs(resume_directory)
  print("Directory created successfully")


  for resume_file in resume_files:
    resume_path = os.path.join(resume_directory, resume_file.filename)
    resume_file.save(resume_path)  # Save the resume file
    print(f"Saved resume: {resume_path}")

  resumes = extract_text_from_pdf(resume_directory)
  print(f"Extracted {len(resumes)} resumes from PDF files.")

  if keywords:
    keywords_list = keywords.split(',')
  else:
    top_k = int(top_k)
    if top_k < 1:
      flash('The number of top extracted keywords be at least 1')
      return redirect(url_for('home'))

    keywords_list = extract_keywords(job_description,top_k)
    print(f"Extracted {len(keywords_list)} keywords from job description.")



  top_n = min(top_n, len(resumes))
  matched_resumes = match_resumes_to_keywords(resumes, keywords_list, top_n)

  results = []
  for resume in matched_resumes:
    output_text = summarize_resume(resume)
    print(f"Summary generated for {resume['filename']}")
    output_text = format_summary(output_text)
    if keywords:
      highlight = highlights_keywords(resume,keywords)
      print(f"Highlights generated for {resume['filename']}")
      highlight = format_summary(highlight)
    else:
      highlight = highlights(resume,job_description)
      print(f"Highlights generated for {resume['filename']}")
      highlight = format_summary(highlight)
    feedback = analyze_resume(resume['text'])
    results.append({
        "filename": resume['filename'],
        "summary": output_text,
        "highlights": highlight,
        "feedback": feedback
    })


  return render_template('result.html', results=results)
# Existing imports and app initialization

# Add this new route to serve the files
@app.route('/download/<filename>')
def download_file(filename):
  # Replace with the directory where resumes are saved/uploaded
  resume_directory = '/content/drive/MyDrive/resumes'
  return send_from_directory(resume_directory, filename)



if __name__ == '__main__':
  app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [18/Jul/2024 10:25:50] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Jul/2024 10:25:51] "[33mGET /styles.css HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [18/Jul/2024 10:25:52] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


Directory created successfully
Saved resume: /content/drive/MyDrive/resumes/dummy5.pdf
Saved resume: /content/drive/MyDrive/resumes/dummy4.pdf
Saved resume: /content/drive/MyDrive/resumes/dummy3.pdf
Saved resume: /content/drive/MyDrive/resumes/dummy2.pdf
Saved resume: /content/drive/MyDrive/resumes/dummy1.pdf


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Extracted 5 resumes from PDF files.


  0%|          | 0/5 [00:00<?, ?it/s]

Summary generated for dummy1.pdf
Highlights generated for dummy1.pdf
Summary generated for dummy2.pdf
Highlights generated for dummy2.pdf


INFO:werkzeug:127.0.0.1 - - [18/Jul/2024 10:27:29] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Jul/2024 10:27:30] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [18/Jul/2024 10:28:16] "GET /download/dummy1.pdf HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Jul/2024 10:28:17] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


Directory created successfully
Saved resume: /content/drive/MyDrive/resumes/dummy5.pdf
Saved resume: /content/drive/MyDrive/resumes/dummy4.pdf
Saved resume: /content/drive/MyDrive/resumes/dummy3.pdf
Saved resume: /content/drive/MyDrive/resumes/dummy2.pdf
Saved resume: /content/drive/MyDrive/resumes/dummy1.pdf


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Extracted 5 resumes from PDF files.


  0%|          | 0/5 [00:00<?, ?it/s]