## 0. Installing essential libraries

In [31]:
!pip install PyMuPDF # for reading PDFs with Python
!pip install tqdm # for progress bar
!pip install yake # for keyword extraction
!pip install sentence-transformers # for embedding models
!pip install accelerate # for quantization model loading
!pip install bitsandbytes # for quantizing models (less storage space)
!pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference!pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference
!pip install language_tool_python

Collecting language_tool_python
  Downloading language_tool_python-2.8-py3-none-any.whl (35 kB)
Installing collected packages: language_tool_python
Successfully installed language_tool_python-2.8


## 1. Data Ingestion

In [2]:
import fitz
import os
from tqdm.auto import tqdm # for progress bar

In [3]:
%%time
def extract_text_from_pdf(directory: str)-> list[dict]:
  """
  Extracts text from all PDF files in a directory.
  """
  resumes = []
  for filename in os.listdir(directory):
      if filename.endswith(".pdf"):
          file_path = os.path.join(directory, filename)
          pdf_document = fitz.open(file_path)
          text = ""
          for page_num, page in tqdm(enumerate(pdf_document)):
              text += page.get_text()
          resumes.append({"filename": filename,
                            "page_count": page_num + 1,
                            "page_char_count": len(text),
                            "page_word_count": len(text.split(" ")),
                            "page_sentence_count_raw": len(text.split(". ")),
                            "page_token_count": len(text) / 4,  # 1 token
                            "text": text})
          pdf_document.close()
  return resumes

resumes = extract_text_from_pdf('/content/')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

CPU times: user 197 ms, sys: 10.7 ms, total: 207 ms
Wall time: 287 ms


In [4]:
%%time
def read_txt_file(file_path: str)-> str:
  """
  Reads a text file and returns its contents as a string.
  """
  with open(file_path, 'r') as file:
    text = file.read()
  return text

file_path = '/content/desc.txt'
description = read_txt_file(file_path)

CPU times: user 1 ms, sys: 0 ns, total: 1 ms
Wall time: 919 µs


In [5]:
description

"Job Summary:\nWe are seeking a highly skilled Data Scientist to join our team, proficient in leveraging data-driven insights to inform business decisions and drive growth. As a Data Scientist, you will be responsible for collecting, analyzing, and interpreting large datasets, developing predictive models, and implementing machine learning algorithms to solve complex business problems.\nKey Responsibilities:\nDesign and implement data pipelines, architectures, and workflows to ingest, process, and visualize large datasets\nDevelop and deploy machine learning models using supervised, unsupervised, and deep learning techniques\nCollaborate with cross-functional teams to identify business problems, develop solutions, and measure KPIs\nCommunicate technical insights and recommendations to non-technical stakeholders through data visualization and storytelling\nStay up-to-date with industry trends, emerging technologies, and advancements in AI, ML, and data science\nRequirements:\nMaster's d

In [6]:
resumes[3]['text']

'KANDACE LOUDOR\nDATA SCIENTIST\nCONTACT\nkloudor@email.com\n(123) 456-7890\nMount Laurel, NJ\nLinkedIn\nGithub\nEDUCATION\nB.S.\nStatistics\nRutgers University\nSeptember 2011 - April 2015\nNew Brunswick, NJ\nSKILLS\nPython (NumPy, Pandas,\nScikit-learn, Keras, Flask)\nSQL (MySQL, Postgres)\nGit\nTime Series Forecasting\nProductionizing Models\nRecommendation Engines\nCustomer Segmentation\nAWS\nWORK EXPERIENCE\nData Scientist\nGrubhub\nJune 2018 - current / Princeton, NJ\nDeployed a recommendation engine to production to\nconditionally recommend other menu items based on past order\nhistory, increasing average order size by 7%\nImplemented various time series forecasting techniques to\npredict surge in orders, lowering customer wait by 10 minutes\nDesigned a model in a pilot to increase incentives for drivers\nduring peak hours, increasing driver availability by 22%\nLed a team of 3 data scientist to model the ordering process 5\nunique ways, reported results, and made recommendation

## 2. Initiating LLM:

In [16]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [36]:
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 15 GB


In [37]:
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.


In [None]:
!git config --global credential.helper 'store --file ~/.my-credentials'

In [38]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [39]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# 1. Create quantization config for smaller model loading.
# For models that require 4-bit quantization (use this if low GPU memory is available)
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use
model_id = model_id # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to(device)

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

In [None]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

2506172416

In [None]:
def get_model_mem_size(model: torch.nn.Module):
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 5012354048, 'model_mem_mb': 4780.15, 'model_mem_gb': 4.67}

## 3. Keyword Extraction and Matching:

**2.1 Using YAKE**

In [None]:
import yake

def extract_keywords_yake(text : str, numOfKeywords = 100,language = "en",max_ngram_size = 3,deduplication_threshold = 0.9) -> list:
  """
  Extracts keywords from a given text using YAKE.
  """
  custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
  keywords = custom_kw_extractor.extract_keywords(text)
  keywords_only = [kw[0] for kw in keywords]
  return keywords_only


In [None]:
%%time
keywords = extract_keywords_yake(description,70)

CPU times: user 689 ms, sys: 11.1 ms, total: 701 ms
Wall time: 705 ms


In [None]:
keywords

['Job Summary',
 'Data',
 'Data Scientist',
 'data science',
 'Certified Data Scientist',
 'skilled Data Scientist',
 'science',
 'Scientist',
 'Summary',
 'business problems',
 'business',
 'large datasets',
 'data visualization',
 'Certified Analytics Professional',
 'highly skilled Data',
 'learning',
 'machine learning',
 'inform business decisions',
 'Certified Data',
 'Data science frameworks',
 'experience',
 'Computer Science',
 'Job',
 'Certified',
 'complex business problems',
 'datasets',
 'skilled Data',
 'Key Responsibilities',
 'problems',
 'Certified Analytics',
 'inform business',
 'business decisions',
 'large',
 'Scientist to join',
 'machine',
 'interpreting large datasets',
 'machine learning models',
 'Develop',
 'models',
 'identify business problems',
 'insights',
 'related field',
 'leveraging data-driven insights',
 'Analytics Professional',
 'solve complex business',
 'field',
 'proficient in leveraging',
 'drive growth',
 'complex business',
 'implementing ma

**2.2 Using Tf-idf**

In [None]:
%%time
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import string
import nltk

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocess text
def preprocess(text: str):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if token.text not in stopwords.words('english') and token.text not in string.punctuation and not token.is_stop]
    return ' '.join(tokens)

processed_text = preprocess(description)

# Extract keywords using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([processed_text])
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray().flatten()

# Get keywords with the highest TF-IDF scores
keywords_ = {feature_names[i]: tfidf_scores[i] for i in range(len(feature_names))}
sorted_keywords = sorted(keywords_.items(), key=lambda x: x[1], reverse=True)

# Print top N keywords
top_n = 100  # Number of top keywords to display
for keyword, score in sorted_keywords[:top_n]:
    print(f"{keyword}: {score}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


datum: 0.5869391856534222
science: 0.2934695928267111
data: 0.1956463952178074
business: 0.14673479641335554
develop: 0.14673479641335554
experience: 0.14673479641335554
learn: 0.14673479641335554
scientist: 0.14673479641335554
certify: 0.0978231976089037
database: 0.0978231976089037
dataset: 0.0978231976089037
drive: 0.0978231976089037
field: 0.0978231976089037
implement: 0.0978231976089037
insight: 0.0978231976089037
large: 0.0978231976089037
machine: 0.0978231976089037
model: 0.0978231976089037
problem: 0.0978231976089037
process: 0.0978231976089037
related: 0.0978231976089037
team: 0.0978231976089037
technical: 0.0978231976089037
visualization: 0.0978231976089037
advancement: 0.04891159880445185
agile: 0.04891159880445185
ai: 0.04891159880445185
algorithm: 0.04891159880445185
analytic: 0.04891159880445185
analyze: 0.04891159880445185
architecture: 0.04891159880445185
aws: 0.04891159880445185
azure: 0.04891159880445185
base: 0.04891159880445185
bi: 0.04891159880445185
cap: 0.0489115

We have used to method to extract the keyword from the Job Descripton.
> 1. YAKE (Yet Another Keyword Extractor)
> 2. Tf-Idf (Term Frequency - Inverse Document Frequency)

YAKE focuses on the specific role, finding the most relevant keywords even if they're not common across all jobs. It's also simpler to use, making it ideal for quickly understanding the key skills and requirements.

Now, let's use LLM to do the task.

In [None]:
import re

def extract_keywords_llm(text: str) -> list:

  base_prompt = f"""Task is to given any job description you have to extract all the keywords.
  Give yourself room to think of keywords from the description before extracting.
  Don't return the thinking, only return the keywords.
  Make sure your extract all the keywords.
  Now extract all the keywords(don't miss anyone) from the following text:
  Extract keywords from a job description - Input :{text}"""
  dialogue_template = [
      {"role": "user",
      "content": base_prompt}
  ]
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                        tokenize=False,
                                        add_generation_prompt=True)

  input_ids = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = llm_model.generate(**input_ids,
                            temperature=0.5,
                            do_sample=True,
                            max_new_tokens=512)

  output_text = tokenizer.decode(outputs[0])
  output_text = output_text.replace(prompt, '')
  return output_text.split(', ')




In [None]:
%%time
keywords = extract_keywords_llm(description)

CPU times: user 5.21 s, sys: 20.6 ms, total: 5.23 s
Wall time: 5.33 s


In [None]:
keywords

['<bos>Sure',
 'here are the keywords from the job description:\n\n- Data Scientist\n- Data-driven insights\n- Business decisions\n- Growth\n- Data collection\n- Data analysis\n- Machine learning\n- Data pipelines\n- Data architectures\n- Data workflows\n- Data visualization\n- Business problems\n- KPIs\n- Technical insights\n- Data storytelling\n- Industry trends\n- Emerging technologies\n- AI\n- ML\n- Data science\n- Programming languages\n- Data science frameworks\n- Data visualization tools\n- Cloud-based data platforms\n- Agile project management methodologies\n- Scrum\n- Kanban\n- Data modeling\n- Data warehousing\n- ETL processes\n- NoSQL databases\n- Graph databases\n- Data governance<eos>']

But the problem with a LLM is we can face too much inconsistency.

Let's use a pre-trained model.

In [13]:
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np

# Define keyphrase extraction pipeline
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )

    def postprocess(self, all_outputs):
        results = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])


In [14]:
def extract_keywords_model(text: str) -> list:
  model_name = "ml6team/keyphrase-extraction-kbir-inspec"
  extractor = KeyphraseExtractionPipeline(model=model_name, device = device)
  text = text.replace("\n", " ")
  keyphrases = extractor(text)
  return keyphrases

In [17]:
%%time
keywords = extract_keywords_model(description).tolist()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

CPU times: user 3.69 s, sys: 3.85 s, total: 7.54 s
Wall time: 19.3 s


In [18]:
keywords

['Certified Analytics Professional',
 'Certified Data Scientist',
 'Data Science',
 'Data Scientist',
 'Data science',
 'Data visualization',
 'DevOps',
 'Is',
 'Kanban',
 'Kubernetes',
 'NoSQL databases',
 'PyTorch',
 'Science',
 'Scrum',
 'Statistics',
 'Tableau',
 'TensorFlow',
 'complex business problems',
 'containerization',
 'data governance',
 'data modeling',
 'data pipelines',
 'data science',
 'data visualization',
 'data warehousing',
 'deep learning',
 'graph databases',
 'machine learning',
 'predictive models',
 'storytelling',
 'version control systems']

## 4. Matching Resume

In [11]:
import re  # For regular expressions operations
from nltk.corpus import stopwords  # For removing stopwords from text data
import nltk
nltk.download('stopwords')

def clean(text):
    """
    Clean the input text by removing URLs, emails, special characters, and stop words.

    :param text: The string to be cleaned
    :return: The cleaned string
    """

    # Compile patterns for URLs and emails to speed up cleaning process
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')

    # Remove URLs
    clean_text = url_pattern.sub('', text)

    # Remove emails
    clean_text = email_pattern.sub('', clean_text)

    # Remove special characters (keeping only words and whitespace)
    clean_text = re.sub(r'[^\w\s]', '', clean_text)

    # Remove stop words by filtering the split words of the text
    stop_words = set(stopwords.words('english'))
    clean_text = ' '.join(word for word in clean_text.split() if word.lower() not in stop_words)

    return clean_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
def match_resumes_to_keywords(resumes, keywords, top_k):
    ranked_resumes = []
    for resume in resumes:
        resume_keywords = extract_keywords_model(clean(resume['text']))
        score = sum(1 for keyword in keywords if keyword in resume_keywords)
        ranked_resumes.append((resume, score))
    ranked_resumes.sort(key=lambda x: x[1], reverse=True)
    return [resume for resume, score in ranked_resumes[:top_k]]

In [20]:
%%time
matched_resumes = match_resumes_to_keywords(resumes,keywords,2)

CPU times: user 6.08 s, sys: 4.42 s, total: 10.5 s
Wall time: 14.9 s


In [21]:
matched_resumes

[{'filename': 'Sanju_Resume.pdf',
  'page_count': 2,
  'page_char_count': 2741,
  'page_word_count': 314,
  'page_sentence_count_raw': 4,
  'page_token_count': 685.25,
  'text': 'Sanju Sarkar\n+91 8942800801 | sanjusarkar44@hotmail.com | linkedin.com/in/sanjusarkar | github.com/itzsanjus\nSUMMARY\nPassionate in AI/ML domain with a strong background in deep learning, computer vision, and natural language processing.\nSkilled in Python, PyTorch, and various ML libraries. Excellent problem-solving, research, and collaboration abilities.\nSeeking a challenging role to develop cutting-edge AI solutions.\nTECHNICAL SKILLS\nProgramming Languages: Python, C, SQL, R, Prolog\nIndustry Knowledge: Generative AI, Machine Learning, Deep Learning, LLM, CV, NLP\n, MLOps, OOP\nLibraries & Tools: NumPy, Pandas, PyTorch, Langchain, Scikit-learn, OpenCV, NLTK, Git, Docker, Fast API, Streamlit\nPROJECTS\nDeveloped a Chat bot for Kalyani University Notice Query Resolution\nMay 2024 – June 2024\nRAG Project\

In [24]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def match_resumes_to_keywords(resumes, keywords, top_k):
    model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    model.to(device) # requires a GPU installed
    # Create embeddings for the keywords
    keyword_embeddings = model.encode(keywords)

    ranked_resumes = []
    for resume in tqdm(resumes):
        # Create embedding for the resume text
        resume_embedding = model.encode(clean(resume['text']))
        total_score = 0

        for keyword_embedding in keyword_embeddings:
            # Calculate the cosine similarity
            score = cosine_similarity([keyword_embedding], [resume_embedding]).flatten()[0]
            total_score += score

        # Normalize the score
        normalized_score = total_score / len(keywords)

        ranked_resumes.append((resume, normalized_score))

    # Sort the resumes by the normalized cosine similarity score in descending order
    ranked_resumes.sort(key=lambda x: x[1], reverse=True)

    # Print normalized scores and return the top_k resumes
    for resume, score in ranked_resumes:
        print(f"Resume: {resume['text']}\nNormalized Score: {score:.4f}\n")

    return [resume for resume, score in ranked_resumes[:top_k]]


In [25]:
%%time
matched_resumes = match_resumes_to_keywords(resumes,keywords,2)

  0%|          | 0/6 [00:00<?, ?it/s]

Resume: TYLER RUSSELL
Data Science Intern
t.russell@email.com
(123) 456-7890
Seattle, WA
LinkedIn
Github
EDUCATION
Bachelor of Science
Informatics
University of Washington
2021 - current
Seattle, WA
SKILLS
Python
Jupyter Notebook
Pandas
Scikit-learn
Excel
SQL Server
AWS
Apache Spark
CAREER OBJECTIVE
A future-driven and methodical individual with data entry experience and a
knack for solving problems in data hackathons, seeking a data science internship
at Talus Bio. Passionate about leveraging data for innovation in biotech, I aim to
use my technical skills to contribute to bioinformatics research and support
Talus Bio's strategic goals.
WORK EXPERIENCE
Data Entry Clerk
Zillow Group
2022 - 2023
Seattle, WA
Generated and maintained an accurate property listing database for 576
properties using Excel, reducing data entry errors by 14%
Utilized SQL Server to query and retrieve specific property data for
analysis, saving the team an average of two hours per week
Assisted in data cleaning t

In [26]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def match_resumes_to_job_description(resumes, job_description, top_k):
    model = SentenceTransformer('all-MiniLM-L6-v2', device = device)
    model.to(device) # requires a GPU installed

    # Create embedding for the job description
    job_description_embedding = model.encode(job_description)

    ranked_resumes = []
    for resume in tqdm(resumes):
        # Create embedding for the resume text
        resume_embedding = model.encode(clean(resume['text']))

        # Calculate the cosine similarity
        score = cosine_similarity([job_description_embedding], [resume_embedding]).flatten()[0]

        ranked_resumes.append((resume, score))

    # Sort the resumes by the cosine similarity score in descending order
    ranked_resumes.sort(key=lambda x: x[1], reverse=True)

    # Print normalized scores and return the top_k resumes
    max_score = ranked_resumes[0][1] if ranked_resumes else 1
    for resume, score in ranked_resumes:
        normalized_score = score / max_score  # Normalize scores
        print(f"Resume: {resume['text']}\nNormalized Score: {normalized_score:.4f}\n")

    return [resume for resume, score in ranked_resumes[:top_k]]

In [None]:
%%time
matched_resumes = match_resumes_to_job_description(resumes,description,3)

100%|██████████| 6/6 [00:00<00:00, 83.53it/s]

Resume: TYLER RUSSELL
Data Science Intern
t.russell@email.com
(123) 456-7890
Seattle, WA
LinkedIn
Github
EDUCATION
Bachelor of Science
Informatics
University of Washington
2021 - current
Seattle, WA
SKILLS
Python
Jupyter Notebook
Pandas
Scikit-learn
Excel
SQL Server
AWS
Apache Spark
CAREER OBJECTIVE
A future-driven and methodical individual with data entry experience and a
knack for solving problems in data hackathons, seeking a data science internship
at Talus Bio. Passionate about leveraging data for innovation in biotech, I aim to
use my technical skills to contribute to bioinformatics research and support
Talus Bio's strategic goals.
WORK EXPERIENCE
Data Entry Clerk
Zillow Group
2022 - 2023
Seattle, WA
Generated and maintained an accurate property listing database for 576
properties using Excel, reducing data entry errors by 14%
Utilized SQL Server to query and retrieve specific property data for
analysis, saving the team an average of two hours per week
Assisted in data cleaning t




In [27]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import torch
from transformers import AutoTokenizer

def match_token_resumes_to_keywords(resumes, keywords, top_k=5, model_id = 'sentence-transformers/all-mpnet-base-v2', device='cuda'):
    model = SentenceTransformer(model_id, device=device)
    model.to(device)  # requires a GPU installed

    # Initialize the Hugging Face tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Create embeddings for the keywords
    keyword_embeddings = model.encode(keywords)

    ranked_resumes = []
    for resume in tqdm(resumes):
        # Tokenize the resume text into words using the Hugging Face tokenizer
        tokens = tokenizer.tokenize(clean(resume['text']))
        token_embeddings = model.encode(tokens)

        match_count = 0

        for keyword_embedding in keyword_embeddings:
            for token_embedding in token_embeddings:
                # Calculate the cosine similarity
                score = cosine_similarity([keyword_embedding], [token_embedding]).flatten()[0]
                if score > 0.9:
                    match_count += 1
                    break  # Stop after the first match for this keyword

        # Calculate the percentage of keywords that matched
        match_percentage = match_count / len(keywords)

        ranked_resumes.append((resume, match_percentage))

    # Sort the resumes by match percentage in descending order
    ranked_resumes = sorted(ranked_resumes, key=lambda x: x[1], reverse=True)

    # Return the top_k resumes
    return [resume for resume, match_percentage in ranked_resumes[:top_k]]

top_resumes = match_token_resumes_to_keywords(resumes, keywords, 3)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 6/6 [00:18<00:00,  3.01s/it]


In [28]:
top_resumes

[{'filename': 'dummy3.pdf',
  'page_count': 1,
  'page_char_count': 2581,
  'page_word_count': 295,
  'page_sentence_count_raw': 2,
  'page_token_count': 645.25,
  'text': "Trish Mathers\nEntry-Level Data Scientist\nInnovative and scientifically rigorous graduate with significant data science\ninternship experience to bring to the table. With a team-oriented attitude,\nI'm eager to contribute my abilities in quantitative modeling and\nexperimentation to enhance the experience of global Pinterest users.\ntmathers@email.com\n(123) 456-7890\nBellevue, WA\nLinkedIn\nWORK EXPERIENCE\nNiantic\nData Scientist Intern\nSeattle, WA | April 2022 - December 2022\nDeveloped a program in SAS that automated refinement of linear\nregression models for specific segments of a customer base that\nsaved 22 hours of labor per month.\nReceived, cleaned, and prepped data from client using SAS, SQL, and\nExcel to help data scientists build marketing mix models that resulted\nin a lift in ROI of 10 basis point

I have used different methods to find top matches with job description.
> 1. Matching every single keyword exracted from the description with resume and ranking them. But the problem here is it can't catch semantic information.
>2. Used cosine similarity to calculate semantic similarity each keywords from job description with all resumes and then ranking.
>3. Used cosine similarity to calculate similarity with resume and job description. But, that doesn't make sense as the main focus is to match with keywords.
>4. In this process, the resume is tokenized and calculated each with keyword with each tokens and giving a score if it crosses the threshold and ranking the resumes after iterating them. Best result has been found.

## 5. Summarization:

In [42]:
def prompt_formatter(context_items) -> str:
  """
  Augments query with text-based context from context_items.

  """
  # Create a base prompt with examples to help the model
  base_prompt = "Summarize the following resume and identify key sections:\n"
  base_prompt += context_items

  # Create prompt template for instruction-tuned model
  dialogue_template = [
      {"role": "user",
      "content": base_prompt}
  ]

  # Apply the chat template
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                        tokenize=False,
                                        add_generation_prompt=True)
  return prompt

In [40]:
def summarize_resume(resume):
  text = resume['text']
  prompt = prompt_formatter(text)
  input_ids = tokenizer(prompt, return_tensors="pt").to(device)

  # Generate an output of tokens
  outputs = llm_model.generate(**input_ids,
                              temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                              do_sample=True,
                              max_new_tokens=512) # how many new tokens to generate from prompt

  # Turn the output tokens into text
  output_text = tokenizer.decode(outputs[0])
  output_text = output_text.replace(prompt, '')
  return output_text


In [41]:
def format_summary(summary_text):
    lines = summary_text.split('\n')
    html_summary = ''
    for line in lines:
        if line.startswith('*'):
            html_summary += f'<li>{line[1:].strip()}</li>'
        else:
            html_summary += f'<p>{line.strip()}</p>'
    return f'<ul>{html_summary}</ul>'

In [None]:
%%time
for resume in top_resumes:
  print('\n')
  print(f"Summary for the file {resume['filename']}")
  print(summarize_resume(resume))
  print('\n')



Summary for the file dummy3.pdf
<bos>**Summary:**

Entry-level Data Scientist with a strong track record of data science internship experience. Possesses both quantitative and qualitative skills, including programming, data manipulation, and analysis. Proven ability to develop and implement data-driven solutions to enhance user experience on a large platform like Pinterest.<eos>




Summary for the file Sanju_Resume.pdf
<bos>**Summary:**

Sanju Sarkar is a highly skilled Artificial Intelligence (AI)/Machine Learning (ML) professional with a strong background in deep learning, computer vision, and natural language processing. He possesses deep expertise in programming languages, libraries, and tools, and has completed several impressive projects.

**Key Skills:**

- Programming Languages: Python, C, SQL, R, Prolog
- Libraries & Tools: NumPy, Pandas, PyTorch, Langchain, Scikit-learn, OpenCV, NLTK, Git, Docker, Fast API, Streamlit
- Projects:
    - Developed a chat bot for Kalyani Unive

## 6. Detailed Analysis and Feedback Generation:

In [32]:
import language_tool_python

def analyze_resume(text):
    tool = language_tool_python.LanguageTool('en-US')
    text = clean(text)
    matches = tool.check(text)
    feedback = []
    for match in matches:
        feedback.append({
            "error": match.message,
            "suggestion": match.replacements,
            "context": match.context,
        })
    return feedback


In [33]:
print(analyze_resume(top_resumes[0]['text']))

Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:04<00:00, 50.0MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpveuv2rhy.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /root/.cache/language_tool_python.


[{'error': 'Possible spelling mistake found.', 'suggestion': ["Mather's", 'Mather', 'Matters', 'Mothers', 'Fathers', 'Gathers', 'Mathews', 'Bathers', 'Maters', 'Matchers', 'Lathers', 'Mashers', 'Mat hers'], 'context': 'Trish Mathers EntryLevel Data Scientist Innovative sc...'}, {'error': 'Possible spelling mistake found.', 'suggestion': ['Entry Level'], 'context': 'Trish Mathers EntryLevel Data Scientist Innovative scientificall...'}, {'error': 'Possible spelling mistake found.', 'suggestion': ['team oriented'], 'context': '...ience internship experience bring table teamoriented attitude Im eager contribute abilities ...'}, {'error': 'Possible spelling mistake found.', 'suggestion': ["I'm"], 'context': '...ience bring table teamoriented attitude Im eager contribute abilities quantitative...'}, {'error': 'Possible spelling mistake found.', 'suggestion': ['Ni antic'], 'context': '...90 Bellevue WA LinkedIn WORK EXPERIENCE Niantic Data Scientist Intern Seattle WA April ...'}, {'error': 'T

## 7. Generating Highlights comparing with Keywords or Job Description

In [34]:
def highlights(resume,job_description):
  base_prompt = "Highlight areas where the resume matches or does not match the job description. Where to improve the resume? This is the Resume: "
  base_prompt += clean(resume)
  base_prompt += "And this is the job description:"
  base_prompt += job_description
  dialogue_template = [
      {"role": "user",
      "content": base_prompt}
  ]

  # Apply the chat template
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                        tokenize=False,
                                        add_generation_prompt=True)

  input_ids = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = llm_model.generate(**input_ids,
                            temperature=0.5, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                            do_sample=True,
                            max_new_tokens=512) # how many new tokens to generate from prompt

  # Turn the output tokens into text
  output_text = tokenizer.decode(outputs[0])
  output_text = output_text.replace(prompt, '')
  return output_text


def highlights_keywords(resume,keywords):
  base_prompt = "Highlight areas where the resume matches or does not match the keywords extracted from job description. Where to improve the resume? This is the Resume: "
  base_prompt += clean(resume)
  base_prompt += "And this is the keywords from job description: "
  base_prompt += keywords
  dialogue_template = [
      {"role": "user",
      "content": base_prompt}
  ]

  # Apply the chat template
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                        tokenize=False,
                                        add_generation_prompt=True)

  input_ids = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = llm_model.generate(**input_ids,
                            temperature=0.5, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                            do_sample=True,
                            max_new_tokens=512) # how many new tokens to generate from prompt

  # Turn the output tokens into text
  output_text = tokenizer.decode(outputs[0])
  output_text = output_text.replace(prompt, '')
  return output_text

In [43]:
highlights(top_resumes[1]['text'],description)

'<bos>## Areas where the resume matches the job description:\n\n* Strong background in data science and machine learning\n* Programming languages: Python, R, SQL, Julia\n* Data science frameworks: scikit-learn, TensorFlow, PyTorch\n* Data visualization tools: Tableau, Power BI, D3.js\n* Cloud-based data platforms: AWS, GCP, Azure\n\n## Areas where the resume does not match the job description:\n\n* **Missing technical skills:** The job description emphasizes programming languages and frameworks, which are not explicitly mentioned in the resume.\n* **No mention of data analysis skills:** While the resume mentions data analysis, it lacks specific examples or skills related to data analysis techniques.\n* **No mention of problem-solving skills:** The job description emphasizes the ability to solve complex business problems, but the resume does not provide specific examples or quantifiable achievements in this area.\n* **Lack of specific projects:** The resume mentions a few projects, but 

## 8. Flask App

In [46]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [44]:
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(5000)"))

https://rj6yqhhlfwc-496ff2e9c6d22116-5000-colab.googleusercontent.com/


Before running the Flask Application, make sure all the functions code have been executed.

In [47]:
from flask import Flask, request, render_template
from flask import send_from_directory
import os
import shutil


app = Flask(__name__,template_folder='/content/drive/MyDrive/templates')

@app.route('/')
def home():
  return render_template('index.html')

@app.route('/upload', methods=['POST'])
def upload_resume():
  resume_files = request.files.getlist('resumes')
  job_description = request.form['job_description']
  keywords = request.form.get('keywords')
  top_n = request.form.get('top_n')


  if not resume_files or (not job_description and not keywords) or not top_n:
    flash('Please upload at least one resume, provide a job description or keywords, and specify the number of top matched resumes.')
    return redirect(url_for('home'))
  top_n = int(top_n)
  if top_n < 1:
      flash('The number of top matched resumes must be at least 1')
      return redirect(url_for('home'))

  resume_directory = '/content/resumes'
  if os.path.exists(resume_directory):
    shutil.rmtree(resume_directory)
  os.makedirs(resume_directory)
  print("Directory created successfully")


  for resume_file in resume_files:
    resume_path = os.path.join(resume_directory, resume_file.filename)
    resume_file.save(resume_path)  # Save the resume file
    print(f"Saved resume: {resume_path}")

  resumes = extract_text_from_pdf(resume_directory)
  print(f"Extracted {len(resumes)} resumes from PDF files.")

  if keywords:
    keywords_list = keywords.split(',')
  else:
    keywords_list = extract_keywords_model(job_description)
    print(f"Extracted {len(keywords_list)} keywords from job description.")



  top_n = min(top_n, len(resumes))
  matched_resumes = match_token_resumes_to_keywords(resumes, keywords_list, top_n)

  results = []
  for resume in matched_resumes:
    output_text = summarize_resume(resume)
    print(f"Summary generated for {resume['filename']}")
    output_text = format_summary(output_text)

    if keywords:
      highlight = highlights_keywords(resume['text'],keywords)
      print(f"Highlights generated for {resume['filename']}")
      highlight = format_summary(highlight)
    else:
      highlight = highlights(resume['text'],job_description)
      print(f"Highlights generated for {resume['filename']}")
      highlight = format_summary(highlight)

    feedback = analyze_resume(resume['text'])
    results.append({
        "filename": resume['filename'],
        "summary": output_text,
        "highlights": highlight,
        "feedback": feedback
    })
  return render_template('result.html', results=results)
# Existing imports and app initialization

# Add this new route to serve the files
@app.route('/download/<filename>')
def download_file(filename):
  # Replace with the directory where resumes are saved/uploaded
  resume_directory = '/content/resumes'
  return send_from_directory(resume_directory, filename)



if __name__ == '__main__':
  app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [19/Jul/2024 05:13:24] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [19/Jul/2024 05:13:25] "[33mGET /styles.css HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [19/Jul/2024 05:13:26] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


Directory created successfully
Saved resume: /content/resumes/dummy4.pdf
Saved resume: /content/resumes/dummy3.pdf
Saved resume: /content/resumes/dummy2.pdf
Saved resume: /content/resumes/dummy1.pdf


1it [00:00, 90.61it/s]
1it [00:00, 142.72it/s]
1it [00:00, 106.01it/s]
1it [00:00, 124.45it/s]

Extracted 4 resumes from PDF files.





Extracted 29 keywords from job description.


100%|██████████| 4/4 [00:11<00:00,  2.93s/it]


Summary generated for dummy3.pdf
Highlights generated for dummy3.pdf
Summary generated for dummy1.pdf
Highlights generated for dummy1.pdf


INFO:werkzeug:127.0.0.1 - - [19/Jul/2024 05:15:19] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [19/Jul/2024 05:15:20] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
