In [8]:
import spacy
import pandas as pd
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BlackLine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Load your job descriptions
data = [
    "We are looking for a Data Scientist with expertise in Python, machine learning, and SQL.",
    "Frontend developer required with knowledge of React, JavaScript, HTML, and CSS.",
    "Experience with AWS cloud technologies, Docker, and Kubernetes is a must."
]

df = pd.DataFrame({'job_description': data})

In [11]:
def preprocess(text):
    # Remove special characters and lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

df['cleaned'] = df['job_description'].apply(preprocess)


In [12]:
nltk.download('punkt')
nltk.download('punkt_tab')

rake = Rake()

def extract_keywords_rake(text):
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

df['rake_keywords'] = df['cleaned'].apply(extract_keywords_rake)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BlackLine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\BlackLine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
vectorizer = TfidfVectorizer(max_features=10)
tfidf_matrix = vectorizer.fit_transform(df['cleaned'])
tfidf_keywords = vectorizer.get_feature_names_out()

df['tfidf_keywords'] = [tfidf_keywords for _ in range(len(df))]


In [14]:
nlp = spacy.load("en_core_web_sm")

def extract_spacy_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df['spacy_entities'] = df['job_description'].apply(extract_spacy_entities)


In [15]:
df

Unnamed: 0,job_description,cleaned,rake_keywords,tfidf_keywords,spacy_entities
0,We are looking for a Data Scientist with exper...,we are looking for a data scientist with exper...,"[python machine learning, data scientist, sql,...","[and, are, aws, cloud, css, data, developer, e...","[(Data Scientist, ORG), (Python, GPE), (SQL, O..."
1,Frontend developer required with knowledge of ...,frontend developer required with knowledge of ...,"[react javascript html, frontend developer req...","[and, are, aws, cloud, css, data, developer, e...","[(React, GPE), (JavaScript, ORG), (HTML, ORG),..."
2,"Experience with AWS cloud technologies, Docker...",experience with aws cloud technologies docker ...,"[aws cloud technologies docker, must, kubernet...","[and, are, aws, cloud, css, data, developer, e...","[(AWS, ORG), (Docker, PERSON), (Kubernetes, ORG)]"


In [16]:
# Install PyTorch
%pip install torch
%pip install transformers

import torch
from transformers import pipeline

# Load a pre-trained pipeline for token classification (NER)
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

def extract_transformer_entities(text):
    entities = ner_pipeline(text)
    return [(entity['word'], entity['entity']) for entity in entities]

df['transformer_entities'] = df['job_description'].apply(extract_transformer_entities)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [19]:
def combine_keywords(row):
    combined = set(row['rake_keywords']).union(set(row['tfidf_keywords']))
    spacy_words = [entity[0] for entity in row['spacy_entities']]
    return combined.union(spacy_words)

df['final_keywords'] = df.apply(combine_keywords, axis=1)


In [22]:
df['final_keywords'][0]

{'Data Scientist',
 'Python',
 'SQL',
 'and',
 'are',
 'aws',
 'cloud',
 'css',
 'data',
 'data scientist',
 'developer',
 'experience',
 'expertise',
 'looking',
 'python machine learning',
 'sql',
 'with'}