In [26]:
import spacy
import pandas as pd
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords

In [41]:
# Load your job descriptions
data = """Degree in Computer Science, Computer Engineering, Mathematics, Physics or similar
Good knowledge of at least one of the following languages: Python, C#, Java
6/12 months of experience as a SW Developer in a multinational and modernly organised company or in consultancy activities is welcome
Good knowledge of SQL
Excellent knowledge of the Italian language and good knowledge of the English language
Interest in learning new technologies and development methodologies.
Strong problem solving orientation, with analytical skills and an aptitude for effective communication.
Curiosity and proactivity complete the profile
Knowledge of Microsoft Azure Ecosystem is appreciated
Knowledge of REST and SOAP APIs is appreciated
Knowledge of code communication (message broker) is appreciated
Knowledge of microservices architecture principles is appreciated
Knowledge of Salesforce and APEX and Visualforce languages ​​is appreciated"""

df = pd.DataFrame({'job_description': [data]})

In [42]:
def preprocess(text):
    # Remove special characters and lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

df['cleaned'] = df['job_description'].apply(preprocess)


In [43]:
rake = Rake()

def extract_keywords_rake(text):
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

df['rake_keywords'] = df['cleaned'].apply(extract_keywords_rake)


In [44]:
df

Unnamed: 0,job_description,cleaned,rake_keywords
0,"Degree in Computer Science, Computer Engineeri...",degree in computer science computer engineerin...,"[following languages python c java 612 months,..."


In [45]:
vectorizer = TfidfVectorizer(max_features=10)
tfidf_matrix = vectorizer.fit_transform(df['cleaned'])
tfidf_keywords = vectorizer.get_feature_names_out()

df['tfidf_keywords'] = [tfidf_keywords for _ in range(len(df))]


In [46]:
nlp = spacy.load("en_core_web_sm")

def extract_spacy_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df['spacy_entities'] = df['job_description'].apply(extract_spacy_entities)


In [47]:
df

Unnamed: 0,job_description,cleaned,rake_keywords,tfidf_keywords,spacy_entities
0,"Degree in Computer Science, Computer Engineeri...",degree in computer science computer engineerin...,"[following languages python c java 612 months,...","[and, appreciated, good, in, is, knowledge, la...","[(Computer Science, Computer Engineering, ORG)..."


In [48]:
# Install PyTorch
%pip install torch
%pip install transformers

import torch
from transformers import pipeline

# Load a pre-trained pipeline for token classification (NER)
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

def extract_transformer_entities(text):
    entities = ner_pipeline(text)
    return [(entity['word'], entity['entity']) for entity in entities]

df['transformer_entities'] = df['job_description'].apply(extract_transformer_entities)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [49]:
def combine_keywords(row):
    combined = set(row['rake_keywords']).union(set(row['tfidf_keywords']))
    spacy_words = [entity[0] for entity in row['spacy_entities']]
    return combined.union(spacy_words)

df['final_keywords'] = df.apply(combine_keywords, axis=1)


In [50]:
df['final_keywords'][0]

{'APEX',
 'Computer Science, Computer Engineering',
 'English',
 'Italian',
 'Java\n6/12 months',
 'Knowledge of Salesforce',
 'Mathematics, Physics',
 'Microsoft Azure Ecosystem',
 'SQL\nExcellent',
 'Visualforce',
 'analytical skills',
 'and',
 'apex',
 'appreciated',
 'appreciated knowledge',
 'aptitude',
 'at least one',
 'code communication message broker',
 'computer science computer engineering mathematics physics',
 'consultancy activities',
 'degree',
 'development methodologies strong problem solving orientation',
 'effective communication curiosity',
 'english language interest',
 'experience',
 'following languages python c java 612 months',
 'good',
 'good knowledge',
 'in',
 'is',
 'italian language',
 'knowledge',
 'languages',
 'learning new technologies',
 'least one',
 'microservices architecture principles',
 'microsoft azure ecosystem',
 'modernly organised company',
 'multinational',
 'of',
 'or',
 'proactivity complete',
 'profile knowledge',
 'rest',
 'salesforce

In [52]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
def remove_stopwords(keywords):
    return [word for word in keywords if word.lower() not in stop_words]

cleaned_keywords = remove_stopwords(df['final_keywords'][0])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BlackLine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BlackLine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\BlackLine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [53]:
cleaned_keywords

['development methodologies strong problem solving orientation',
 'appreciated knowledge',
 'proactivity complete',
 'Knowledge of Salesforce',
 'soap apis',
 'consultancy activities',
 'languages',
 'microservices architecture principles',
 'Mathematics, Physics',
 'Java\n6/12 months',
 'good knowledge',
 'sql excellent knowledge',
 'analytical skills',
 'appreciated',
 'SQL\nExcellent',
 'APEX',
 'least one',
 'following languages python c java 612 months',
 'experience',
 'learning new technologies',
 'Italian',
 'similar good knowledge',
 'visualforce languages',
 'profile knowledge',
 'degree',
 'aptitude',
 'good',
 'english language interest',
 'modernly organised company',
 'apex',
 'at least one',
 'microsoft azure ecosystem',
 'knowledge',
 'English',
 'code communication message broker',
 'italian language',
 'effective communication curiosity',
 'multinational',
 'Visualforce',
 'Computer Science, Computer Engineering',
 'welcome good knowledge',
 'sw developer',
 'salesfor