In [61]:
import spacy
import pandas as pd
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords

In [62]:
# Load your job descriptions
data = """Degree in Computer Science, Computer Engineering, Mathematics, Physics or similar
Good knowledge of at least one of the following languages: Python, C#, Java
6/12 months of experience as a SW Developer in a multinational and modernly organised company or in consultancy activities is welcome
Good knowledge of SQL
Excellent knowledge of the Italian language and good knowledge of the English language
Interest in learning new technologies and development methodologies.
Strong problem solving orientation, with analytical skills and an aptitude for effective communication.
Curiosity and proactivity complete the profile
Knowledge of Microsoft Azure Ecosystem is appreciated
Knowledge of REST and SOAP APIs is appreciated
Knowledge of code communication (message broker) is appreciated
Knowledge of microservices architecture principles is appreciated
Knowledge of Salesforce and APEX and Visualforce languages ​​is appreciated"""

df = pd.DataFrame({'job_description': [data]})

In [63]:
# Preprocessing
def preprocess(text):
    # Replace newline characters with spaces
    text = text.replace("\n", " ")
    # Remove special characters and lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

df['cleaned'] = df['job_description'].apply(preprocess)

In [64]:
rake = Rake()

def extract_keywords_rake(text):
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

df['rake_keywords'] = df['cleaned'].apply(extract_keywords_rake)


In [65]:
vectorizer = TfidfVectorizer(max_features=10)
tfidf_matrix = vectorizer.fit_transform(df['cleaned'])
tfidf_keywords = vectorizer.get_feature_names_out()

df['tfidf_keywords'] = [tfidf_keywords for _ in range(len(df))]


In [66]:
nlp = spacy.load("en_core_web_sm")

def extract_spacy_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df['spacy_entities'] = df['job_description'].apply(extract_spacy_entities)


In [67]:

from transformers import pipeline

# Load a pre-trained pipeline for token classification (NER)
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

def extract_transformer_entities(text):
    entities = ner_pipeline(text)
    return [(entity['word'], entity['entity']) for entity in entities]

df['transformer_entities'] = df['job_description'].apply(extract_transformer_entities)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [68]:
def combine_keywords(row):
    combined = set(row['rake_keywords']).union(set(row['tfidf_keywords']))
    spacy_words = [entity[0] for entity in row['spacy_entities']]
    return combined.union(spacy_words)

df['final_keywords'] = df.apply(combine_keywords, axis=1)


In [69]:
df['final_keywords'][0]

{'APEX',
 'Computer Science, Computer Engineering',
 'English',
 'Italian',
 'Java\n6/12 months',
 'Knowledge of Salesforce',
 'Mathematics, Physics',
 'Microsoft Azure Ecosystem',
 'SQL\nExcellent',
 'Visualforce',
 'analytical skills',
 'and',
 'apex',
 'appreciated',
 'appreciated knowledge',
 'aptitude',
 'at least one',
 'code communication message broker',
 'computer science computer engineering mathematics physics',
 'consultancy activities',
 'degree',
 'development methodologies strong problem solving orientation',
 'effective communication curiosity',
 'english language interest',
 'experience',
 'following languages python c java 612 months',
 'good',
 'good knowledge',
 'in',
 'is',
 'italian language',
 'knowledge',
 'languages',
 'learning new technologies',
 'least one',
 'microservices architecture principles',
 'microsoft azure ecosystem',
 'modernly organised company',
 'multinational',
 'of',
 'or',
 'proactivity complete',
 'profile knowledge',
 'rest',
 'salesforce

In [71]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def refine_keywords(keywords):
    refined = []
    for keyword in keywords:
        # Remove newline remnants and split phrases into meaningful chunks
        keyword = re.sub(r'\s+', ' ', keyword.strip())
        words = keyword.split()
        meaningful_words = [word for word in words if word.lower() not in stop_words]
        refined.append(' '.join(meaningful_words))
    return list(set(refined))

df['refined_keywords'] = df['final_keywords'].apply(refine_keywords)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BlackLine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [74]:
cleaned_keywords = df['refined_keywords'][0]

In [75]:
skills_list = [
    # Programming Languages
    "javascript", "typescript", "python", "java", "c++", "c#", "ruby", "php", "swift", 
    "kotlin", "go", "r", "sql", "bash", "html", "css", "sass", "less",

    # Front-End Frameworks and Libraries
    "react", "vue", "angular", "svelte", "jquery", "bootstrap", "tailwindcss", "foundation",
    "material-ui", "semantic-ui", "next.js", "nuxt.js", "three.js", "d3.js",

    # Back-End Frameworks and Libraries
    "node.js", "express", "django", "flask", "ruby on rails", "spring", "dotnet", "laravel",
    "fastapi", "koa", "graphql", "nestjs", "phoenix", "gin",

    # State Management
    "redux", "mobx", "vuex", "recoil", "zustand", "pinia",

    # Testing Frameworks
    "jest", "mocha", "chai", "jasmine", "cypress", "puppeteer", "playwright", "karma", 
    "enzyme", "pytest", "unittest", "selenium",

    # Version Control and Collaboration
    "git", "github", "gitlab", "bitbucket", "svn", "mercurial",

    # Build Tools and Package Managers
    "webpack", "parcel", "rollup", "gulp", "grunt", "vite", "npm", "yarn", "pnpm",

    # Cloud and DevOps Tools
    "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "circleci", "travisci",
    "ansible", "terraform", "chef", "puppet", "vagrant",

    # Databases
    "mysql", "postgresql", "mongodb", "redis", "sqlite", "cassandra", "firebase",
    "elasticsearch", "dynamodb", "couchdb",

    # APIs and Protocols
    "rest", "graphql", "grpc", "websockets", "oauth", "jwt", "openapi", "json", "xml",

    # UI/UX Design Tools
    "figma", "adobe xd", "sketch", "invision", "zeplin", "balsamiq", "axure",

    # Mobile Development
    "react native", "flutter", "ionic", "swift", "kotlin", "objective-c", "xamarin",

    # Game Development
    "unity", "unreal engine", "godot", "cocos2d", "phaser", "panda3d",

    # Content Management Systems
    "wordpress", "drupal", "joomla", "shopify", "magento",

    # Other Tools and Platforms
    "eslint", "prettier", "babel", "postman", "swagger", "jira", "asana", "trello", 
    "notion", "monday.com", "visual studio code", "intellij idea", "pycharm", "eclipse", 
    "atom", "sublime text",

    # Machine Learning and AI (Optional for Software Devs)
    "tensorflow", "keras", "pytorch", "scikit-learn", "pandas", "numpy", "opencv", "nltk",

    # General Development Concepts
    "agile", "scrum", "kanban", "tdd", "bdd", "design patterns", "microservices",
    "monorepo", "modular architecture", "performance optimization", "web accessibility",
    "responsive design", "seo", "cross-browser compatibility",

    # Web Servers
    "nginx", "apache", "iis", "caddy",

    # Security Skills
    "ssl", "tls", "https", "encryption", "firewalls", "penetration testing", "sso",
    "sast", "dast", "owasp",

    # Observability and Monitoring
    "prometheus", "grafana", "splunk", "new relic", "datadog", "elastic stack", 
    "logstash", "kibana",

    # Collaboration and Communication Tools
    "slack", "microsoft teams", "zoom", "google meet", "discord"
]


In [76]:
# Clean and filter
def filter_keywords(keywords):
    # Remove duplicates and stop words
    keywords = {word.lower() for word in keywords if word.isalpha()}
    stop_words = set(stopwords.words('english'))
    keywords = keywords - stop_words
    # Match with predefined skills
    return keywords.intersection(skills_list)

filtered_keywords = filter_keywords(cleaned_keywords)

In [77]:
filtered_keywords

{'rest'}