In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [None]:
# Source of the job-role / skills table, read via the hf:// filesystem path.
DATASET_URL = (
    "hf://datasets/NxtGenIntern/IT_Job_Roles_Skills_Certifications_Dataset/"
    "Top_207_IT_Job_Roles_Skills_Database.json"
)
df = pd.read_json(DATASET_URL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def remove_programming(skills):
    """Normalise a skills value and drop entries that mention "programming".

    A list comes back as a list and a comma-separated string comes back as a
    ", "-joined string; every kept entry is lower-cased and stripped of
    surrounding whitespace. Any other value (e.g. NaN) is returned unchanged.
    """
    if isinstance(skills, str):
        kept = []
        for piece in skills.split(','):
            cleaned = piece.lower().strip()
            if "programming" not in cleaned:
                kept.append(cleaned)
        return ', '.join(kept)

    if isinstance(skills, list):
        lowered = (entry.lower() for entry in skills)
        return [entry.strip() for entry in lowered if "programming" not in entry]

    # Neither a list nor a string: pass it through untouched.
    return skills

In [None]:
# Overwrite the Skills column with its normalised form: entries are lower-cased,
# stripped, and anything mentioning "programming" is dropped (see remove_programming).
df['Skills'] = df['Skills'].apply(remove_programming)

In [None]:
# Quick sanity check: first rows, then the column index, one after the other.
print(df.head(), df.columns, sep="\n")

                                           Job Title  \
0                                     Admin Big Data   
1                        Ansible Operations Engineer   
2                          Artifactory Administrator   
3  Artificial intelligence / Machine Learning Eng...   
4  Artificial Intelligence / Machine Learning Leader   

                                     Job Description  \
0  Responsible for managing and overseeing big da...   
1  Focuses on automating IT processes using Ansib...   
2  Manages the Artifactory repository for build a...   
3                           No description available   
4  Leads AI/ML projects and teams, defining strat...   

                                              Skills  \
0  Hadoop, Spark, MapReduce, Data Lakes, Data War...   
1  Ansible, Linux, Automation, Cloud Platforms, C...   
2  Artifactory, CI/CD, Jenkins, Docker, Maven, Gr...   
3                                                      
4  AI Strategy, Machine Learning, Team Managem

In [None]:
# Sentence-embedding model used by the role clustering and skill-similarity code below.
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Embed each role's skill text and group the roles into 8 clusters.
# The whole column is encoded in a single call so the model can batch the
# inputs instead of being invoked once per row.
# NOTE(review): assumes every Skills entry is a string (as in the printed head) - confirm.
role_embeddings = np.asarray(model.encode(df['Skills'].tolist()))

# KMeans starts from random centroids; pinning random_state keeps the cluster
# ids stable across reruns, and n_init is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version's default.
role_clusters = KMeans(n_clusters=8, random_state=42, n_init=10).fit_predict(role_embeddings)
df['role_cluster'] = role_clusters

# Skill -> role-tag mapping.
# Keys are lower-case skill names; each value is the list of role/domain tags
# the skill is associated with. get_complementary_skills() compares these tag
# lists to decide how much two skills overlap.
skill_role_map = {
    # Programming languages
    'python': ['data', 'backend', 'automation', 'machine learning'],
    'javascript': ['web', 'frontend', 'fullstack', 'mobile'],
    'java': ['backend', 'mobile', 'enterprise'],
    'c#': ['backend', 'game development', 'enterprise'],
    'go': ['backend', 'cloud', 'devops'],
    'rust': ['systems', 'backend', 'embedded'],
    'kotlin': ['mobile', 'backend'],
    'swift': ['mobile', 'ios'],

    # Web technologies
    'react': ['web', 'frontend', 'mobile'],
    'angular': ['web', 'frontend', 'enterprise'],
    'vue': ['web', 'frontend'],
    'node.js': ['backend', 'fullstack', 'web'],
    'express': ['backend', 'web'],
    'django': ['backend', 'web', 'fullstack'],
    'flask': ['backend', 'web', 'microservices'],
    'spring': ['backend', 'enterprise'],

    # Mobile development
    'react native': ['mobile', 'cross-platform'],
    'flutter': ['mobile', 'cross-platform'],
    'android': ['mobile', 'android'],
    'ios': ['mobile', 'ios'],

    # Data & AI
    'sql': ['data', 'backend', 'analytics'],
    'nosql': ['data', 'backend', 'big data'],
    'pandas': ['data', 'analytics', 'machine learning'],
    'numpy': ['data', 'machine learning', 'scientific computing'],
    'tensorflow': ['machine learning', 'deep learning'],
    'pytorch': ['machine learning', 'deep learning'],
    'spark': ['big data', 'data engineering'],
    'hadoop': ['big data', 'data engineering'],

    # DevOps & cloud
    'docker': ['devops', 'cloud', 'backend'],
    'kubernetes': ['devops', 'cloud', 'scalability'],
    'aws': ['cloud', 'devops', 'backend'],
    'azure': ['cloud', 'devops', 'enterprise'],
    'gcp': ['cloud', 'devops', 'machine learning'],
    'terraform': ['devops', 'cloud', 'infrastructure'],
    'ansible': ['devops', 'automation'],
    'jenkins': ['devops', 'ci/cd'],

    # Other (APIs, systems, version control, testing)
    'graphql': ['backend', 'api', 'web'],
    'rest': ['backend', 'api', 'web'],
    'linux': ['systems', 'devops', 'backend'],
    'bash': ['systems', 'devops', 'automation'],
    'git': ['version control', 'collaboration'],
    'selenium': ['testing', 'automation'],
    'cypress': ['testing', 'frontend'],

    # Design & UX
    'figma': ['design', 'ux/ui', 'frontend'],
    'sketch': ['design', 'ux/ui'],
    'adobe xd': ['design', 'ux/ui'],

    # Game development
    'unity': ['game development', 'ar/vr'],
    'unreal engine': ['game development', 'ar/vr'],

    # Embedded & IoT (disabled: these entries are commented out and not part of the map)
    # 'arduino': ['iot', 'embedded', 'hardware'],
    # 'raspberry pi': ['iot', 'embedded', 'prototyping'],

    # Blockchain (disabled: these entries are commented out and not part of the map)
    # 'solidity': ['blockchain', 'smart contracts'],
    # 'web3': ['blockchain', 'frontend'],
}

In [None]:
# Dump the whole mapping for a visual check; the dict prints as one long line.
print(skill_role_map)

{'python': ['data', 'backend', 'automation', 'machine learning'], 'javascript': ['web', 'frontend', 'fullstack', 'mobile'], 'java': ['backend', 'mobile', 'enterprise'], 'c#': ['backend', 'game development', 'enterprise'], 'go': ['backend', 'cloud', 'devops'], 'rust': ['systems', 'backend', 'embedded'], 'kotlin': ['mobile', 'backend'], 'swift': ['mobile', 'ios'], 'react': ['web', 'frontend', 'mobile'], 'angular': ['web', 'frontend', 'enterprise'], 'vue': ['web', 'frontend'], 'node.js': ['backend', 'fullstack', 'web'], 'express': ['backend', 'web'], 'django': ['backend', 'web', 'fullstack'], 'flask': ['backend', 'web', 'microservices'], 'spring': ['backend', 'enterprise'], 'react native': ['mobile', 'cross-platform'], 'flutter': ['mobile', 'cross-platform'], 'android': ['mobile', 'android'], 'ios': ['mobile', 'ios'], 'sql': ['data', 'backend', 'analytics'], 'nosql': ['data', 'backend', 'big data'], 'pandas': ['data', 'analytics', 'machine learning'], 'numpy': ['data', 'machine learning',

In [None]:
def get_complementary_skills(target_skill, top_n=10, cluster_weight=0.8):
    """Rank the skills in ``skill_role_map`` as complements to ``target_skill``.

    Every other skill in the map is scored as::

        (1 - cluster_weight) * similarity + cluster_weight * cluster_score

    where ``similarity`` is the cosine similarity between the model's
    embeddings of the two skill names, and ``cluster_score`` is the fraction
    of the target's role tags that the candidate does NOT share (1.0 when
    nothing is shared, or when the target has no tags in the map).

    Args:
        target_skill: Skill name to find complements for. It is lower-cased
            and stripped before lookup because the map's keys are lower-case.
        top_n: Maximum number of skills to return.
        cluster_weight: Weight of the role-tag diversity term; the embedding
            similarity term gets ``1 - cluster_weight``.

    Returns:
        Up to ``top_n`` skill names, highest composite score first. Empty if
        the map contains no skill other than the target.
    """
    # Without normalisation, "Unity" would miss the 'unity' key: the target
    # would get no role tags and would itself be returned as a candidate.
    target_skill = target_skill.lower().strip()
    target_clusters = set(skill_role_map.get(target_skill, []))

    candidate_skills = [skill for skill in skill_role_map if skill != target_skill]
    if not candidate_skills:
        # Nothing to rank; also avoids passing an empty batch to the similarity call.
        return []

    # Semantic similarity between the target name and every candidate name.
    candidate_embeddings = model.encode(candidate_skills)
    target_embedding = model.encode(target_skill)
    similarities = cosine_similarity([target_embedding], candidate_embeddings)[0]

    # max(1, ...) keeps the division defined when the target has no role tags.
    tag_count = max(1, len(target_clusters))
    scored = []
    for skill, similarity in zip(candidate_skills, similarities):
        shared = target_clusters.intersection(skill_role_map[skill])
        cluster_score = 1 - len(shared) / tag_count  # higher = fewer shared role tags
        composite_score = (
            (1 - cluster_weight) * similarity
            + cluster_weight * cluster_score
        )
        scored.append((composite_score, skill))

    # Stable sort: ties keep the map's insertion order.
    scored.sort(key=lambda pair: -pair[0])
    return [skill for _, skill in scored[:top_n]]

In [None]:
# Example query: rank complements for one skill using the default top_n and cluster_weight.
target = "unity"
complements = get_complementary_skills(target)
print(f"Skills complementary to '{target}': {complements}")

Skills complementary to 'unity': ['linux', 'ios', 'android', 'vue', 'azure', 'react', 'swift', 'angular', 'java', 'flutter']


In [None]:
def get_cluster_skills(df):
    """Collect the distinct skills that occur in each role cluster.

    Args:
        df: DataFrame with a ``Skills`` column (per row either a list of
            skill strings or one comma-separated string) and a
            ``role_cluster`` column holding the cluster label.

    Returns:
        dict mapping every cluster label present in ``df`` to an
        alphabetically sorted list of the lower-cased, stripped skill names
        seen in that cluster. Missing values (None/NaN) and blank entries
        contribute no skills, so a cluster can map to an empty list.
    """
    cluster_skills = defaultdict(set)
    for cluster, raw_skills in zip(df['role_cluster'], df['Skills']):
        bucket = cluster_skills[cluster]  # register the cluster even if the row adds nothing

        if isinstance(raw_skills, str):
            raw_skills = raw_skills.split(',')
        elif not isinstance(raw_skills, (list, tuple, set)):
            # None / NaN / other scalars: there is nothing to split.
            continue

        for skill in raw_skills:
            if not isinstance(skill, str):
                continue
            name = skill.lower().strip()
            if name:  # "" comes from empty cells and trailing commas
                bucket.add(name)

    return {cluster: sorted(skills) for cluster, skills in cluster_skills.items()}

# Build the cluster -> sorted skill list lookup once; the next cell prints it.
all_cluster_skills = get_cluster_skills(df)

In [None]:
# One line per role cluster, listing every skill collected for it.
for cluster_id, skills in all_cluster_skills.items():
    print(f"Cluster {cluster_id}: {', '.join(skills)}")

Cluster 3: .net, access control, active directory, ai architecture, ai strategy, ansible, api design, arm templates, automation, aws, aws lambda, azure, big data architecture, budgeting, chef, ci/cd, cloud architecture, cloud computing, cloud networking, cloud platforms, cloud security, cloudformation, communication, compliance, configuration management, consul, data governance, data integration, data lakes, data modeling, data science, data security, data warehousing, deep learning, deployment automation, devops, devsecops, docker, ec2, encryption, etl, firewalls, gcp, git, google cloud, hadoop, hashicorp vault, iam, identity and access management, infrastructure as code, infrastructure as code (iac), innovation, java, jenkins, kafka, kubernetes, leadership, linux, machine learning, mapreduce, mentoring, microservices, model deployment, networking, nosql, octopus deploy, openstack, packer, powershell, project management, puppet, python, rest apis, routing, ruby, s3, scalability, scrip