In [15]:
import os
import re

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Populate the process environment from a local .env file, then read the
# Hugging Face API key out of it.
load_dotenv()

HF_TOKEN = os.environ.get("HUGGINGFACE_API_KEY")
# Report only whether a token was found; never print the token itself.
print("Token loaded?", HF_TOKEN is not None)

Token loaded? True


In [17]:
# Skill vocabulary that resumes and job postings are matched against.
# Order matters: it fixes the row order of the skill embedding matrix.
SKILLS = [
    "Python",
    "SQL",
    "Excel",
    "Tableau",
    "Pandas",
    "Power BI",
    "R",
    "AWS",
    "Git",
    "Spark",
    "Docker",
    "Linux",
    "REST APIs",
    "NumPy",
    "Scikit-learn",
]

In [18]:
# Embedding model requested from Hugging Face, and the client used to call it.
HF_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
HF_TOKEN = os.environ.get("HUGGINGFACE_API_KEY")
client = InferenceClient(token=HF_TOKEN)

def embed_texts(texts):
    """Embed one or more texts and return L2-normalised row vectors.

    Parameters
    ----------
    texts : str or sequence of str
        Text(s) to embed. A bare string is treated as a one-item batch.

    Returns
    -------
    numpy.ndarray
        2-D float32 array, expected to hold one row per input text. Every
        non-zero row is scaled to unit L2 norm; all-zero rows stay zero.
    """
    # A bare string may come back from the API as a single 1-D vector, which
    # would break the per-row (axis=1) norm below, so batch it explicitly.
    if isinstance(texts, str):
        texts = [texts]
    vecs = client.feature_extraction(texts, model=HF_MODEL)
    # atleast_2d is a safety net in case a single-item batch is still 1-D.
    vecs = np.atleast_2d(np.asarray(vecs, dtype="float32"))
    # Normalise each row to unit length (L2 norm).
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    # Avoid dividing by zero for all-zero rows.
    norms[norms == 0.0] = 1.0
    return vecs / norms

# Embed the skill vocabulary once; detect_skills compares text chunks against these rows.
skill_emb = embed_texts(SKILLS)

In [19]:
# Sanity check: the embedding matrix should have one row per skill.
print("Number of skills:", len(SKILLS))
print("Embedding shape:", skill_emb.shape)

Number of skills: 15
Embedding shape: (15, 384)


In [35]:
def detect_skills(text, threshold=0.4):
    """Return the skills from SKILLS that some sentence of `text` is close to.

    The text is split into sentence-like chunks, every chunk is embedded, and
    a skill counts as present when its cosine similarity to at least one
    chunk reaches `threshold`.

    Parameters
    ----------
    text : str
        Free text such as a resume or a job description.
    threshold : float, default 0.4
        Minimum cosine similarity for a match. A lower value lets more skills
        through; a higher value is stricter. TODO: expose as a slider in the UI.

    Returns
    -------
    set of str
        The detected skills; empty when `text` contains no non-blank chunk.
    """
    # Split on line breaks and on sentence-ending punctuation that is followed
    # by whitespace or the end of the text. A period inside a token
    # ("Node.js", "3.10") therefore no longer cuts a chunk in half.
    pieces = re.split(r"[.!?]+(?:\s+|$)|\n+", text)
    chunks = [piece.strip() for piece in pieces if piece.strip()]
    if not chunks:
        return set()
    chunk_emb = embed_texts(chunks)
    sims = cosine_similarity(chunk_emb, skill_emb)  # (n_chunks x n_skills)
    best_per_skill = sims.max(axis=0)  # strongest chunk match for each skill
    return {
        skill
        for skill, score in zip(SKILLS, best_per_skill)
        if score >= threshold
    }

In [40]:
# test_text = """
# I like programming in python
# """

# print("Detected skills:", detect_skills(test_text))

In [37]:
# Inspect the raw similarity of one sample sentence against every skill.
# The sample is defined in this cell so it also runs on a fresh kernel
# (Restart & Run All) instead of depending on leftover kernel state.
test_text = """
I like programming in python
"""

sims = cosine_similarity(embed_texts([test_text]), skill_emb)
for skill, score in zip(SKILLS, sims[0]):
    print(skill, score)

Python 0.78888834
SQL 0.25963736
Excel 0.26993227
Tableau 0.13012983
Pandas 0.36518708
Power BI 0.117324114
R 0.2123638
AWS 0.17676231
Git 0.22005652
Spark 0.14438607
Docker 0.16434258
Linux 0.33440065
REST APIs 0.15498613
NumPy 0.41810226
Scikit-learn 0.38536593


In [41]:
# Sample resume used to exercise the skill detector.
resume_text = """
I have experience in Python, Pandas, and NumPy for data analysis.
I also create reports in Excel and Tableau. Familiar with Git.
"""

# Sample job postings; each entry has a "title" and a "description".
job_postings = [
    {
        "title": "Data Analyst",
        "description": "Analyze data using Python, SQL, Excel. Build dashboards with Tableau.",
    },
    {
        "title": "ML Engineer",
        "description": "Work with Scikit-learn, Spark, and AWS to deploy models. Knowledge of Git required.",
    },
    {
        "title": "BI Analyst",
        "description": "Create dashboards in Power BI and Tableau, extract data using SQL.",
    },
]

In [42]:
def compute_demand(posts):
    """Count, for every skill in SKILLS, how many postings mention it.

    Parameters
    ----------
    posts : iterable of dict
        Job postings. The "title" and "description" entries are used when
        present; a missing or empty field contributes no text.

    Returns
    -------
    dict
        Maps each skill in SKILLS to the number of postings in which
        detect_skills found it (0 when it was never found).
    """
    counts = dict.fromkeys(SKILLS, 0)
    for post in posts:
        # Join with a newline so the title becomes its own chunk in
        # detect_skills rather than being glued onto the first sentence of
        # the description, which would blur that sentence's embedding.
        text = "\n".join(
            part for part in (post.get("title"), post.get("description")) if part
        )
        for skill in detect_skills(text):
            counts[skill] += 1
    return counts

# Skill demand across the sample postings, and the skills found in the sample resume.
demand_counts = compute_demand(job_postings)
resume_skills = detect_skills(resume_text)

# resume_skills is a set, so the order of its printed items is not fixed.
print("Resume skills:", resume_skills)
print("Job demand counts:", demand_counts)

Resume skills: {'Python', 'Git', 'Pandas', 'Excel', 'Tableau', 'NumPy'}
Job demand counts: {'Python': 0, 'SQL': 0, 'Excel': 1, 'Tableau': 2, 'Pandas': 0, 'Power BI': 1, 'R': 0, 'AWS': 1, 'Git': 1, 'Spark': 0, 'Docker': 0, 'Linux': 0, 'REST APIs': 0, 'NumPy': 0, 'Scikit-learn': 1}


In [43]:
# --- Cell 8: Gap analysis table ---
def summarize(demand_counts, resume_skills, total_posts):
    """Build the per-skill gap table, most in-demand skills first.

    Parameters
    ----------
    demand_counts : dict
        Skill -> number of postings mentioning it; absent skills count as 0.
    resume_skills : container of str
        Skills detected in the resume (only membership tests are used).
    total_posts : int
        Number of postings the counts were computed over.

    Returns
    -------
    pandas.DataFrame
        One row per skill in SKILLS with the columns "skill", "demand_count",
        "demand_pct" (rounded to one decimal) and "in_resume", sorted by
        "demand_count" in descending order.
    """
    # Treat zero postings as one so the percentage never divides by zero.
    denominator = max(total_posts, 1)

    def _row(skill):
        count = demand_counts.get(skill, 0)
        return {
            "skill": skill,
            "demand_count": count,
            "demand_pct": round(100 * count / denominator, 1),
            "in_resume": skill in resume_skills,
        }

    table = pd.DataFrame([_row(skill) for skill in SKILLS])
    return table.sort_values(["demand_count"], ascending=False)

# Build the gap table for the sample data; the bare `df` on the last line renders it as the cell output.
df = summarize(demand_counts, resume_skills, len(job_postings))
df

Unnamed: 0,skill,demand_count,demand_pct,in_resume
3,Tableau,2,66.7,True
2,Excel,1,33.3,True
5,Power BI,1,33.3,False
7,AWS,1,33.3,False
8,Git,1,33.3,True
14,Scikit-learn,1,33.3,False
0,Python,0,0.0,True
1,SQL,0,0.0,False
4,Pandas,0,0.0,True
6,R,0,0.0,False


In [44]:
# --- Cell 9: Show top missing skills ---
# Skills at least one posting asks for that were not detected in the resume.
missing = df.loc[~df["in_resume"] & df["demand_count"].gt(0)]
# Show the five most requested gaps.
missing.sort_values("demand_count", ascending=False).head(5)

Unnamed: 0,skill,demand_count,demand_pct,in_resume
5,Power BI,1,33.3,False
7,AWS,1,33.3,False
14,Scikit-learn,1,33.3,False
