In [1]:
import pandas as pd

df = pd.read_csv("jobs.csv")
# Convert the pipe-delimited "Key Skills" string into a list of skill names.
# Empty CSV cells are read as NaN (a float), which has no .split(); those rows
# get an empty list instead of raising AttributeError. Blank tokens produced by
# doubled or trailing "|" separators are dropped so "" is never counted as a skill.
df["Key Skills List"] = df["Key Skills"].apply(
    lambda x: [skill.strip() for skill in x.split("|") if skill.strip()]
    if isinstance(x, str)
    else []
)
from collections import Counter

def recommend_missing_skills(industry, candidate_skills, top_n=5):
    """Suggest the most frequent skills in an industry that the candidate lacks.

    Args:
        industry: Text matched as a case-insensitive, literal substring of the
            "Industry" column of the module-level ``df``.
        candidate_skills: Iterable of skill names the candidate already has;
            compared case-insensitively against the posting skills.
        top_n: Maximum number of skills to return.

    Returns:
        A list of at most ``top_n`` skill names, ordered from most to least
        frequent among the matching postings. Empty if no posting matches.
    """
    # regex=False: industry names routinely contain regex metacharacters
    # ("IT (Software)", "C++"); as a pattern they would either fail to match
    # their own literal text or raise re.error.
    industry_mask = df["Industry"].str.contains(
        industry, case=False, na=False, regex=False
    )
    industry_df = df[industry_mask]

    # Count every skill occurrence across the postings in this industry.
    skill_counts = Counter(
        skill for skills in industry_df["Key Skills List"] for skill in skills
    )

    # Build the lower-cased lookup once rather than once per counted skill.
    known_skills = {s.lower() for s in candidate_skills}

    # most_common() yields skills by descending frequency; keep the ones the
    # candidate does not already have.
    recommended = [
        skill
        for skill, _count in skill_counts.most_common()
        if skill.lower() not in known_skills
    ]

    return recommended[:top_n]


In [2]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    Returns 0 when the union is empty (both inputs empty), avoiding a
    division by zero.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if not combined:
        return 0
    return len(shared) / len(combined)

def recommend_jobs(candidate_skills, industry, top_n=5):
    """Rank job postings in an industry by skill overlap with the candidate.

    Args:
        candidate_skills: Iterable of skill names the candidate has; compared
            case-insensitively against each posting's skills.
        industry: Text matched as a case-insensitive, literal substring of the
            "Industry" column of the module-level ``df``.
        top_n: Maximum number of postings to return.

    Returns:
        A list of at most ``top_n`` dicts with the keys "Job Title",
        "Key Skills" and "Similarity" (Jaccard similarity between the
        candidate's skills and the posting's skills), highest similarity
        first. Empty if no posting matches the industry.
    """
    candidate_set = {skill.lower() for skill in candidate_skills}

    # regex=False: industry names routinely contain regex metacharacters
    # ("IT (Software)", "C++"); as a pattern they would either fail to match
    # their own literal text or raise re.error.
    industry_mask = df["Industry"].str.contains(
        industry, case=False, na=False, regex=False
    )
    filtered_jobs = df[industry_mask]

    # Calculate similarity for each job.
    similarity = filtered_jobs["Key Skills List"].apply(
        lambda skills: jaccard_similarity(candidate_set, {s.lower() for s in skills})
    )

    # assign() returns a new frame, so the score column is never written onto
    # a slice of df (no SettingWithCopyWarning, df itself stays untouched).
    sorted_jobs = filtered_jobs.assign(Similarity=similarity).sort_values(
        by="Similarity", ascending=False
    )
    return sorted_jobs.head(top_n)[["Job Title", "Key Skills", "Similarity"]].to_dict(
        orient="records"
    )
