# Load and Clean data

In [1]:
import pandas as pd

postings = pd.read_csv('drive/MyDrive/fun_data/linkedin/postings_cleaned_w_skills.csv')
#postings = pd.read_csv('drive/MyDrive/fun_data/linkedin/postings_cleaned.csv')
#postings.drop(columns=['Unnamed: 0'], inplace=True)
postings.head()

Unnamed: 0,job_id,company_name,title,description,pay_period,location,company_id,views,formatted_work_type,applies,...,level_lead,level_associate,level_manager,level_director,level_principal,level_vice_president,level_chief,level_intern,level_junior,extracted_skills
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,HOURLY,"Princeton, NJ",2774458.0,20.0,Full-time,2.0,...,False,False,False,False,False,False,False,False,False,"['Microsoft Office', 'Adobe']"
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",HOURLY,"Fort Collins, CO",,1.0,Full-time,,...,False,False,False,False,False,False,False,False,False,[]
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,YEARLY,"Cincinnati, OH",64896719.0,8.0,Full-time,,...,False,False,True,False,False,False,False,False,False,[]
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,YEARLY,"New Hyde Park, NY",766262.0,16.0,Full-time,,...,False,True,False,False,False,False,False,False,False,[]
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,YEARLY,"Burlington, IA",,3.0,Full-time,,...,False,False,False,False,False,False,False,False,False,[]


In [3]:
postings.columns

Index(['job_id', 'company_name', 'title', 'description', 'pay_period',
       'location', 'company_id', 'views', 'formatted_work_type', 'applies',
       'remote_allowed', 'application_type', 'formatted_experience_level',
       'skills_desc', 'posting_domain', 'sponsored', 'currency',
       'compensation_type', 'zip_code', 'usps_default_state', 'state',
       'usps_default_city', 'metro_area', 'corrected_salary',
       'original_listed_time_ts', 'title_normalized', 'level_senior',
       'level_lead', 'level_associate', 'level_manager', 'level_director',
       'level_principal', 'level_vice_president', 'level_chief',
       'level_intern', 'level_junior', 'extracted_skills'],
      dtype='object')

# Generate embeddings from job description text


In [5]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# texts = list of str
embeddings = model.encode(postings['description'], show_progress_bar=True, convert_to_numpy=True)

Batches:   0%|          | 0/3370 [00:00<?, ?it/s]

In [7]:
#np.save("drive/MyDrive/data science/models/job_description_embeddings.npy", embeddings)

In [3]:
import numpy as np
embeddings = np.load("drive/MyDrive/data science/models/job_description_embeddings.npy")

In [7]:
import spacy
nlp = spacy.load("drive/MyDrive/data science/models/ner_skill_model")

In [14]:
"""
recommender.py
--------------
Implements the hybrid job recommendation logic.
"""

import numpy as np
import pandas as pd
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer



def extract_skills(nlp, text):
    """
    Run the NER model on text and extract entities labeled as 'SKILL'.
    """
    doc = nlp(text)
    skills = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
    return list(set(skills))


def compute_skills_similarity(user_skills: list[str], job_skills: list[list[str]]) -> np.ndarray:
    """
    Compute Jaccard similarity between a user's skills and all jobs.
    """
    mlb = MultiLabelBinarizer()
    all_skills = user_skills + [s for sublist in job_skills for s in sublist]
    mlb.fit([[s] for s in set(all_skills)])

    user_bin = mlb.transform([user_skills])
    jobs_bin = mlb.transform(job_skills)

    return np.array([
        jaccard_score(user_bin[0], jb, average="binary") for jb in jobs_bin
    ])

def compute_description_similarity(user_vec: np.ndarray, job_vecs: np.ndarray) -> np.ndarray:
    """
    Compute cosine similarity between user embedding and job embeddings.
    """
    return util.cos_sim(user_vec, job_vecs)[0].cpu().numpy()


def hybrid_recommendations(df, user_vec, job_vecs, user_skills, weight_cosine=0.7, weight_jaccard=0.3):
    """
    Combine cosine and Jaccard similarities into a final ranking.
    """
    description_similarity = compute_description_similarity(user_vec, job_vecs)
    skills_similarity = compute_skills_similarity(user_skills, df["extracted_skills"].tolist())

    score = weight_cosine * description_similarity + weight_jaccard * skills_similarity
    df["match_score"] = score
    return df.sort_values(by="match_score", ascending=False).head(10)


In [8]:
user_job_skills = """
python, data analysis, pandas, R, bash, linux
"""

In [16]:
user_background = """
Marketing Science Company - Market Research Analyst Intern
April 2019 - March 2020 Pittsburgh, PA
● Received, cleaned, and prepped data from our client using SAS, SQL, and Excel to help data
scientists build marketing mix models that resulted in a lift in ROI of 6 basis points
● Developed a program in SAS that automated refinement of linear regression models for
specific segments of a customer base that saves 25 hours of labor each month
Projects
Fantasy Football Modeling
● Wanted to stop losing at fantasy football so I aggregated and prepped 5 years of NFL
fantasy football projection data from 6 independent sources into a MySQL database
● Built a random forest model in SAS that combined the disparate sources into one
projection that outperformed the mean absolute error of the next best projection by 18%
Movie Recommendation Engine
● Aggregated data from imdb and rotten tomatoes and used k-nearest-neighbors in SAS to
build a better movie recommendation system for my snobby tastes
● I save an average of 18 minutes on movie selection relative to my previous methodology
"""

In [15]:
# if just provide skills list
user_skills_embedding = model.encode([user_job_skills], show_progress_bar=True, convert_to_numpy=True)
user_skills_lst = [s.strip() for s in user_job_skills.split(",") if len(s) > 1]

results = hybrid_recommendations(postings, user_skills_embedding, embeddings, user_skills_lst)
results[['company_name', 'title', 'location', 'extracted_skills', 'match_score']]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,company_name,title,description,location,extracted_skills,match_score
60765,Tata Consultancy Services,Developer,Skill: Senior Python/Pyspark Developer\n\nMini...,"Boston, MA","['Python', 'Pyspark', 'NumPy', 'Java', 'PySpar...",0.367171
51759,Diverse Lynx,Python Developer,"Position= Digital : PythonLocation = Boston, M...","Boston, MA","['Python', 'Pyspark', 'NumPy', 'Java', 'PySpar...",0.363824
45090,TekOne IT Services Pvt. Ltd.,Bioinformatics Engineer,Job Title: Bioinformatics Data Curation Engine...,United States,"['Python', 'python', 'statistics']",0.360658
93348,Radiansys Inc.,Python Developer,"Role : Python DeveloperLocation – San Jose ,CA...","San Jose, CA","['Python', 'python', 'Spark']",0.349543
102350,Amtex Systems Inc.,Cognos Developer,"Need Experience in advanced reporting, analyti...","Philadelphia, PA","['Python', 'data analysis']",0.3492
21012,Radley James,Data Scientist - Equity Trading,A small prop shop with around 100 or so people...,New York City Metropolitan Area,"['Python', 'Pandas']",0.3404
61104,"Tanisha Systems, Inc",Core Python Developer,Core Python DeveloperRemote\nSkills\nPrimary S...,United States,"['Python', 'Pyspark', 'SQL']",0.339685
54806,TekJobs,Senior Data Engineer,"Responsibilities:\n- Building, optimizing and ...",United States,"['Python', 'Azure', 'Docker', 'Java', 'python'...",0.33539
23605,Akkodis,Data Scientist,Data Science and Visualization Specialist Cont...,"Austin, Texas Metropolitan Area","['Python', 'Tableau', 'Pyspark', 'Pandas', 'Sp...",0.332215
72116,Tata Consultancy Services,Developer,Min 3+ year of hands of experience of working ...,"Malvern, PA","['Python', 'Pyspark', 'Docker', 'PySpark', 'SQ...",0.330979


In [17]:
# if provide a background summary, e.g. resume contents
user_background_embedding = model.encode([user_background], show_progress_bar=True, convert_to_numpy=True)
user_background_skill_lst = extract_skills(nlp, user_background)

results = hybrid_recommendations(postings, user_skills_embedding, embeddings, user_skills_lst)
results[['company_name', 'title', 'location', 'extracted_skills', 'match_score']]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,company_name,title,description,location,extracted_skills,match_score
60765,Tata Consultancy Services,Developer,Skill: Senior Python/Pyspark Developer\n\nMini...,"Boston, MA","['Python', 'Pyspark', 'NumPy', 'Java', 'PySpar...",0.367171
51759,Diverse Lynx,Python Developer,"Position= Digital : PythonLocation = Boston, M...","Boston, MA","['Python', 'Pyspark', 'NumPy', 'Java', 'PySpar...",0.363824
45090,TekOne IT Services Pvt. Ltd.,Bioinformatics Engineer,Job Title: Bioinformatics Data Curation Engine...,United States,"['Python', 'python', 'statistics']",0.360658
93348,Radiansys Inc.,Python Developer,"Role : Python DeveloperLocation – San Jose ,CA...","San Jose, CA","['Python', 'python', 'Spark']",0.349543
102350,Amtex Systems Inc.,Cognos Developer,"Need Experience in advanced reporting, analyti...","Philadelphia, PA","['Python', 'data analysis']",0.3492
21012,Radley James,Data Scientist - Equity Trading,A small prop shop with around 100 or so people...,New York City Metropolitan Area,"['Python', 'Pandas']",0.3404
61104,"Tanisha Systems, Inc",Core Python Developer,Core Python DeveloperRemote\nSkills\nPrimary S...,United States,"['Python', 'Pyspark', 'SQL']",0.339685
54806,TekJobs,Senior Data Engineer,"Responsibilities:\n- Building, optimizing and ...",United States,"['Python', 'Azure', 'Docker', 'Java', 'python'...",0.33539
23605,Akkodis,Data Scientist,Data Science and Visualization Specialist Cont...,"Austin, Texas Metropolitan Area","['Python', 'Tableau', 'Pyspark', 'Pandas', 'Sp...",0.332215
72116,Tata Consultancy Services,Developer,Min 3+ year of hands of experience of working ...,"Malvern, PA","['Python', 'Pyspark', 'Docker', 'PySpark', 'SQ...",0.330979


In [None]:
"""
streamlit_app.py
----------------
Streamlit interface for Skill-Aware Job Recommender.
"""

import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from src.embedding_utils import generate_embeddings
from src.recommender import hybrid_recommendations

# --- Load resources ---
@st.cache_resource
def load_models():
    embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    job_df = pd.read_csv("data/processed/jobs_with_skills.csv")
    job_vecs = np.load("models/job_embeddings.npy")
    return embed_model, job_df, job_vecs


embed_model, job_df, job_vecs = load_models()

# --- App layout ---
st.title("💼 Skill-Aware Job Recommender")
st.write("Get personalized job recommendations based on your résumé or skills!")

user_input = st.text_area("Paste your résumé or skill list:")
if st.button("Find Jobs") and user_input.strip():
    user_vec = generate_embeddings(embed_model, [user_input])
    user_skills = [s.strip() for s in user_input.split(",") if len(s) > 1]

    results = hybrid_recommendations(job_df, user_vec, job_vecs, user_skills)

    st.subheader("Top Recommendations")
    for _, row in results.iterrows():
        st.markdown(f"**{row['job_title']}** at *{row['company_name']}*")
        st.write(f"Skills: {', '.join(row['extracted_skills'][:8])}")
        st.progress(float(row['match_score']))
