In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import en_core_web_sm
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle
import re
import os.path
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('../data/adzuna_top25_20260123_20_17_49.csv')

In [3]:
df.dtypes

Company            str
Position           str
Location           str
Tags               str
Description        str
URL                str
Salary_Min     float64
Salary_Max     float64
Date               str
ID               int64
dtype: object

In [4]:
nlp = en_core_web_sm.load()

In [5]:
all_text = "".join(df['Description'].astype('str'))

In [6]:
doc = nlp(all_text, disable=['parser', 'ner'])

In [7]:
words = [
    token.lemma_.lower() for token in doc
    if not token.is_stop
    and not token.is_punct
    and token.is_alpha
]

In [8]:
word_freq = Counter(words)

In [9]:
print(f"Total unique words found: {len(word_freq)}")
print("Top 50 most frequent words (Potential Stop Words): ")
df_word_freq = pd.DataFrame(word_freq.most_common(50))
df_word_freq.columns = ['Stop Words', 'Frequency']
print(df_word_freq)

Total unique words found: 3317
Top 50 most frequent words (Potential Stop Words): 
        Stop Words  Frequency
0              job        677
1      description        636
2             team        591
3             data        569
4         engineer        537
5         solution        438
6             role        423
7         security        406
8           design        405
9       experience        348
10            seek        344
11           datum        330
12           cloud        326
13         support        322
14         network        297
15          system        293
16              ai        287
17            work        280
18         develop        280
19        business        276
20            join        273
21     environment        267
22            type        260
23        platform        257
24         product        256
25         service        250
26  infrastructure        243
27          remote        243
28        software        242
29      technolog

In [10]:
# Define the custom list based on the EDA Analysis Results
custom_stop_words = [
    # Hiring / Generic
    "job", "description", "role", "seek", "position", "candidate", "ideal",
    "opportunity", "join", "client", "company", "new", "type", "remote",
    "experience", "work", "year", "skill", "require", "requirement", "include",
    "need", "strong", "ability", "knowledge", "responsible",

    # HR / Benefits
    "pay", "benefit", "salary", "range", "employee", "disability", "equal",
    "time", "base", "status",

    # Vague Verbs
    "provide", "focus", "drive", "collaborate", "support", "build", "help",
    "create", "maintain", "perform",

    # Generic Tech Context (Too broad to be useful)
    "solution", "system", "environment", "platform", "product", "service",
    "technology", "technical", "application", "industry", "high", "software",
    "engineer", "engineering", "development", "develop",

    # NEW: Recruiting & Process
    "interview", "recruiter", "prospect", "candidate", "select", "review",
    "meet", "touch", "region", "status", "fill", "join", "process", "aspect",

    # NEW: Corporate Fluff & Adjectives
    "impact", "fast", "pace", "inspire", "excite", "excited", "successful",
    "dynamic", "demanding", "challenge", "varied", "culture", "passion",
    "mission", "critical", "commercial", "good", "excellent", "solid",
    "expert", "proficiently", "minimum", "related", "specific", "wide",
    "array", "proven", "track", "record", "strong", "deep", "outcome",
    "real", "thinker", "acuman", "acumen", "important", "fundamental",

    # NEW: Benefits & Legal
    "insurance", "medical", "life", "retirement", "tax", "free", "saving",
    "plan", "healthcare", "incentive", "compensation", "eligible",
    "discretionary", "bonus", "bachelor", "degree", "discipline", "stem",
    "accordance", "applicable", "law", "legal", "compliance", "regulatory",
    "addition", "program", "fund", "funding", "settlement", "investor",

    # NEW: Generic Verbs/Nouns
    "look", "know", "prove", "manage", "solve", "participate", "align",
    "increase", "maximize", "iterate", "define", "spec", "change", "flex",
    "course", "pre", "gen", "desk", "partner", "team", "task", "problem",
    "dissect", "return", "efficiency", "research", "analysis", "power",

    # Generic Nouns
    "skill", "skills", "talent", "level", "following", "access",
    "aspect", "impact", "prospect", "outcome", "change", "course",
    "desk", "gen", "spec", "pre", "market", "interview", "seniority",

    "https", "http", "com", "www", "career", "careers", "apply",
    "website", "location", "locations", "email", "contact",
    "toast", "toasttab", "restaurant"
]

# Update Spacy's default stop words
for word in custom_stop_words:
    lex = nlp.vocab[word]
    lex.is_stop = True

# --- Define Clean Function ---
def clean_text(text):
    """
    Function to return cleaned token using nlp
    :param text: str
    :return: object
    """
    # --- STEP 1: AGGRESSIVE REGEX SANITIZATION ---
    # 1. Lowercase EVERYTHING first (catches HTTPS, Https, etc.)
    text = text.lower()

    # 2. Remove URLs (http/https/www)
    # \S+ means "keep deleting non-whitespace characters until you hit a space"
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    # 3. Remove Email Addresses
    text = re.sub(r'\S+@\S+', '', text)

    doc = nlp(text)

    allowed_pos = ["NOUN", "PROPN"]
    clean_tokens = []

    for token in doc:
        lemma = token.lemma_.lower()

        if(
            not token.is_stop
            and not token.is_punct
            and not token.like_num
            and token.pos_ in allowed_pos
            and lemma not in custom_stop_words
            and len(lemma) > 2
        ):
            clean_tokens.append(lemma)

    return list(set(clean_tokens))

In [11]:
df['cleaned_tokens'] = df['Description'].apply(clean_text)
df['processed_text'] = df['cleaned_tokens'].apply(lambda x: " ".join(x))

In [12]:
# Perform TD-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)
result = tfidf.fit_transform(df['processed_text'])
feature_names = np.array(tfidf.get_feature_names_out())

In [13]:
# 2. Function to get top N words for a single row
def get_top_keywords(row_vector, n=10):
    # Get indices of non-zero elements
    # .toarray() converts the sparse row to a dense row so we can sort
    dense_row = row_vector.toarray().flatten()

    # argsort gives us the indices that would sort the array (low to high)
    # We slice [-n:] to get the last N (highest scores) and reverse [::-1]
    top_indices = dense_row.argsort()[-n:][::-1]

    # Filter out zero-score indices (in case a doc has < N words)
    top_indices = [i for i in top_indices if dense_row[i] > 0]

    return feature_names[top_indices].tolist()

# 3. Apply this to every row in your matrix
# Note: 'result' is your sparse matrix from fit_transform
df['top_keywords'] = [get_top_keywords(result[i]) for i in range(result.shape[0])]

# 4. Check the output
print(df[['Position', 'top_keywords']].head())

      Position                                       top_keywords
0  AI Engineer  [processing operationalization, organization p...
1  AI Engineer  [corporation government, start, start battelle...
2  AI Engineer  [rag case, responsibility automation, vector, ...
3  AI Engineer  [variety integration, project model, infrastru...
4  AI Engineer  [date prototype, nlp refinement, refinement ca...


In [14]:
df['top_keywords']

0      [processing operationalization, organization p...
1      [corporation government, start, start battelle...
2      [rag case, responsibility automation, vector, ...
3      [variety integration, project model, infrastru...
4      [date prototype, nlp refinement, refinement ca...
                             ...                        
995    [date hour, architect hybrid, architecture wor...
996    [date hour, architect hybrid, architecture wor...
997    [date hour, architect hybrid, architecture wor...
998    [date hour, architect hybrid, architecture wor...
999    [date hour, architect hybrid, architecture wor...
Name: top_keywords, Length: 1000, dtype: object

In [15]:
# ---- THE MATCHER -----
class JobMatcher:
    """
    Class to implement the job matching logic
    """

    def __init__(self):
        # Load the model only when the class is initialized
        self.tfidf: TfidfVectorizer
        self.df: pd.DataFrame
        path = os.path.join(os.path.dirname("../models/model.pkl"), "model.pkl")
        with open(path, "rb") as fd:
            self.tfidf, self.tfidf_matrix, self.df = pickle.load(fd)

    def recommend(self, user_text, top_n=10):
        """
        Class method to implement cosine similarity logic to compare the
        "User Vector" against the "Job Vectors" and output a raw match score
        Args:
            user_text:
            top_n:

        Returns:

        """

        # Clean the User Input
        cleaned_input = clean_text(user_text)

        if isinstance(cleaned_input, list):
            cleaned_output = " ".join(cleaned_input)

        # Convert the User Input to Numbers (Vector)
        user_vector = self.tfidf.transform([cleaned_output])

        # Calculate the cosine similarity
        similarities = cosine_similarity(user_vector, self.tfidf_matrix).flatten()

        print(len(similarities))

        # Get Top N Matches
        top_indices = similarities.argsort()[-top_n:][::-1]
        print(top_indices)
        # Format Results
        results = []
        for pos in top_indices:
            score = similarities[pos]
            job = self.df.iloc[pos]
            results.append({
                "title": job["Position"],
                "company": job["Company"],
                "score": float(score),
                "match_reason": f"Match Score: {round(score * 100)}%"
            })

        return results

In [16]:
df_user = pd.read_csv("../data/aijobhunt_db.users.csv")

In [17]:
my_jobmatcher = JobMatcher()

In [18]:
text_columns = [
    'preferences.target_roles[0]',
    'preferences.skills[0]',
    'preferences.skills[1]',
    'preferences.skills[2]',
    'preferences.experience_level'
]
df_user['user_text'] = df_user[text_columns].fillna('').agg(' '.join, axis=1)

In [19]:
df_user['result'] = df_user['user_text'].apply(my_jobmatcher.recommend)

1000
[314 207 204 224 209  68  59 423 428 427]
1000
[208 470 708 751 780 603 203 502 503 134]
1000
[198 838 727 223 182 791  63  62 751 670]
1000
[198 744 251 838 664 221 461 627 505 311]
1000
[777 608 618 790 603 623 630 953 731 164]
1000
[118 367 709 712 953 467 103 749 733 315]
1000
[837 470 232 463 205 468 482 486 485 484]
1000
[627 709  68 475 452 780 462 664  56 450]
1000
[652 621 941 788 504 662 116 361 222 122]
1000
[950 368 357 506 940 941 227 832 505 339]


In [20]:
df_user

Unnamed: 0,_id,name,email,preferences.desired_locations[0],preferences.target_roles[0],preferences.skills[0],preferences.skills[1],preferences.skills[2],preferences.experience_level,preferences.salary_min,preferences.salary_max,created_at,updated_at,user_text,result
0,69825c91e48da677bdeaeb39,Alex Morgan,alex.morgan@test.com,Boston,Research Scientist,Experimental Design,Data Analysis,Python,senior,90000,130000,2026-02-03T20:37:37.173Z,,Research Scientist Experimental Design Data An...,"[{'title': 'Data Engineer', 'company': 'Robert..."
1,69825c91e48da677bdeaeb3a,Dr. Sophia Ramirez,sophia.ramirez@test.com,Chicago,Physician,Patient Care,Diagnostics,Clinical Research,senior,180000,260000,2026-02-03T20:37:37.173Z,,Physician Patient Care Diagnostics Clinical Re...,"[{'title': 'Data Scientist', 'company': 'Mayo ..."
2,69825c91e48da677bdeaeb3b,Marco Bianchi,marco.bianchi@test.com,New York,Executive Chef,Menu Design,Italian Cuisine,Team Leadership,mid,65000,95000,2026-02-03T20:37:37.173Z,,Executive Chef Menu Design Italian Cuisine Tea...,"[{'title': 'Software Engineers', 'company': 'I..."
3,69825c91e48da677bdeaeb3c,Emily Chen,emily.chen@test.com,San Jose,High School Teacher,Curriculum Development,Classroom Management,,mid,60000,90000,2026-02-03T20:37:37.174Z,,High School Teacher Curriculum Development Cla...,"[{'title': 'Software Engineers', 'company': 'I..."
4,69825c91e48da677bdeaeb3d,James O'Connor,james.oconnor@test.com,Denver,Electrician,Wiring,Troubleshooting,Safety Compliance,senior,70000,110000,2026-02-03T20:37:37.174Z,,Electrician Wiring Troubleshooting Safety Comp...,"[{'title': 'Datacenter Technician', 'company':..."
5,69825c91e48da677bdeaeb3e,Lena Fischer,lena.fischer@test.com,Remote,Graphic Designer,Adobe Illustrator,Brand Design,,junior,50000,75000,2026-02-03T20:37:37.174Z,,Graphic Designer Adobe Illustrator Brand Desig...,[{'title': 'Cybersecurity Analyst / Engineer (...
6,69825c91e48da677bdeaeb3f,Daniel Brooks,daniel.brooks@test.com,Atlanta,Marketing Manager,SEO,Campaign Strategy,Analytics,mid,85000,120000,2026-02-03T20:37:37.174Z,,Marketing Manager SEO Campaign Strategy Analyt...,"[{'title': 'Mobile App Developer', 'company': ..."
7,69825c91e48da677bdeaeb40,Aisha Khan,aisha.khan@test.com,Las Vegas,Hotel Operations Manager,Customer Service,Staff Management,,senior,75000,115000,2026-02-03T20:37:37.174Z,,Hotel Operations Manager Customer Service Staf...,"[{'title': 'Network Engineer', 'company': 'Lei..."
8,69825c91e48da677bdeaeb41,Robert Greene,robert.greene@test.com,Washington DC,Policy Analyst,Policy Research,Writing,Data Interpretation,mid,80000,120000,2026-02-03T20:37:37.174Z,,Policy Analyst Policy Research Writing Data In...,[{'title': 'UNPAID VOLUNTEER - Blockchain Soft...
9,69825c91e48da677bdeaeb42,Chris Johnson,chris.johnson@test.com,Seattle,Cloud Engineer,AWS,Terraform,Python,senior,140000,200000,2026-02-03T20:37:37.174Z,,Cloud Engineer AWS Terraform Python senior,"[{'title': 'Azure SRE', 'company': 'TEKsystems..."


In [21]:
df_user['result']

0    [{'title': 'Data Engineer', 'company': 'Robert...
1    [{'title': 'Data Scientist', 'company': 'Mayo ...
2    [{'title': 'Software Engineers', 'company': 'I...
3    [{'title': 'Software Engineers', 'company': 'I...
4    [{'title': 'Datacenter Technician', 'company':...
5    [{'title': 'Cybersecurity Analyst / Engineer (...
6    [{'title': 'Mobile App Developer', 'company': ...
7    [{'title': 'Network Engineer', 'company': 'Lei...
8    [{'title': 'UNPAID VOLUNTEER - Blockchain Soft...
9    [{'title': 'Azure SRE', 'company': 'TEKsystems...
Name: result, dtype: object