In [103]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import en_core_web_sm
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle
import re

In [2]:
df = pd.read_csv('serpapi_20260124_00_32_00.csv')

In [3]:
df.dtypes

Company         object
Position        object
Location        object
Tags            object
Description     object
URL             object
Salary_Min     float64
Salary_Max     float64
Date           float64
ID              object
dtype: object

In [4]:
nlp = en_core_web_sm.load()

In [5]:
all_text = "".join(df['Description'].astype('str'))

In [6]:
doc = nlp(all_text, disable=['parser', 'ner'])

In [7]:
words = [
    token.lemma_.lower() for token in doc
    if not token.is_stop
    and not token.is_punct
    and token.is_alpha
]

In [10]:
word_freq = Counter(words)

In [16]:
print(f"Total unique words found: {len(word_freq)}")
print("Top 50 most frequent words (Potential Stop Words): ")
df_word_freq = pd.DataFrame(word_freq.most_common(50))
df_word_freq.columns = ['Stop Words', 'Frequency']
print(df_word_freq)

Total unique words found: 4368
Top 50 most frequent words (Potential Stop Words): 
     Stop Words  Frequency
0    experience        720
1      software        653
2          work        536
3          team        474
4   development        347
5        system        343
6       include        305
7        design        280
8      engineer        274
9      position        246
10         role        244
11        build        239
12     employee        235
13  engineering        227
14         time        224
15  opportunity        221
16         base        216
17         year        211
18      product        208
19      support        204
20          pay        202
21      benefit        201
22   technology        198
23      develop        196
24    technical        192
25      provide        189
26  application        188
27     security        185
28        skill        184
29  environment        172
30      require        171
31          job        162
32      service        155

In [104]:
# Define the custom list based on the EDA Analysis Results
custom_stop_words = [
    # Hiring / Generic
    "job", "description", "role", "seek", "position", "candidate", "ideal",
    "opportunity", "join", "client", "company", "new", "type", "remote",
    "experience", "work", "year", "skill", "require", "requirement", "include",
    "need", "strong", "ability", "knowledge", "responsible",

    # HR / Benefits
    "pay", "benefit", "salary", "range", "employee", "disability", "equal",
    "time", "base", "status",

    # Vague Verbs
    "provide", "focus", "drive", "collaborate", "support", "build", "help",
    "create", "maintain", "perform",

    # Generic Tech Context (Too broad to be useful)
    "solution", "system", "environment", "platform", "product", "service",
    "technology", "technical", "application", "industry", "high", "software",
    "engineer", "engineering", "development", "develop",

    # NEW: Recruiting & Process
    "interview", "recruiter", "prospect", "candidate", "select", "review",
    "meet", "touch", "region", "status", "fill", "join", "process", "aspect",

    # NEW: Corporate Fluff & Adjectives
    "impact", "fast", "pace", "inspire", "excite", "excited", "successful",
    "dynamic", "demanding", "challenge", "varied", "culture", "passion",
    "mission", "critical", "commercial", "good", "excellent", "solid",
    "expert", "proficiently", "minimum", "related", "specific", "wide",
    "array", "proven", "track", "record", "strong", "deep", "outcome",
    "real", "thinker", "acuman", "acumen", "important", "fundamental",

    # NEW: Benefits & Legal
    "insurance", "medical", "life", "retirement", "tax", "free", "saving",
    "plan", "healthcare", "incentive", "compensation", "eligible",
    "discretionary", "bonus", "bachelor", "degree", "discipline", "stem",
    "accordance", "applicable", "law", "legal", "compliance", "regulatory",
    "addition", "program", "fund", "funding", "settlement", "investor",

    # NEW: Generic Verbs/Nouns
    "look", "know", "prove", "manage", "solve", "participate", "align",
    "increase", "maximize", "iterate", "define", "spec", "change", "flex",
    "course", "pre", "gen", "desk", "partner", "team", "task", "problem",
    "dissect", "return", "efficiency", "research", "analysis", "power",

    # Generic Nouns
    "skill", "skills", "talent", "level", "following", "access",
    "aspect", "impact", "prospect", "outcome", "change", "course",
    "desk", "gen", "spec", "pre", "market", "interview", "seniority",

    "https", "http", "com", "www", "career", "careers", "apply",
    "website", "location", "locations", "email", "contact",
    "toast", "toasttab", "restaurant"
]

# Update Spacy's default stop words
for word in custom_stop_words:
    lex = nlp.vocab[word]
    lex.is_stop = True

# --- Define Clean Function ---
def clean_text(text):
    """
    Function to return cleaned token using nlp
    :param text: str
    :return: object
    """
    # --- STEP 1: AGGRESSIVE REGEX SANITIZATION ---
    # 1. Lowercase EVERYTHING first (catches HTTPS, Https, etc.)
    text = text.lower()

    # 2. Remove URLs (http/https/www)
    # \S+ means "keep deleting non-whitespace characters until you hit a space"
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    # 3. Remove Email Addresses
    text = re.sub(r'\S+@\S+', '', text)

    doc = nlp(text)

    allowed_pos = ["NOUN", "PROPN"]
    clean_tokens = []

    for token in doc:
        lemma = token.lemma_.lower()

        if(
            not token.is_stop
            and not token.is_punct
            and not token.like_num
            and token.pos_ in allowed_pos
            and lemma not in custom_stop_words
            and len(lemma) > 2
        ):
            clean_tokens.append(lemma)

    return list(set(clean_tokens))

In [105]:
df['cleaned_tokens'] = df['Description'].apply(clean_text)

In [106]:
df['processed_text'] = df['cleaned_tokens'].apply(lambda x: " ".join(x))

In [123]:
# Perform TD-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.85,
    sublinear_tf=True
)

In [124]:
result = tfidf.fit_transform(df['processed_text'])

In [125]:
feature_names = np.array(tfidf.get_feature_names_out())

In [126]:
# 2. Function to get top N words for a single row
def get_top_keywords(row_vector, n=10):
    # Get indices of non-zero elements
    # .toarray() converts the sparse row to a dense row so we can sort
    dense_row = row_vector.toarray().flatten()

    # argsort gives us the indices that would sort the array (low to high)
    # We slice [-n:] to get the last N (highest scores) and reverse [::-1]
    top_indices = dense_row.argsort()[-n:][::-1]

    # Filter out zero-score indices (in case a doc has < N words)
    top_indices = [i for i in top_indices if dense_row[i] > 0]

    return feature_names[top_indices].tolist()

# 3. Apply this to every row in your matrix
# Note: 'result' is your sparse matrix from fit_transform
df['top_keywords'] = [get_top_keywords(result[i]) for i in range(result.shape[0])]

# 4. Check the output
print(df[['Position', 'top_keywords']].head())

                                            Position  \
0                                  Software Engineer   
1  Software Engineer III - ERP Finance (Oracle/PL...   
2                           Software Engineer-Intern   
3  Senior Software Engineer (AI-Enhanced Developm...   
4  Experienced Software Engineer across a wide ar...   

                                        top_keywords  
0  [trading, investment, researcher, teamwork, ri...  
1  [spectrum, test object, reach spectrum, advert...  
2  [reason hour, lockheed master, clearance term,...  
3  [copilot, assistant core, english, usage desig...  
4  [mvc, number typescript, plate, redux, node, b...  


In [127]:
df['top_keywords']

0     [trading, investment, researcher, teamwork, ri...
1     [spectrum, test object, reach spectrum, advert...
2     [reason hour, lockheed master, clearance term,...
3     [copilot, assistant core, english, usage desig...
4     [mvc, number typescript, plate, redux, node, b...
                            ...                        
95    [frontend issue, individual project, customiza...
96    [tool tracking, artifact, test object, lifecyc...
97    [design entertainment, implementation initiati...
98    [dplyr, history working, term iteration, terra...
99    [developer francisco, react infrastructure, py...
Name: top_keywords, Length: 100, dtype: object

In [128]:
df['Description'][10]

"Who We Are Toast is driven by building the restaurant platform that helps restaurants adapt, take control, and get back to what they do best: building the businesses they love. Because of our unique scale, we are positioned to provide mission-critical financial services directly to the hospitality industry. Are you bready* for a change? We are looking for a Senior Front-End Engineer to help us architect and grow the next generation of these essential financial products. This team is specifically focused on empowering restaurant owners to pay their bills with ease, visualize their financial health in real-time, and automate their bookkeeping workflows. You will be building the interfaces that turn complex financial data into actionable business insights. About this roll* (Responsibilities) • Join a collaborative team of Design, Product, and QA to develop robust and scalable applications • Lead architectural discussions and influence the technical roadmap for our financial visualization

In [129]:
df['top_keywords'][10]

['liability',
 'ingredient',
 'humility test',
 'control potential',
 'roll earning',
 'ease',
 'roll',
 'customer humility',
 'practice understanding',
 'recipe penalty']

In [130]:
df['processed_text'][10]

'interaction control potential usd sale philosophy roadmap mentorship member bill quality diversity ease connection bar hiring component react business collaboration kotlin inclusivity today customer humility test exposure generation insight employer person commission discussion code success value states commitment equity detector accommodation goal bookkeeping interface roll earning web tool reward condition model communication responsibility advocate end health feedback api liability practice understanding lifestyle mean employment respect implementation overtime recipe penalty strategy autonomy owner ingredient workflow passionate visualization toaster automation principle domain topic package massachusetts hospitality inclusion user feature community guidance design authenticity scale reliability people datum production flexibility cash united lie'