Implementation of the Prototype

Input Module

In [2]:
def load_job_description(file_path):
    try:
        with open(file_path, 'r') as file:
            text = file.read()
        return text
    except Exception as e:
        print(f"Error: {e}")
        return None

# Example Usage
#job_description = load_job_description("/content/sample_data/data_scientist_jobs.txt")
job_description = load_job_description("/home/lateefat/Automated Search Strategy Generation/data/prototype_test.txt")
print(job_description)


Job Title: Data Scientist

Location: New York

Job Description: XYZ Tech Solutions is seeking a Data Scientist to join our growing analytics team. The ideal candidate will have strong technical expertise in Python, SQL, and Machine Learning frameworks, along with experience in building predictive models and data-driven solutions.

Skills: Python, SQL, Machine Learning, TensorFlow, Scikit-learn, AWS, Tableau.



Preprocessing Module

In [3]:
import re
import spacy

def preprocess_text(text):
    nlp = spacy.load("en_core_web_sm")
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove special characters
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return tokens

# Example
cleaned_tokens = preprocess_text(job_description)
print(cleaned_tokens)


['job', 'title', 'datum', 'scientist', '\n\n', 'location', 'new', 'york', '\n\n', 'job', 'description', 'xyz', 'tech', 'solution', 'seek', 'data', 'scientist', 'join', 'grow', 'analytic', 'team', 'ideal', 'candidate', 'strong', 'technical', 'expertise', 'python', 'sql', 'machine', 'learning', 'framework', 'experience', 'build', 'predictive', 'model', 'datadriven', 'solution', '\n\n', 'skill', 'python', 'sql', 'machine', 'learn', 'tensorflow', 'scikitlearn', 'aws', 'tableau', '\n']


Keyword Extraction Module

Named Entity Recognition (NER)

In [4]:
def extract_entities(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    entities = {"Job Title": [], "Skills": [], "Location": []}

    for ent in doc.ents:
        if ent.label_ == "ORG" or ent.label_ == "TITLE":
            entities["Job Title"].append(ent.text)
        elif ent.label_ == "GPE":  # Geographical Entity
            entities["Location"].append(ent.text)
        elif ent.label_ == "SKILL" or "NN":  # Add domain-specific labels
            entities["Skills"].append(ent.text)
    return entities

# Example Usage
entities = extract_entities(job_description)
print("Extracted Entities:", entities)


Extracted Entities: {'Job Title': ['Data Scientist', 'SQL', 'SQL', 'AWS'], 'Skills': ['Machine Learning', 'Machine Learning'], 'Location': ['New York', 'Python', 'Tableau']}


TF-IDF for Keyword Extraction

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords_tfidf(text, top_n=5):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    scores = zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0])
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    return [word for word, score in sorted_scores[:top_n]]

# Example Usage
keywords = extract_keywords_tfidf(job_description)
print("Top Keywords:", keywords)


Top Keywords: ['data', 'job', 'learning', 'machine', 'python']


Contextual Embeddings (BERT)

In [6]:
from transformers import pipeline

def summarize_text_bert(text):
    summarizer = pipeline("summarization")
    summary = summarizer(text, max_length=20, min_length=10, do_sample=False)
    return summary[0]['summary_text']

# Example Usage
summary = summarize_text_bert(job_description)
print("BERT Summary:", summary)


  from .autonotebook import tqdm as notebook_tqdm
2025-02-24 13:06:07.350204: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-24 13:06:07.371801: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-24 13:06:07.554256: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-24 13:06:07.719703: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740398767.872366   43918 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has

KeyboardInterrupt: 

Query Generation Module

In [14]:
def generate_search_query(entities, keywords):
    job_title = ' AND '.join([f'"{title}"' for title in entities["Job Title"]])
    skills = ' OR '.join([f'"{skill}"' for skill in entities["Skills"] + keywords])
    location = ' AND '.join([f'"{loc}"' for loc in entities["Location"]])
    return f"({job_title}) AND ({skills}) AND ({location})"

# Example Usage
search_query = generate_search_query(entities, keywords)
print("Generated Query:", search_query)


Generated Query: ("XYZ Tech Solutions" AND "XYZ Tech Solutions" AND "Data Scientist" AND "SQL" AND "Develop" AND "EDA" AND "SQL" AND "Communicate" AND "Power BI" AND "AI" AND "SQL" AND "Machine Learning" AND "PyTorch" AND "Power BI" AND "Data Science, Computer Science, Statistics" AND "NLP" AND "Spark" AND "Hadoop") AND ("Machine Learning" OR "Pandas" OR "Required Skills" OR "TensorFlow" OR "Matplotlib" OR "2+ years" OR "$100,000 - $130,000" OR "data" OR "experience" OR "learning" OR "machine" OR "skills") AND ("New York" AND "Python" AND "Tableau" AND "Python" AND "Tableau" AND "New York")
