In [1]:
# Prerequisite (run once if the package is missing): pip install sentence-transformers

In [2]:
# Import necessary libraries
import pandas as pd
import pdfplumber
from sentence_transformers import SentenceTransformer, util
import torch
import nltk
from nltk.corpus import stopwords
import re
import spacy

# Fetch the NLTK corpora this notebook relies on (a no-op when already present).
# 'stopwords' feeds the stop-word set built below.
# NOTE(review): 'wordnet' is not referenced by any code visible in this notebook — confirm it is still needed.
nltk.download('stopwords')
nltk.download('wordnet')


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jessicahsieh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jessicahsieh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Sentence-BERT encoder used to embed CVs and job descriptions
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# English stop words (from NLTK), held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# SpaCy English pipeline, used here for named-entity recognition
nlp = spacy.load('en_core_web_sm')

# Small illustrative skill vocabulary (demonstration only, not exhaustive)
skill_set = {
    "python",
    "data analysis",
    "machine learning",
    "project management",
    "communication",
}



In [4]:
# Define function to extract entities using NER
def extract_entities(text):
    """Return the named entities found in ``text`` as one space-separated string.

    Only entities whose label is one of ORG, PERSON, GPE, NORP, FAC, PRODUCT,
    EVENT, WORK_OF_ART or LANGUAGE are kept; they appear in document order.
    An empty string is returned when nothing qualifies.
    """
    wanted_labels = (
        'ORG', 'PERSON', 'GPE', 'NORP', 'FAC',
        'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE',
    )
    parsed = nlp(text)
    kept = []
    for entity in parsed.ents:
        if entity.label_ in wanted_labels:
            kept.append(entity.text)
    return ' '.join(kept)


In [5]:
# Define function to extract skills from text
def extract_skills(text):
    """Return the known skills mentioned in ``text`` as a space-separated string.

    Each entry of the module-level ``skill_set`` is matched as a whole phrase,
    so multi-word skills such as "data analysis" are found (splitting the text
    into single words, as was done previously, could never match them).
    Matching is case-insensitive and tolerant of repeated whitespace.

    Args:
        text: The text to scan.

    Returns:
        The matched skills joined by single spaces, in alphabetical order so
        the result is deterministic; an empty string when nothing matches.
    """
    # Collapse runs of whitespace so a phrase split across spaces/newlines still matches.
    normalized = ' '.join(text.split())
    found = []
    for skill in sorted(skill_set):
        # Lookarounds act as word boundaries: "python" must not match "pythonic".
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        if re.search(pattern, normalized, flags=re.IGNORECASE):
            found.append(skill)
    return ' '.join(found)

In [6]:
# Define function to preprocess text
def preprocess_text(text):
    """Normalise ``text`` and append the entities and skills detected in it.

    Steps:
      1. Named entities are extracted from the *raw* text. Lower-casing and
         stripping punctuation/stop words first removes the casing and context
         cues a statistical NER model typically relies on, so running NER on
         the cleaned text finds very little.
      2. The text is lower-cased, non-word characters are replaced by spaces,
         and stop words are removed.
      3. Skills are looked up in the cleaned text.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text followed by the (lower-cased) entities and the skills,
        separated by single spaces. Empty parts are omitted, so an empty input
        yields an empty string.
    """
    # Entities come from the untouched input; lower-case them afterwards so the
    # appended tokens are consistent with the cleaned text.
    entities = extract_entities(text).lower()

    # Convert to lowercase and replace every non-word character with a space
    lowered = re.sub(r'\W', ' ', text.lower())

    # Remove stop words
    words = [word for word in lowered.split() if word not in stop_words]
    cleaned_text = ' '.join(words)

    skills = extract_skills(cleaned_text)

    # Join only non-empty parts to avoid stray double/trailing spaces.
    return ' '.join(part for part in (cleaned_text, entities, skills) if part)


In [7]:
# Define function to extract text from a PDF file
def extract_text_from_pdf(file_path, max_pages=None):
    """Extract the text of a PDF, page by page, as a single string.

    Args:
        file_path: Path to the PDF file.
        max_pages: Optional cap on the number of leading pages to read.
            A falsy value (``None`` or ``0``) means "read every page".

    Returns:
        The page texts joined by single spaces, with leading and trailing
        whitespace stripped.
    """
    with pdfplumber.open(file_path) as pdf:
        pages = pdf.pages
        num_pages = len(pages)
        if max_pages:
            num_pages = min(max_pages, num_pages)
        # extract_text() may return None for a page without extractable text
        # (e.g. a scanned image); treat that as an empty page instead of
        # failing on `None + " "`.
        page_texts = [pages[i].extract_text() or "" for i in range(num_pages)]
    return " ".join(page_texts).strip()


In [8]:
# Define function to embed text using Sentence-BERT
def embed_text(text):
    """Encode ``text`` with the module-level Sentence-BERT ``model``.

    The input is handed to ``model.encode`` unchanged, with
    ``convert_to_tensor=True``; whatever that call returns is returned as-is.
    """
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding

In [9]:
# Define function to find top matches between CV and job descriptions
def find_top_matches(cv_text, job_descriptions, top_n=3):
    """Rank job descriptions by cosine similarity to a CV.

    Args:
        cv_text: The (preprocessed) CV text.
        job_descriptions: Iterable of (preprocessed) job-description strings.
        top_n: Number of best matches to return.

    Returns:
        A list of ``(similarity, index)`` tuples, best match first, where
        ``index`` is the position of the description in ``job_descriptions``.
        Ties keep their original relative order. An empty input yields ``[]``.
    """
    job_descriptions = list(job_descriptions)
    # Nothing to rank; this also avoids handing the encoder an empty batch.
    if not job_descriptions:
        return []

    cv_embedding = embed_text(cv_text)
    # Encode all descriptions in ONE batched call rather than one encoder call
    # per description; embed_text passes its argument straight to the encoder.
    # NOTE(review): relies on the encoder accepting a list of strings and
    # returning one embedding per string — confirm against the encoder's docs.
    job_embeddings = embed_text(job_descriptions)

    # A single query against n descriptions gives a (1, n) similarity matrix;
    # take its only row and convert it to plain Python floats.
    similarity_scores = util.pytorch_cos_sim(cv_embedding, job_embeddings)[0].tolist()

    top_indices = sorted(
        range(len(similarity_scores)),
        key=similarity_scores.__getitem__,
        reverse=True,
    )[:top_n]
    return [(similarity_scores[index], index) for index in top_indices]


In [10]:
# Define function to load and prepare job descriptions data
def load_and_prepare_data(csv_path):
    """Read the job-postings CSV and discard incomplete rows.

    A row is dropped when any of ``company_name``, ``title`` or ``description``
    is missing. The surviving rows keep their original index labels.
    """
    required_columns = ['company_name', 'title', 'description']
    postings = pd.read_csv(csv_path)
    return postings.dropna(subset=required_columns)

In [11]:
# Define function to sample job descriptions
def sample_job_descriptions(jobs_df, sample_size=3000, random_state=1):
    """Draw a reproducible random sample of postings and preprocess their text.

    Args:
        jobs_df: DataFrame of postings with a ``description`` column.
        sample_size: Maximum number of rows to sample; capped at ``len(jobs_df)``.
        random_state: Seed passed to ``DataFrame.sample``. Defaults to ``1``,
            the value that was previously hard-coded, so existing callers get
            the same sample as before.

    Returns:
        A new DataFrame holding the sampled rows, with ``description`` replaced
        by its preprocessed form. ``jobs_df`` itself is not modified.
    """
    n_rows = min(sample_size, len(jobs_df))
    sampled = jobs_df.sample(n_rows, random_state=random_state)
    # .assign builds a new frame rather than writing a column into the sample.
    return sampled.assign(description=sampled['description'].apply(preprocess_text))

In [12]:
# Define the main function to run the entire process
def main(cv_path, csv_path):
    """Run the CV-to-job matching pipeline and print the three best matches.

    Loads the postings from ``csv_path``, samples and preprocesses them,
    extracts and preprocesses the CV text from ``cv_path``, ranks the sampled
    descriptions against the CV and prints similarity, title, company and the
    first 200 characters of the description for each of the top matches.
    """
    postings = load_and_prepare_data(csv_path)
    candidates = sample_job_descriptions(postings)

    raw_cv = extract_text_from_pdf(cv_path)
    cv_text = preprocess_text(raw_cv)

    descriptions = candidates['description'].tolist()
    ranked = find_top_matches(cv_text, descriptions, top_n=3)

    separator = "\n" + "="*80 + "\n"
    for score, index in ranked:
        # One positional lookup per match; fields are then read from that row.
        row = candidates.iloc[index]
        company_name = row['company_name']
        title = row['title']
        description = row['description']
        print(f"Similarity: {score * 100:.2f}%")
        print(f"Title: {title}")
        print(f"Company: {company_name}")
        print(f"Description: {description[:200]}...")  # only the first 200 characters
        print(separator)

In [13]:
# Input files: the CV (PDF) and the job-postings CSV, given as relative paths
cv_path = 'Ziyad.pdf'
csv_path = 'postings.csv'

# Execute the matching pipeline end to end
main(cv_path=cv_path, csv_path=csv_path)

FileNotFoundError: [Errno 2] No such file or directory: 'postings.csv'