In [1]:
import pandas as pd

In [2]:
job_data_path = 'D:/7th_sem/Recommendation systems/job_recommendation/data/cleaned_job_data.csv'
jobs_df = pd.read_csv(job_data_path)
print(jobs_df)

       Job.ID  Provider Status  \
0         111         1   open   
1         113         1   open   
2         117         1   open   
3         121         1   open   
4         127         1   open   
...       ...       ...    ...   
81588      82         1   open   
81589      83         1   open   
81590      84         1   open   
81591      88         1   open   
81592      92         1   open   

                                                    Slug  \
0                        palo-alto-ca-tacolicious-server   
1        san-francisco-ca-claude-lane-kitchen-staff-chef   
2      san-francisco-ca-machka-restaurants-corp-barte...   
3                      brisbane-ca-teriyaki-house-server   
4      los-angeles-ca-rosa-mexicano-sunset-kitchen-st...   
...                                                  ...   
81588  san-francisco-ca-national-japanese-american-hi...   
81589       larkspur-ca-emporio-rulli-kitchen-staff-chef   
81590                san-francisco-ca-onigilly-driv

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#nltk.download('stopwords')
#nltk.download('wordnet')

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text (split into words)
    tokens = text.split()
    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join the tokens back into a single string
    return ' '.join(tokens)

def process_in_batches(df, batch_size):
    processed_texts = []

    for start in range(0, len(df), batch_size):
        end = min(start + batch_size, len(df))
        batch = df.iloc[start:end]
        
        # Apply preprocessing to the 'Job.Description' column
        processed_batch = batch['Job.Description'].apply(preprocess_text)
        processed_texts.extend(processed_batch)  # Collect results

    return processed_texts

# Define your batch size
batch_size = 1000 

# Process the DataFrame in batches
jobs_df['preprocessed_job_description'] = process_in_batches(jobs_df, batch_size)

# Display the DataFrame with the new column
print(jobs_df[['Job.Description', 'preprocessed_job_description']].head())


                                     Job.Description  \
0  Tacolicious' first Palo Alto store just opened...   
1   \r\n\r\nNew French Brasserie in S.F. Financia...   
2  We are a popular Mediterranean wine bar and re...   
3   ● Serve food/drinks to customers in a profess...   
4  Located at the heart of Hollywood, we are one ...   

                        preprocessed_job_description  
0  tacolicious first palo alto store opened recen...  
1  new french brasserie sf financial district see...  
2  popular mediterranean wine bar restaurant fina...  
3  serve fooddrinks customer professional manner ...  
4  located heart hollywood one popular mexican pl...  


In [9]:
jobs_df.to_csv("jobs_df_preprocessed.csv")

In [10]:
import pandas as pd
df = pd.read_csv("jobs_df_preprocessed.csv")
print(df.head(5))

   Unnamed: 0  Job.ID  Provider Status  \
0           0     111         1   open   
1           1     113         1   open   
2           2     117         1   open   
3           3     121         1   open   
4           4     127         1   open   

                                                Slug  \
0                    palo-alto-ca-tacolicious-server   
1    san-francisco-ca-claude-lane-kitchen-staff-chef   
2  san-francisco-ca-machka-restaurants-corp-barte...   
3                  brisbane-ca-teriyaki-house-server   
4  los-angeles-ca-rosa-mexicano-sunset-kitchen-st...   

                                         Title            Position  \
0                         Server @ Tacolicious              Server   
1             Kitchen Staff/Chef @ Claude Lane  Kitchen Staff/Chef   
2         Bartender @ Machka Restaurants Corp.           Bartender   
3                      Server @ Teriyaki House              Server   
4  Kitchen Staff/Chef @ Rosa Mexicano - Sunset  Kitchen Staf

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.sparse import vstack
import numpy as np

# Assuming your DataFrame is named df
# Step 1: Ensure all relevant columns are strings
df['Slug'] = df['Slug'].astype(str)
df['Title'] = df['Title'].astype(str)
df['Position'] = df['Position'].astype(str)
df['Company'] = df['Company'].astype(str)
df['preprocessed_job_description'] = df['preprocessed_job_description'].astype(str)

# Step 2: Combine the relevant text columns
df['combined_text'] = (df['Slug'] + ' ' + 
                       df['Title'] + ' ' + 
                       df['Position'] + ' ' + 
                       df['Company'] + ' ' + 
                       df['preprocessed_job_description']).str.strip()  # Remove any leading/trailing whitespace

# Step 3: Define batch size and batch processing for TF-IDF
batch_size = 2000
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features based on memory

# To store the TF-IDF vectors from all batches
tfidf_matrix = []

for start in range(0, len(df), batch_size):
    end = min(start + batch_size, len(df))
    batch_text = df['combined_text'].iloc[start:end]
    batch_tfidf = vectorizer.fit_transform(batch_text)  # Fit-transform batch
    tfidf_matrix.append(batch_tfidf)

# Step 4: Stack the sparse matrices into one large sparse matrix
tfidf_matrix = vstack(tfidf_matrix)

# Step 5: Apply K-Means Clustering
kmeans = KMeans(n_clusters=30, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)  # KMeans can accept sparse matrix directly

# Step 6: Add cluster labels to the DataFrame
df['cluster'] = clusters

# Step 7: Save the DataFrame with clusters to CSV
df.to_csv('jobs_with_clusters.csv', index=False)

print("Clustering complete! Results saved to 'jobs_with_clusters.csv'")

Clustering complete! Results saved to 'jobs_with_clusters.csv'


In [3]:
import pandas as pd
job_clusters_df = pd.read_csv("jobs_with_clusters.csv")
print(job_clusters_df.head(5))

   Unnamed: 0  Job.ID  Provider Status  \
0           0     111         1   open   
1           1     113         1   open   
2           2     117         1   open   
3           3     121         1   open   
4           4     127         1   open   

                                                Slug  \
0                    palo-alto-ca-tacolicious-server   
1    san-francisco-ca-claude-lane-kitchen-staff-chef   
2  san-francisco-ca-machka-restaurants-corp-barte...   
3                  brisbane-ca-teriyaki-house-server   
4  los-angeles-ca-rosa-mexicano-sunset-kitchen-st...   

                                         Title            Position  \
0                         Server @ Tacolicious              Server   
1             Kitchen Staff/Chef @ Claude Lane  Kitchen Staff/Chef   
2         Bartender @ Machka Restaurants Corp.           Bartender   
3                      Server @ Teriyaki House              Server   
4  Kitchen Staff/Chef @ Rosa Mexicano - Sunset  Kitchen Staf

In [4]:
import re
import json
import requests
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from time import time

# Fetch and process the keywords from the raw GitHub Gist URL
gist_url = "https://gist.githubusercontent.com/theikkila/596d1265ae086c6d1c5e/raw/keywords.json"
response = requests.get(gist_url)

# Process the keywords as JSON
skills_list = json.loads(response.text)
skills_list = [keyword.strip().lower() for keyword in skills_list if keyword.strip()]
skills_set = set(skills_list)  # Convert to a set for faster lookups

# Precompiled regex for faster performance
skills_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, skills_set)) + r')\b')

# Function to extract skills from a single text entry
def extract_skills(text):
    start_time = time()  # Start timing the extraction
    cleaned_text = text.lower()  # Convert text to lowercase
    cleaned_text = re.sub(r'[^\w\s]', ' ', cleaned_text)  # Remove punctuation
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Clean spaces
    # Find all skills in the text using the compiled regex
    found_skills = skills_pattern.findall(cleaned_text)
    execution_time = time() - start_time  # End timing
    print(f"extract_skills executed in {execution_time:.2f} seconds.")  # Profiling output
    return list(set(found_skills))  # Return unique found skills

# Function for batch processing with parallel execution
def extract_skills_batch_parallel(texts):
    start_time = time()  # Start timing batch processing
    extracted_skills_list = Parallel(n_jobs=-1)(delayed(extract_skills)(text) for text in texts)
    execution_time = time() - start_time  # End timing
    print(f"extract_skills_batch_parallel executed in {execution_time:.2f} seconds.")  # Profiling output
    return extracted_skills_list  # Return list of lists

# Function to create skill-cluster mapping and add it as a new column
def map_skills_to_clusters(job_data, batch_size=2000):
    num_batches = (len(job_data) + batch_size - 1) // batch_size  # Calculate number of batches
    all_mapped_skills = []  # List to hold all extracted skills for each job

    for i in range(num_batches):
        start_time = time()  # Start timing each batch
        batch_data = job_data.iloc[i * batch_size : (i + 1) * batch_size]  # Get the current batch
        texts = batch_data['combined_text'].tolist()  # Convert the column to a list
        
        # Batch process the extraction of skills in parallel
        mapped_skills = extract_skills_batch_parallel(texts)
        all_mapped_skills.extend(mapped_skills)  # Append the extracted skills to the main list

        execution_time = time() - start_time  # End timing
        print(f"Processed batch {i + 1}/{num_batches} in {execution_time:.2f} seconds.")  # Profiling output
    
    # Add the results to the DataFrame as a new column
    job_data['mapped_skills'] = all_mapped_skills
    
    return job_data

# Start profiling
start_time = time()

# Add the skill mappings to the DataFrame
job_clusters_df = map_skills_to_clusters(job_clusters_df, batch_size=2000)

# Save the updated DataFrame to CSV
job_clusters_df.to_csv('updated_job_clusters.csv', index=False)

end_time = time()
execution_time = end_time - start_time

print(f"Updated job listings with mapped skills saved to 'updated_job_clusters.csv'.")
print(f"Total execution time: {execution_time:.2f} seconds")


extract_skills_batch_parallel executed in 6.18 seconds.
Processed batch 1/41 in 6.18 seconds.
extract_skills_batch_parallel executed in 6.07 seconds.
Processed batch 2/41 in 6.07 seconds.
extract_skills_batch_parallel executed in 5.46 seconds.
Processed batch 3/41 in 5.46 seconds.
extract_skills_batch_parallel executed in 5.14 seconds.
Processed batch 4/41 in 5.14 seconds.
extract_skills_batch_parallel executed in 4.69 seconds.
Processed batch 5/41 in 4.70 seconds.
extract_skills_batch_parallel executed in 4.62 seconds.
Processed batch 6/41 in 4.62 seconds.
extract_skills_batch_parallel executed in 4.12 seconds.
Processed batch 7/41 in 4.12 seconds.
extract_skills_batch_parallel executed in 4.71 seconds.
Processed batch 8/41 in 4.71 seconds.
extract_skills_batch_parallel executed in 4.75 seconds.
Processed batch 9/41 in 4.75 seconds.
extract_skills_batch_parallel executed in 4.75 seconds.
Processed batch 10/41 in 4.75 seconds.
extract_skills_batch_parallel executed in 4.97 seconds.
Pro

In [1]:
import pandas as pd
job_skills_df=pd.read_csv("updated_job_clusters.csv")
print(job_skills_df.head(5))

   Unnamed: 0  Job.ID  Provider Status  \
0           0     111         1   open   
1           1     113         1   open   
2           2     117         1   open   
3           3     121         1   open   
4           4     127         1   open   

                                                Slug  \
0                    palo-alto-ca-tacolicious-server   
1    san-francisco-ca-claude-lane-kitchen-staff-chef   
2  san-francisco-ca-machka-restaurants-corp-barte...   
3                  brisbane-ca-teriyaki-house-server   
4  los-angeles-ca-rosa-mexicano-sunset-kitchen-st...   

                                         Title            Position  \
0                         Server @ Tacolicious              Server   
1             Kitchen Staff/Chef @ Claude Lane  Kitchen Staff/Chef   
2         Bartender @ Machka Restaurants Corp.           Bartender   
3                      Server @ Teriyaki House              Server   
4  Kitchen Staff/Chef @ Rosa Mexicano - Sunset  Kitchen Staf

In [7]:
import pandas as pd
import numpy as np
import time
from gensim.models import Word2Vec

# Load the job skills dataset
job_skills_df = pd.read_csv("updated_job_clusters.csv")

# Preprocess the job descriptions (split into words)
# Make sure that all descriptions are strings before splitting
job_skills_df['preprocessed_job_description'] = job_skills_df['preprocessed_job_description'].astype(str)
sentences = [desc.split() for desc in job_skills_df['preprocessed_job_description']]

# Start timing for embedding generation
start_time = time.time()

# Train a Word2Vec model (you can adjust the parameters as needed)
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Generate embeddings for each description (by averaging word vectors)
def get_embedding(desc):
    words = desc.split()
    # Get the word vectors for the words in the description
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    
    if word_vectors:
        return np.mean(word_vectors, axis=0)  # Average the word vectors
    else:
        return np.zeros(model.vector_size)  # Return zero vector if no words are in the model

job_skills_df['embeddings'] = job_skills_df['preprocessed_job_description'].apply(get_embedding)

# Save the updated DataFrame with embeddings to a new CSV file
job_skills_df.to_csv("job_clusters_with_embeddings.csv", index=False)

# End timing
end_time = time.time()
print(f"Total time taken for processing: {end_time - start_time:.2f} seconds")


Total time taken for processing: 136.44 seconds


In [13]:
# Save the trained model
model.save("word2vec_model.bin")  # Save the model to a file

In [9]:
import pandas as pd
df_embeddings=pd.read_csv("job_clusters_with_embeddings.csv")
print(df_embeddings)


import pandas as pd
import numpy as np
# Function to convert the string representation to a NumPy array
def parse_embedding(embedding_str):
    # Remove the brackets and split by whitespace
    clean_str = embedding_str.strip('[]').replace("\n", "")
    # Convert the space-separated values into a NumPy array
    return np.fromstring(clean_str, sep=' ')

# Apply the function to the embeddings column
df_embeddings['embeddings'] = df_embeddings['embeddings'].apply(parse_embedding)

# Create a NumPy array from the embeddings
job_embeddings = np.array(df_embeddings['embeddings'].tolist())

# Verify the shape of job_embeddings
num_jobs, embedding_dimension = job_embeddings.shape
print(f"Number of Jobs: {num_jobs}, Embedding Dimension: {embedding_dimension}")
print(job_embeddings[:5])  # Print the first 5 embeddings to confirm


       Unnamed: 0  Job.ID  Provider Status  \
0               0     111         1   open   
1               1     113         1   open   
2               2     117         1   open   
3               3     121         1   open   
4               4     127         1   open   
...           ...     ...       ...    ...   
81588       81588      82         1   open   
81589       81589      83         1   open   
81590       81590      84         1   open   
81591       81591      88         1   open   
81592       81592      92         1   open   

                                                    Slug  \
0                        palo-alto-ca-tacolicious-server   
1        san-francisco-ca-claude-lane-kitchen-staff-chef   
2      san-francisco-ca-machka-restaurants-corp-barte...   
3                      brisbane-ca-teriyaki-house-server   
4      los-angeles-ca-rosa-mexicano-sunset-kitchen-st...   
...                                                  ...   
81588  san-francisco-ca-nat

In [5]:
import requests
import json
from gensim.models import Word2Vec

# Fetch the keywords from the raw GitHub Gist URL (raw version of the file)
gist_url = "https://gist.githubusercontent.com/theikkila/596d1265ae086c6d1c5e/raw/keywords.json"
response = requests.get(gist_url)

# Process the keywords as JSON
skills_list = json.loads(response.text)  # Parse the JSON
skills_list = [keyword.strip().lower() for keyword in skills_list if keyword.strip()]

# Prepare data for training Word2Vec
# Here we create sentences by treating each skill as a separate word
training_data = [[skill] for skill in skills_list]

# Train the Word2Vec model
model = Word2Vec(sentences=training_data, vector_size=100, window=5, min_count=1, workers=4)

# Save the model
model.save("word2vec_model.bin")

print("Word2Vec model trained and saved successfully!")


Word2Vec model trained and saved successfully!


In [17]:
import panel as pn
import PyPDF2
from io import BytesIO
import requests
import re
import json
import numpy as np
from gensim.models import KeyedVectors  # Import for loading pre-trained Word2Vec model

# Step 1: Fetch the keywords from the raw GitHub Gist URL (raw version of the file)
gist_url = "https://gist.githubusercontent.com/theikkila/596d1265ae086c6d1c5e/raw/keywords.json"
response = requests.get(gist_url)

# Step 2: Process the keywords as JSON
skills_list = json.loads(response.text)  # Parse the JSON
skills_list = [keyword.strip().lower() for keyword in skills_list if keyword.strip()]

# Load pre-trained Word2Vec model (update with your actual model path)
model_path = "word2vec_model.bin"  # Update with the path where you saved the model
model = KeyedVectors.load(model_path)  # Load the model

# Function to extract skills from text
def extract_skills(text):
    extracted_skills = set()
    
    # Clean the extracted text by converting to lowercase, removing extra spaces, and removing punctuation
    clean_text = text.lower()  # Convert text to lowercase for case-insensitive matching
    clean_text = re.sub(r'[^\w\s]', ' ', clean_text)  # Remove punctuation
    clean_text = re.sub(r'\s+', ' ', clean_text)  # Replace multiple spaces with a single space

    # Try matching each skill in the clean text
    for skill in skills_list:
        if re.search(r'\b' + re.escape(skill) + r'\b', clean_text):
            extracted_skills.add(skill)
    
    return list(extracted_skills)

def get_embeddings(skills):
    embeddings = []
    for skill in skills:
        print(f"Checking skill: {skill}")  # Debug: Print each skill being checked
        if skill in model.wv.key_to_index:  # Check if skill is in the model
            embedding = model.wv[skill]  # Get the embedding
            embeddings.append(embedding)
            print(f"Embedding for '{skill}': {embedding}")  # Debug: Print embedding
        else:
            print(f"'{skill}' not found in the model.")  # Debug: Print missing skill
            embeddings.append(np.zeros(model.vector_size))  # Zero vector if skill is not in the model
    return embeddings

def display_embeddings(skills_found, stored_embeddings):
    """Display embeddings in the output area."""
    if stored_embeddings:
        embeddings_output.value = f"**Skill Embeddings (stored successfully):**\n\n" + \
            '\n'.join(f"- {skill}: {embedding.tolist()[:5]} (total {len(embedding)})" for skill, embedding in zip(skills_found, stored_embeddings))
    else:
        embeddings_output.value = "No embeddings found."

def handle_file_upload(event):
    global stored_embeddings  # Declare the variable as global to modify it
    uploaded_file = file_input.value
    if uploaded_file:
        try:
            # Read the PDF file
            pdf_reader = PyPDF2.PdfReader(BytesIO(uploaded_file))
            text = ''
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + '\n'

            # Extract skills from the text
            extracted_skills = extract_skills(text)
            print("Skills Found:", extracted_skills)  # Debug: Print found skills

            # Display the skills in the output area
            if extracted_skills:
                skills_output.value = f"**Extracted Skills:**\n\n" + '\n'.join(f"- {skill}" for skill in extracted_skills)

                # Calculate embeddings for the extracted skills
                stored_embeddings = get_embeddings(extracted_skills)
                print("Stored Embeddings:", stored_embeddings)  # Debug: Print stored embeddings

                # Call the display function to show the embeddings immediately
                display_embeddings(extracted_skills, stored_embeddings)

                # Now that we have extracted skills, we can run the job recommendations
                recommended_jobs = recommend_jobs(extracted_skills)
                
                # Display the recommended jobs
                recommended_jobs_output.value = f"**Recommended Jobs:**\n\n" + '\n'.join(
                    [f"- {job['Title']} at {job['Company']} ({job['City']})\n  Fuzzy Score: {job['fuzzy_score']}, Job Readiness Score: {job['job_readiness_score']}" for index, job in recommended_jobs.iterrows()]
                )

            else:
                skills_output.value = "Extracted Skills = []"
                print("No skills found.")  # Debug: Print message if no skills are found

        except Exception as e:
            skills_output.value = f"Error reading file: {str(e)}"

# Create a new output area for displaying the recommended jobs
recommended_jobs_output = pn.pane.Markdown("", width=600)

# Update the dashboard layout to include the recommended jobs section
dashboard = pn.Column(
    file_input,
    extract_button,
    skills_output,
    embeddings_output,  # Add the embeddings output area
    recommended_jobs_output  # Add the recommended jobs output area
)

# Display the dashboard
dashboard.show()


Launching server at http://localhost:62209


<panel.io.server.Server at 0x1c76ebbd210>

Skills Found: ['courses', 'sql', 'ifrs', 'financial statements', 'auditing', 'marketing', 'golf', 'insurance', 'valuation', 'catering', 'expense reports', 'international', 'balance sheet', 'financial reporting', 'accruals', 'health', 'cash', 'assurance', 'us gaap', 'liability', 'reporting', 'certifications', 'income tax', 'documentation', 'photography', 'accounts receivable', 'cms', 'audit', 'telecommunications', 'gaap', 'fashion', 'translation', 'i', 'health insurance', 'hyperion', 'education', 'accounting', 'profit', 'investors', 'impact', 'excel', 'shipping', 'history', 'accounts payable', 'r', 'soccer', 'rates', 'currency exchange', 'tax', 'payroll', 'communication', 'tax returns', 'essbase', 'property', 'atm', 'clarity', 'arrangements', 'parts', 'international tax', 'internal', 'currency', 'administration', 'training', 'quickbooks', 'management', 'due diligence', 'internal controls', 'coordination', 'balance', 'travel', 'analysis', 'general ledger', 'bookkeeping', 'business']
Chec

In [18]:
import faiss
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
import json
import re
# Create the FAISS index
embedding_dimension = len(df_embeddings['embeddings'][0])  # Assuming all embeddings are of the same dimension
index = faiss.IndexFlatIP(embedding_dimension)  # Using inner product for cosine similarity

# Normalize the job embeddings before adding to the FAISS index
job_embeddings = np.array(df_embeddings['embeddings'].tolist()).astype('float32')
faiss.normalize_L2(job_embeddings)  # Normalize to unit length
index.add(job_embeddings) 

# Search for top N jobs
k = 100  # Increase or decrease based on your need
D, I = index.search(np.mean(job_embeddings, axis=0).reshape(1, -1), k)

# Fuzzy matching function
def calculate_fuzzy_scores(extracted_skills, mapped_skills):
    scores = []
    for job_skills in mapped_skills:
        job_skills_list = eval(job_skills) if isinstance(job_skills, str) else job_skills
        max_score = max([fuzz.token_set_ratio(skill, job_skill) for skill in extracted_skills for job_skill in job_skills_list], default=0)
        scores.append(max_score)
    return scores

# Apply fuzzy matching
top_jobs = df_embeddings.iloc[I[0]].copy()  # Use .copy() to avoid SettingWithCopyWarning
top_jobs.loc[:, 'fuzzy_score'] = calculate_fuzzy_scores(extracted_skills, top_jobs['mapped_skills'])
top_jobs.loc[:, 'job_readiness_score'] = top_jobs['fuzzy_score'] + (D[0] * 100)

# MMR function
def mmr(recommendations, diversity_param=0.5):
    selected_jobs = []
    while len(selected_jobs) < 10 and len(recommendations) > 0:
        scores = recommendations['job_readiness_score'].values
        selected_index = np.argmax(scores)
        selected_jobs.append(recommendations.iloc[selected_index])

        # Drop the selected job from recommendations using .iloc
        recommendations = recommendations.drop(recommendations.index[selected_index]).reset_index(drop=True)

        if len(recommendations) == 0:
            break
        
        similarities = []
        for _, job in recommendations.iterrows():
            job_vec = np.array(job['embeddings'])  # Convert to numpy array if needed
            selected_vec = np.array(selected_jobs[-1]['embeddings'])  # Same here
            mmr_score = (1 - diversity_param) * job['job_readiness_score'] - diversity_param * np.dot(job_vec, selected_vec)
            similarities.append(mmr_score)

        recommendations['job_readiness_score'] = similarities

    return pd.DataFrame(selected_jobs)

# Apply MMR
recommended_jobs = mmr(top_jobs)

# Display results
print("Top Job Recommendations:")
print(recommended_jobs[['Title', 'Position', 'Company', 'City', 'fuzzy_score', 'job_readiness_score']])

NameError: name 'extracted_skills' is not defined

In [24]:
import faiss
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz

# Assuming df_embeddings is defined and populated with job embeddings and mapped skills

# Create the FAISS index
embedding_dimension = len(df_embeddings['embeddings'][0])  # Assuming all embeddings are of the same dimension
index = faiss.IndexFlatIP(embedding_dimension)  # Using inner product for cosine similarity

# Normalize the job embeddings before adding to the FAISS index
job_embeddings = np.array(df_embeddings['embeddings'].tolist()).astype('float32')
faiss.normalize_L2(job_embeddings)  # Normalize to unit length
index.add(job_embeddings)

# Improved fuzzy matching function
def calculate_fuzzy_scores(extracted_skills, mapped_skills):
    scores = []
    for job_skills in mapped_skills:
        job_skills_list = eval(job_skills) if isinstance(job_skills, str) else job_skills
        skill_scores = [fuzz.token_set_ratio(skill, job_skill) for skill in extracted_skills for job_skill in job_skills_list]
        max_score = max(skill_scores, default=0)
        scores.append(max(max_score, 0))  # Ensure scores are non-negative
    return scores

# Function to get job recommendations based on extracted skills
def recommend_jobs(extracted_skills, top_n=10):
    # Calculate fuzzy scores for all jobs based on extracted skills
    top_jobs = df_embeddings.copy()  # Copy original dataframe for fuzzy matching
    top_jobs['fuzzy_score'] = calculate_fuzzy_scores(extracted_skills, top_jobs['mapped_skills'])

    # Calculate job readiness score
    top_jobs['job_readiness_score'] = (top_jobs['fuzzy_score'] * 0.7) + (1 - top_jobs['fuzzy_score'] / 100) * 0.3  # Modify weightings as needed

    # Sort jobs by readiness score
    top_jobs = top_jobs.sort_values(by='job_readiness_score', ascending=False)

    # Ensure diversity in recommendations using MMR
    selected_jobs = []
    diversity_param = 0.5

    while len(selected_jobs) < top_n and len(top_jobs) > 0:
        # Get current scores
        scores = top_jobs['job_readiness_score'].values
        selected_index = np.argmax(scores)
        selected_jobs.append(top_jobs.iloc[selected_index])

        # Drop the selected job from recommendations
        top_jobs = top_jobs.drop(top_jobs.index[selected_index]).reset_index(drop=True)

        if len(top_jobs) == 0:
            break
        
        # Update scores based on diversity
        similarities = []
        selected_vec = np.array(selected_jobs[-1]['embeddings'])

        for _, job in top_jobs.iterrows():
            job_vec = np.array(job['embeddings'])
            # Modify the MMR score calculation
            mmr_score = (1 - diversity_param) * job['job_readiness_score'] - diversity_param * np.dot(job_vec, selected_vec)
            similarities.append(mmr_score)

        top_jobs['job_readiness_score'] = similarities

        # Re-sort jobs by updated readiness score to get the next best job
        top_jobs = top_jobs.sort_values(by='job_readiness_score', ascending=False)

    return pd.DataFrame(selected_jobs)

In [25]:
recommended_jobs = recommend_jobs(extracted_skills)

# Output the recommended jobs
print("Recommended Jobs:")
print(recommended_jobs[['Title', 'Position', 'Company', 'City', 'fuzzy_score', 'job_readiness_score']])

Recommended Jobs:
                                                   Title  \
76038  Medical Technologist - PRN - Kindred Hospital ...   
699                 Accounts Payable Clerk @ Accountemps   
2431   Mental Health Therapist - House of Mercy PT.5 ...   
68596                     Seasonal - M-1 @ Quad Graphics   
81529  Registered Nurse - Emergency Dept.- Per diem @...   
12497  Sales & Inventory Manager (Temp) @ Brook Stree...   
1373   Tax Accountant - Contract @ Ledgent Finance & ...   
73439                  Occupational Therapist @ Amedisys   
8206                 Servers @ Red Robin Gourmet Burgers   
26011           Relief Tech @ Center for Family Services   

                                                Position  \
76038  Medical Technologist - PRN - Kindred Hospital ...   
699                               Accounts Payable Clerk   
2431   Mental Health Therapist - House of Mercy PT.5 ...   
68596                                     Seasonal - M-1   
81529       Registere

In [21]:
import faiss
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from sklearn.preprocessing import normalize

# Assuming df_embeddings is defined and populated with job embeddings and mapped skills

# Load job embeddings and prepare FAISS index
embedding_dimension = len(df_embeddings['embeddings'][0])  # Assuming each embedding is of the same length
index = faiss.IndexFlatIP(embedding_dimension)  # Using inner product (cosine similarity) for FAISS index

# Convert job embeddings to array and normalize them for cosine similarity
job_embeddings = np.array(df_embeddings['embeddings'].tolist()).astype('float32')
faiss.normalize_L2(job_embeddings)  # Normalize job embeddings for FAISS indexing
index.add(job_embeddings)

# Precompute fuzzy scores for all jobs based on extracted skills
def calculate_fuzzy_scores(extracted_skills, mapped_skills):
    scores = []
    for job_skills in mapped_skills:
        try:
            # Ensure job_skills is a valid list or string
            job_skills_list = eval(job_skills) if isinstance(job_skills, str) else job_skills
            if not job_skills_list:  # Check if the list is empty
                scores.append(0)  # Default to 0 if no skills are mapped
                continue
            
            # Calculate fuzzy scores for each skill
            skill_scores = [
                max([fuzz.token_set_ratio(skill, job_skill) for job_skill in job_skills_list])
                for skill in extracted_skills
            ]
            avg_score = np.mean(skill_scores) if skill_scores else 0
            scores.append(avg_score)
        except Exception as e:
            print(f"Error processing job_skills: {job_skills} - {e}")
            scores.append(0)  # Assign 0 in case of any error
    
    return scores

# MMR for diversity in recommendations
def mmr_selection(top_jobs, top_n=10, diversity_param=0.5):
    selected_jobs = []
    selected_indices = []
    
    while len(selected_jobs) < top_n and len(top_jobs) > 0:
        if not selected_jobs:
            # Select the highest scoring job first
            selected_index = top_jobs['job_readiness_score'].idxmax()
        else:
            best_score = -float('inf')
            selected_index = None

            # Compare remaining jobs for diversity
            for idx, job_row in top_jobs.iterrows():
                relevance = job_row['job_readiness_score']
                diversity = max(
                    fuzz.token_set_ratio(job_row['Position'], sel_job['Position']) for sel_job in selected_jobs
                )
                mmr_score = (1 - diversity_param) * relevance - diversity_param * diversity
                if mmr_score > best_score:
                    best_score = mmr_score
                    selected_index = idx
        
        # Add the selected job to the final list
        selected_jobs.append(top_jobs.loc[selected_index])
        selected_indices.append(selected_index)
        top_jobs = top_jobs.drop(selected_index)

    return pd.DataFrame(selected_jobs)

# Recommend jobs based on extracted skills
def recommend_jobs(extracted_skills, df_embeddings, top_n=10, diversity_param=0.5):
    top_jobs = df_embeddings.copy()  # Copy original DataFrame for processing

    # Precompute fuzzy matching scores
    top_jobs['fuzzy_score'] = calculate_fuzzy_scores(extracted_skills, top_jobs['mapped_skills'])

    # Precompute job readiness score
    top_jobs['job_readiness_score'] = (0.7 * top_jobs['fuzzy_score']) + (0.3 * np.random.uniform(0, 100, len(top_jobs)))

    # Sort jobs by job readiness score
    top_jobs = top_jobs.sort_values(by='job_readiness_score', ascending=False)

    # Apply MMR to ensure diversity and get the final recommended jobs
    recommended_jobs = mmr_selection(t4op_jobs, top_n=top_n, diversity_param=diversity_param)

    return recommended_jobs

# Assuming extracted_skills is populated from a previous step (e.g., text extraction)
extracted_skills = ['python', 'data analysis', 'machine learning', 'deep learning']  # Example extracted skills

# Recommend top N jobs based on the extracted skills
top_n = 10  # Number of jobs to recommend
recommended_jobs = recommend_jobs(extracted_skills, df_embeddings, top_n=top_n)

# Output the recommended jobs
print(recommended_jobs[['Title', 'Position', 'Company', 'City', 'job_readiness_score']])

                                                   Title  \
25616  Summer Intern - Data Scientist @ Black Knight ...   
39517                    Cook @ Brightview Senior Living   
30545  LPN @ Carriage Court of Hilliard, A Good Neigh...   
15271                      HHA @ BAYADA HOME HEALTH CARE   
17032                      MSW @ BAYADA HOME HEALTH CARE   
56180  Tutor (366-008) @ Lindamood-Bell Learning Proc...   
73534                         Supervisor @ XPO Last Mile   
18112  Quickbooks Full Charge Bookkeeper @ Ledgent Se...   
69250      Oncology Liaison II @ The US Oncology Network   
76260  Software Engineer @ Beacon Hill Staffing Group...   

                                Position  \
25616     Summer Intern - Data Scientist   
39517                               Cook   
30545                                LPN   
15271                                HHA   
17032                                MSW   
56180                    Tutor (366-008)   
73534                         S