In [1]:
!pip install pandas



In [2]:
%%writefile download_huggingface_model.py

from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint to fetch and the local folder it is stored in.
MODEL_NAME = 'all-MiniLM-L6-v2'
LOCAL_MODEL_DIR = './model/'

# Instantiate the pretrained model, then persist it to the local directory.
model = SentenceTransformer(MODEL_NAME)
model.save(LOCAL_MODEL_DIR)

Overwriting download_huggingface_model.py


In [3]:
%%writefile create_embeddings.py


import pickle
from sentence_transformers import SentenceTransformer

# Input / output locations used by this script.
skill_list_file = r'./master_skills_list.txt' # skill names from ChatGPT
skill_emb_file = r'./master_emb_list.pkl' # output file
model_path = r'./model' # the path to the downloaded model

# Load the model that was previously saved into the model directory.
model = SentenceTransformer(model_path)

# One skill name per line; only the newline character is removed so that the
# position of every entry stays aligned with the rows of the embedding matrix.
with open(skill_list_file, 'r') as skills_fh:
    master_skills_list = [raw_line.replace("\n", "") for raw_line in skills_fh.readlines()]

# Encode every skill name and persist the resulting embeddings as a pickle.
master_skill_embs = model.encode(master_skills_list)
with open(skill_emb_file, 'wb') as emb_fh:
    pickle.dump(master_skill_embs, emb_fh)


Overwriting create_embeddings.py


In [4]:
%%writefile recommend_without_cloudrun.py

import os
import sys
import pandas as pd
import pypdf
import pickle
import re
from sentence_transformers import SentenceTransformer, util
import torch
import werkzeug
import numpy as np


# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pre-trained Hugging Face model and place it on the chosen device.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embedder.to(device)

def read_pdf_text(pdf_path):
    '''
    Read a PDF from disk and return the text of all its pages as one string.

    The text extracted from each page is concatenated in page order with no
    separator added between pages.
    '''
    with open(pdf_path, 'rb') as pdf_handle:
        reader = pypdf.PdfReader(pdf_handle)
        # Extraction happens while the file handle is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return ''.join(page_texts)


def cut_and_clean(string):
    '''
    Split raw text into short snippets and normalise each one.

    The text is split on newlines and periods; pieces of four characters or
    fewer are dropped. Each remaining piece has its digits and special
    characters removed, whitespace collapsed and is lowercased. Only cleaned
    pieces with more than three words are returned, in their original order.
    '''
    def _normalise(piece):
        # Strip every digit character first.
        without_digits = ''.join(ch for ch in piece if not ch.isdigit())
        # Replace anything that is not a letter, digit, space, newline, period or comma.
        without_specials = re.sub(r'[^a-zA-Z0-9 \n\.,]', ' ', without_digits)
        # Collapse runs of whitespace and lowercase the result.
        return " ".join(without_specials.split()).lower()

    raw_pieces = re.split(r'\n|\.', string)
    cleaned_pieces = (_normalise(piece) for piece in raw_pieces if len(piece) > 4)
    return [piece for piece in cleaned_pieces if len(piece.split()) > 3]


def match_snippets(snippets, master_phrase_embs, master_phrase_list, top_k):
    '''
    Match a list of short phrases to a set of phrase embeddings.

    Each snippet is stripped, encoded with the module-level ``embedder`` and
    compared to ``master_phrase_embs`` by cosine similarity; the ``top_k``
    highest scoring entries of ``master_phrase_list`` are kept per snippet.

    Parameters:
        snippets: iterable of text snippets to match.
        master_phrase_embs: tensor of phrase embeddings, one row per entry of
            ``master_phrase_list``.
        master_phrase_list: phrases, indexed like the rows of ``master_phrase_embs``.
        top_k: maximum number of phrases to keep per snippet; it is capped at
            the number of available phrases.

    Returns:
        A DataFrame with the columns 'Phrase' and 'Score' and a fresh integer
        index. The columns are present even when there is nothing to match,
        so callers can always filter on 'Score'.
    '''
    # torch.topk raises when k exceeds the number of candidates, so cap it.
    k = min(top_k, len(master_phrase_list))

    # Collect plain records and build the frame once instead of concatenating
    # a DataFrame on every iteration.
    records = []
    for query in snippets:
        query_embedding = embedder.encode(query.strip(), convert_to_tensor=True, device=device)
        cos_scores = util.cos_sim(query_embedding, master_phrase_embs)[0]
        top_results = torch.topk(cos_scores, k=k)
        for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
            records.append((master_phrase_list[idx], score))

    return pd.DataFrame(records, columns=['Phrase', 'Score'])



def main(input_file, master_skills_emb_binary, master_skills_list, top_k):
    '''
    Write skill suggestions for a CV to a CSV file next to the input file.

    The pickled skill embeddings and the skill names are loaded, the PDF is
    read and cut into snippets, and every snippet is matched against the
    skills. Matches scoring below 0.5 are discarded; the rest are sorted by
    score (highest first), de-duplicated per skill and saved as
    ``<input file without extension>_skill_suggestions.csv``.
    '''
    with open(master_skills_emb_binary, 'rb') as emb_fh:
        raw_embs = pickle.load(emb_fh)

    # The pickle holds an array; turn it into a tensor on the active device.
    master_phrase_embs = torch.tensor(raw_embs).to(device)

    with open(master_skills_list, 'r') as names_fh:
        master_phrase_list = [row.replace("\n", "") for row in names_fh.readlines()]

    cv_snippets = cut_and_clean(read_pdf_text(input_file))
    matches = match_snippets(cv_snippets, master_phrase_embs, master_phrase_list, top_k=top_k)

    suggestions = (
        matches[matches['Score'] >= 0.5]
        .sort_values('Score', ascending=False)
        .drop_duplicates(subset='Phrase')
        .reset_index(drop=True)
        .rename(columns={'Phrase': 'Skill'})
    )

    output_stem = os.path.splitext(input_file)[0]
    suggestions.to_csv(output_stem + '_skill_suggestions.csv', index=False)


if __name__ == "__main__":
    import argparse

    # When started through an IPython kernel, drop the kernel connection file
    # (the *.json argument) so argparse does not see it.
    if 'ipykernel_launcher.py' in sys.argv[0]:
        sys.argv = [arg for arg in sys.argv if not arg.endswith('.json')]

    parser = argparse.ArgumentParser()
    # The input file is mandatory: without it main() would fail later with an
    # unhelpful TypeError when trying to open None.
    parser.add_argument('--input_file', required=True)
    parser.add_argument('--master_skills_emb_binary', required=False, default=r'./master_emb_list.pkl')
    parser.add_argument('--master_skills_list', required=False, default=r'./master_skills_list.txt')
    parser.add_argument('--top_k', required=False, default=5, type=int)  # Ensure top_k is parsed as an integer
    args = parser.parse_args()
    main(
        input_file=args.input_file,
        master_skills_emb_binary=args.master_skills_emb_binary,
        master_skills_list=args.master_skills_list,
        top_k=args.top_k,
    )


Overwriting recommend_without_cloudrun.py


In [12]:
%%writefile main.py

import pandas as pd
import pypdf
import pickle
import re
from sentence_transformers import SentenceTransformer, util
import torch
from flask import Flask, request, json, Response
from flask_restx import Api, Resource, fields, abort
import werkzeug
import numpy as np

# Flask application and the flask-restx API wrapper around it.
app = Flask(__name__)
api = Api(app)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Request parser for the endpoint: a single required uploaded file named 'file'.
file_upload_parser = api.parser()
file_upload_parser.add_argument('file', location='files', type=werkzeug.datastructures.FileStorage, required=True)

# The query embedder must be the same model that produced master_emb_list.pkl.
# create_embeddings.py encodes the skills with the 'all-MiniLM-L6-v2' model saved
# under ./model, so the same model is used here; embeddings from a different
# model live in a different vector space and give meaningless cosine scores.
embedder = SentenceTransformer('all-MiniLM-L6-v2').to(device)

def read_pdf_text(file_path):
    '''
    Return the text of every page of the PDF at ``file_path`` as one string.

    Page texts are concatenated in page order without any separator.
    '''
    reader = pypdf.PdfReader(file_path)
    return ''.join(page.extract_text() for page in reader.pages)

def cut_and_clean(text):
    '''
    Split text into one snippet per line.

    The text is split on runs of newlines; every piece is stripped of
    surrounding whitespace and pieces that end up empty are dropped.
    '''
    stripped_lines = (line.strip() for line in re.split(r'\n+', text))
    return [line for line in stripped_lines if line]

def match_snippets(snippets, master_phrase_embs, master_phrase_list, top_k=5):
    '''
    Match text snippets against a set of phrase embeddings.

    All snippets are encoded in one batch with the module-level ``embedder``
    and compared to ``master_phrase_embs`` by cosine similarity. For every
    snippet the ``top_k`` best scoring phrases are kept.

    Parameters:
        snippets: list of text snippets.
        master_phrase_embs: tensor of phrase embeddings, one row per entry of
            ``master_phrase_list``.
        master_phrase_list: phrases, indexed like the rows of ``master_phrase_embs``.
        top_k: maximum number of phrases per snippet; capped at the number of
            available phrases.

    Returns:
        A DataFrame with the columns 'Snippet', 'Phrase' and 'Score'. The
        columns are present even when there are no snippets, so callers can
        always filter on 'Score'.
    '''
    columns = ['Snippet', 'Phrase', 'Score']

    # Nothing to encode: return an empty, correctly shaped frame instead of
    # passing an empty batch to the model.
    if not snippets:
        return pd.DataFrame(columns=columns)

    # Embed snippets
    snippet_embs = embedder.encode(snippets, convert_to_tensor=True, device=device)

    # Calculate cosine similarity
    cos_scores = util.pytorch_cos_sim(snippet_embs, master_phrase_embs)

    # Limit top_k to the number of available skills
    top_k = min(top_k, len(master_phrase_list))
    top_results = torch.topk(cos_scores, k=top_k, dim=1)

    # Convert the results to plain Python lists once, not once per snippet.
    top_scores = top_results.values.tolist()
    top_indices = top_results.indices.tolist()

    recommendations = [
        {'Snippet': snippet, 'Phrase': master_phrase_list[phrase_idx], 'Score': score}
        for snippet, scores, indices in zip(snippets, top_scores, top_indices)
        for score, phrase_idx in zip(scores, indices)
    ]

    return pd.DataFrame(recommendations, columns=columns)

@api.route('/skills_from_cv')
class SkillsFromCV(Resource):
    '''
    REST resource that suggests skills for an uploaded CV in PDF form.
    '''

    @api.expect(file_upload_parser)
    def post(self, top_k=5):
        '''
        Extract text from the uploaded PDF and return matching skills.

        The upload is read from the 'file' form field. Its snippets are matched
        against the stored skill embeddings; matches scoring below 0.3 are
        dropped, the rest are sorted by score (highest first) and de-duplicated
        per skill.

        Returns:
            A dict ``{'recommendations': [...]}`` whose entries are records of
            the matches, with the 'Phrase' column renamed to 'Skill'. The list
            is empty when nothing matched.
        '''
        # Local imports: these modules are only needed for handling the upload.
        import os
        import tempfile

        args = file_upload_parser.parse_args()
        input_file = args['file']

        master_skills_emb_binary = r'./master_emb_list.pkl'
        master_skills_list = r'./master_skills_list.txt'

        # NOTE: pickle.load can execute arbitrary code; this file must only ever
        # be the artifact produced by create_embeddings.py, never user input.
        with open(master_skills_emb_binary, 'rb') as f:
            master_phrase_embs = pickle.load(f)
        with open(master_skills_list, 'r') as f:
            lines = f.readlines()
            master_phrase_list = [l.replace("\n", "") for l in lines]

        # Convert numpy array to PyTorch tensor and move to the appropriate device
        master_phrase_embs = torch.tensor(master_phrase_embs).to(device)

        # Store the upload in a per-request temporary directory so concurrent
        # requests cannot overwrite each other's file, and so the uploaded CV
        # is removed again once its text has been extracted.
        with tempfile.TemporaryDirectory() as upload_dir:
            pdf_path = os.path.join(upload_dir, 'file.pdf')
            input_file.save(pdf_path)
            file_text = read_pdf_text(pdf_path)

        cv_snippets = cut_and_clean(file_text)
        skill_recommendation = match_snippets(cv_snippets, master_phrase_embs, master_phrase_list, top_k=top_k)

        # No snippets / no matches: there may be no 'Score' column to filter on.
        if skill_recommendation.empty:
            return {'recommendations': []}

        skill_recommendation = skill_recommendation[skill_recommendation['Score'] >= 0.3]
        skill_recommendation = skill_recommendation.sort_values('Score', ascending=False)
        skill_recommendation = skill_recommendation.drop_duplicates(subset='Phrase').reset_index(drop=True)
        skill_recommendation = skill_recommendation.rename(columns={'Phrase': 'Skill'})
        # Replace NaN with None in the frame before converting it to records.
        skill_recommendation = skill_recommendation.replace({np.nan: None})

        response = {'recommendations': skill_recommendation.to_dict(orient='records')}
        return response

if __name__ == '__main__':
    import os

    # The server listens on all interfaces, so the interactive debugger must not
    # be on by default: it allows arbitrary code execution for anyone who can
    # reach the port. Enable it explicitly with FLASK_DEBUG=1 during development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')

    # Honour a PORT environment variable when present; default stays 8080.
    port = int(os.environ.get('PORT', '8080'))

    app.run(debug=debug_enabled, host='0.0.0.0', port=port)



Overwriting main.py
