# Non-Tech Skills Matcher

In [None]:
import pandas as pd
from tqdm import tqdm
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

import faiss
import numpy as np
from sklearn.preprocessing import normalize
import pickle

class SkillMatcher:
    """Score how well a candidate's skills cover a company's required skills.

    Both skill lists are first expanded through a Neo4j vector index of
    ``Skill`` nodes, then compared in two passes:

    1. exact matches between the expanded lists, and
    2. partial matches, found by cosine similarity of OpenAI embeddings
       through an in-memory FAISS inner-product index.

    The final score is ``(exact_matches + sum_of_partial_similarities)``
    divided by the number of expanded company skills. Nothing clamps the
    value, so it is not guaranteed to stay within ``[0, 1]``.
    """

    # Minimum Neo4j similarity score for a graph skill to be kept when
    # expanding an input skill.
    EXPANSION_MIN_SCORE = 0.8
    # Minimum cosine similarity for a FAISS hit to count as a partial match.
    PARTIAL_MIN_SCORE = 0.4

    def __init__(self, neo4j_uri, neo4j_username, neo4j_password, openai_model="text-embedding-3-large", genai_model="models/embedding-001"):
        """Create the embedding clients and attach to the Neo4j ``skills`` index.

        Args:
            neo4j_uri: URL of the Neo4j instance.
            neo4j_username: Neo4j user name.
            neo4j_password: Neo4j password.
            openai_model: OpenAI embedding model used for partial matching.
            genai_model: Google Generative AI embedding model used for the
                Neo4j vector index.
        """
        self.openai_embeddings_model = OpenAIEmbeddings(model=openai_model)
        self.genai_embeddings = GoogleGenerativeAIEmbeddings(model=genai_model)
        self.neo4j_uri = neo4j_uri
        self.neo4j_username = neo4j_username
        self.neo4j_password = neo4j_password
        # NOTE(review): presumably this contacts the database at construction
        # time -- confirm against the Neo4jVector documentation.
        self.vector_index = Neo4jVector.from_existing_graph(
            self.genai_embeddings,
            url=self.neo4j_uri,
            username=self.neo4j_username,
            password=self.neo4j_password,
            index_name='skills',
            node_label="Skill",
            text_node_properties=['skill_name', 'description'],
            embedding_node_property='embedding',
        )

    def get_openai_embeddings(self, texts):
        """Embed ``texts`` with the OpenAI model and return them as an array.

        Args:
            texts: List of strings to embed.

        Returns:
            ``numpy.ndarray`` built from the list of embedding vectors.
        """
        embeddings = self.openai_embeddings_model.embed_documents(texts)
        return np.array(embeddings)

    def build_faiss_index(self, index_list):
        """Build an inner-product FAISS index over the embeddings of ``index_list``.

        Vectors are L2-normalised first, so inner product equals cosine
        similarity.

        Args:
            index_list: Non-empty list of strings to index.

        Returns:
            A ``faiss.IndexFlatIP`` holding one vector per entry, in order.

        Raises:
            ValueError: If ``index_list`` is empty (no dimension to infer).
        """
        if not index_list:
            raise ValueError("index_list must contain at least one entry")
        index_vectors = self.get_openai_embeddings(index_list)
        index_vectors = normalize(index_vectors, norm='l2')
        # FAISS stores float32; cast explicitly instead of relying on the
        # wrapper to convert the float64 array produced above.
        index_vectors = np.ascontiguousarray(index_vectors, dtype=np.float32)
        dimension = index_vectors.shape[1]
        index = faiss.IndexFlatIP(dimension)
        index.add(index_vectors)
        return index

    def save_index_to_memory(self, faiss_index):
        """Return ``faiss_index`` serialised via ``faiss.serialize_index``."""
        return faiss.serialize_index(faiss_index)

    def load_index_from_memory(self, serialized_index):
        """Rebuild an index from the output of :meth:`save_index_to_memory`."""
        return faiss.deserialize_index(serialized_index)

    def fetch_top_k_results(self, faiss_index, index_list, query, k=5):
        """Return up to ``k`` entries of ``index_list`` most similar to ``query``.

        Args:
            faiss_index: Index built by :meth:`build_faiss_index` from
                ``index_list``.
            index_list: The strings the index was built from, in the same order.
            query: Text to search for.
            k: Maximum number of neighbours to return.

        Returns:
            List of ``(entry, similarity)`` tuples in the order FAISS returns
            them. Fewer than ``k`` tuples are returned when the index holds
            fewer than ``k`` vectors.
        """
        query_vector = self.get_openai_embeddings([query])[0].reshape(1, -1)
        query_vector = normalize(query_vector, norm='l2')
        query_vector = np.ascontiguousarray(query_vector, dtype=np.float32)
        distances, top_indices = faiss_index.search(query_vector, k)
        # FAISS pads missing neighbours with label -1; without this filter
        # index_list[-1] would silently return the last entry.
        return [
            (index_list[i], float(dist))
            for i, dist in zip(top_indices[0], distances[0])
            if i >= 0
        ]

    def fetch_partial_score(self, expanded_skill_lst_1, expanded_skill_lst_2):
        """Sum the similarities of partial matches between two skill lists.

        Every skill in ``expanded_skill_lst_2`` is searched against an index
        of ``expanded_skill_lst_1``; each of its two nearest neighbours whose
        similarity exceeds ``PARTIAL_MIN_SCORE`` contributes that similarity.

        Args:
            expanded_skill_lst_1: Skills to index (the company side).
            expanded_skill_lst_2: Skills to look up (the candidate side).

        Returns:
            The summed similarity as a float; ``0.0`` when either list is
            empty.
        """
        if not expanded_skill_lst_1 or not expanded_skill_lst_2:
            # Nothing to index or nothing to look up: no partial matches.
            return 0.0

        index_list = expanded_skill_lst_1
        # The index is used directly; serialising and immediately
        # deserialising it added work without changing the result.
        faiss_index = self.build_faiss_index(index_list)

        partial_score_sum = 0.0
        for skill in tqdm(expanded_skill_lst_2, desc="finding partial match"):
            results = self.fetch_top_k_results(faiss_index, index_list, skill, 2)
            for _, similarity in results:
                if similarity > self.PARTIAL_MIN_SCORE:
                    partial_score_sum += similarity
        return partial_score_sum

    @staticmethod
    def _skill_name_from_page_content(page_content):
        """Extract the value on the second line of ``page_content``.

        NOTE(review): presumably the content is rendered as
        ``"\\nskill_name: <name>\\ndescription: <text>"`` -- verify against the
        Neo4jVector retrieval query.

        Returns:
            The text after the first ``:`` on the second line with
            surrounding spaces stripped (the whole line if it has no ``:``),
            or ``None`` when there is no second line.
        """
        lines = page_content.split('\n')
        if len(lines) < 2:
            return None
        label, separator, value = lines[1].partition(':')
        # Split on the FIRST colon only so names that contain ':' stay intact.
        return (value if separator else label).strip(' ')

    def expand_skill_lst(self, name, skill_lst):
        """Replace each input skill by its close matches in the Neo4j index.

        Args:
            name: Label shown in the progress bar.
            skill_lst: Skills to expand.

        Returns:
            Flat list of skill names whose similarity score exceeds
            ``EXPANSION_MIN_SCORE`` (at most five per input skill). May
            contain duplicates.
        """
        expanded_skill_lst = []
        for skill in tqdm(skill_lst, desc=f"expanding {name}"):
            result = self.vector_index.similarity_search_with_score(skill, k=5)
            for document, score in result:
                if score <= self.EXPANSION_MIN_SCORE:
                    continue
                skill_name = self._skill_name_from_page_content(document.page_content)
                if skill_name is not None:
                    expanded_skill_lst.append(skill_name)
        return expanded_skill_lst

    def exact_match_calculator(self, expanded_company_skill_lst, expanded_candidate_skill_lst):
        """Count exact matches and strip them from both lists.

        Args:
            expanded_company_skill_lst: Expanded company skills.
            expanded_candidate_skill_lst: Expanded candidate skills.

        Returns:
            Tuple ``(remaining_company, remaining_candidate, numerator_count,
            denominator_count)``. The remaining lists are de-duplicated and in
            arbitrary order; ``numerator_count`` is the number of company
            entries found in the candidate list and ``denominator_count`` is
            the length of the company list as passed in.
        """
        denominator_count = len(expanded_company_skill_lst)
        # Set membership keeps the matching pass linear in the input size.
        candidate_skills = set(expanded_candidate_skill_lst)
        matched_skills = [
            skill for skill in expanded_company_skill_lst if skill in candidate_skills
        ]
        numerator_count = len(matched_skills)

        matched_set = set(matched_skills)
        expanded_company_skill_lst = list(set(expanded_company_skill_lst) - matched_set)
        expanded_candidate_skill_lst = list(candidate_skills - matched_set)
        return expanded_company_skill_lst, expanded_candidate_skill_lst, numerator_count, denominator_count

    def score_calculator(self, company_skill_lst, candidate_skill_lst):
        """Compute the match score of a candidate against a company's skills.

        Args:
            company_skill_lst: Skills required by the company.
            candidate_skill_lst: Skills offered by the candidate.

        Returns:
            ``(exact matches + partial similarity sum) / expanded company
            skills``, or ``0.0`` when the company list expands to nothing.
        """
        expanded_company_skill_lst = list(set(self.expand_skill_lst('company_skill_lst', company_skill_lst)))
        expanded_candidate_skill_lst = list(set(self.expand_skill_lst('candidate_skill_lst', candidate_skill_lst)))

        expanded_company_skill_lst, expanded_candidate_skill_lst, numerator_count, denominator_count = self.exact_match_calculator(expanded_company_skill_lst, expanded_candidate_skill_lst)
        if denominator_count == 0:
            # No company skill survived expansion; avoid dividing by zero.
            return 0.0

        partial_match_score_sum = self.fetch_partial_score(expanded_company_skill_lst, expanded_candidate_skill_lst)

        final_score = (numerator_count + partial_match_score_sum) / denominator_count
        return final_score


In [None]:
 company_skill_lst  = [
    "Financial analysis",
    "Investment management",
    "Portfolio management",
    "Risk management",
    "Asset allocation",
    "Equity research",
    "Fixed income analysis",
    "Quantitative analysis",
    "Performance measurement",
    "Investment strategy development",
    "Market analysis",
    "Financial modeling",
    "Valuation techniques",
    "Investment selection",
    "Asset pricing"
]

candidate_skill_lst = [
    "Financial analysis",
    "Investment management",
    "Risk management skills",
    "Asset allocation",
    "Equity research",
    "Quantitative analysis expert",
    "Investment strategy development",
    "Market analysis",
    "expert in Financial modeling",
    "Good in Valuation techniques",
    "Asset pricing"
]


final_score = score_calculator(company_skill_lst,candidate_skill_lst,uri,user,password)
final_score