In [136]:
import pandas as pd
from tqdm import tqdm
from neo4j import GraphDatabase
import os
from dotenv import load_dotenv

load_dotenv()
uri = os.getenv('uri')
user = os.getenv('user')
password = os.getenv('password')

driver= GraphDatabase.driver(uri, auth=(user, password))

### Loading ESCO datasets

In [143]:
occupations = pd.read_csv('ESCO_data/occupations.csv',dtype=str).fillna('').map(lambda x:x.lower())
isco_groups = pd.read_csv('ESCO_data/ISCOGroups.csv',dtype=str).fillna('').map(lambda x:x.lower())
occupations_hierarchy = pd.read_csv('ESCO_data/occupations_hierarchy.csv',dtype=str).fillna('').map(lambda x:x.lower())
skills_data = pd.read_csv('ESCO_data/skills.csv',dtype=str).fillna('').map(lambda x:x.lower())
skills_occupations_data = pd.read_csv('ESCO_data/occupation_skill_relations.csv',dtype=str).fillna('').map(lambda x:x.lower())

skills_occupations_data.sample(5)

Unnamed: 0,OCCUPATIONTYPE,OCCUPATIONID,RELATIONTYPE,SKILLID
29581,escooccupation,key_15911,essential,key_7329
33174,escooccupation,key_16007,essential,key_5809
47087,escooccupation,key_16318,essential,key_4428
49047,escooccupation,key_16360,optional,key_1951
62530,escooccupation,key_16690,optional,key_3388


## Ingesting data

In [1]:
from neo4j import GraphDatabase
from tqdm import tqdm

class Neo4jHandler:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def run_query(self, query):
        with self.driver.session() as session:
            result = session.run(query)
            return [record.data() for record in result]

    def create_occupation_nodes(self, df):
        failed_rows = []
        for _, row in tqdm(df.iterrows()):
            try:
                query = f"""
                MERGE (c:Occupation {{key_id: '{row['ID']}'}})  
                ON CREATE SET 
                  c.occupation_name = "{row['PREFERREDLABEL']}",
                  c.origin_url = coalesce('{row['ORIGINURI']}', ""),
                  c.code = coalesce('{row['CODE']}', ""),
                  c.description = coalesce("{row['DESCRIPTION'].replace('"','')}", "")
                """
                self.run_query(query)
            except Exception as e:
                print(f"Failed to process row: {row}, Error: {e}")
                failed_rows.append(row)
        return failed_rows

    def create_isco_group_nodes(self, df):
        failed_rows = []
        for _, row in tqdm(df.iterrows()):
            try:
                query = f"""
                MERGE (c:ISCO_group {{key_id: '{row['ID']}'}})  
                ON CREATE SET 
                  c.occupation_name = "{row['PREFERREDLABEL']}",
                  c.origin_url = coalesce('{row['ORIGINURI']}', ""),
                  c.code = coalesce('{row['CODE']}', ""),
                  c.description = coalesce("{row['DESCRIPTION'].replace('"','')}", "")
                """
                self.run_query(query)
            except Exception as e:
                print(f"Failed to process row: {row}, Error: {e}")
                failed_rows.append(row)
        return failed_rows

    def create_isco_hierarchy(self, df):
        failed_rows = []
        for _, row in tqdm(df.iterrows()):
            try:
                query = f"""
                MATCH (a:ISCO_group {{key_id: '{row['PARENTID']}'}}), (b:ISCO_group {{key_id: '{row['CHILDID']}'}})
                MERGE (a)<-[r:HAS_PARENT_OCCUPATION]-(b)
                """
                self.run_query(query)
            except Exception as e:
                print(f"Failed to process row: {row}, Error: {e}")
                failed_rows.append(row)
        return failed_rows

    def create_occupation_isco_hierarchy(self, df):
        failed_rows = []
        for _, row in tqdm(df.iterrows()):
            try:
                query = f"""
                MATCH (a:ISCO_group {{key_id: '{row['PARENTID']}'}}), (b:Occupation {{key_id: '{row['CHILDID']}'}})
                MERGE (a)<-[r:HAS_PARENT_OCCUPATION]-(b)
                """
                self.run_query(query)
            except Exception as e:
                print(f"Failed to process row: {row}, Error: {e}")
                failed_rows.append(row)
        return failed_rows

    def create_occupation_hierarchy(self, df):
        failed_rows = []
        for _, row in tqdm(df.iterrows()):
            try:
                query = f"""
                MATCH (a:Occupation {{key_id: '{row['PARENTID']}'}}), (b:Occupation {{key_id: '{row['CHILDID']}'}})
                MERGE (a)<-[r:HAS_PARENT_OCCUPATION]-(b)
                """
                self.run_query(query)
            except Exception as e:
                print(f"Failed to process row: {row}, Error: {e}")
                failed_rows.append(row)
        return failed_rows

    def create_skill_nodes(self, df):
        failed_rows = []
        for _, row in tqdm(df.iterrows()):
            try:
                query = f"""
                MERGE (c:Skill {{key_id: '{row['ID']}'}})  
                ON CREATE SET 
                  c.skill_name = "{row['PREFERREDLABEL']}",
                  c.origin_url = coalesce('{row['ORIGINURI']}', ""),
                  c.skill_type = coalesce('{row['SKILLTYPE']}', ""),
                  c.description = coalesce("{row['DESCRIPTION'].replace('"','')}", "")
                """
                self.run_query(query)
            except Exception as e:
                print(f"Failed to process row: {row}, Error: {e}")
                failed_rows.append(row)
        return failed_rows

    def create_skill_occupation_relation(self, df):
        failed_rows = []
        for _, row in tqdm(df.iterrows()):
            try:
                query = f"""
                MATCH (a:Occupation {{key_id: '{row['OCCUPATIONID']}'}}), (b:Skill {{key_id: '{row['SKILLID']}'}})
                MERGE (a)-[r:HAS_SKILL]->(b)
                SET r.type = '{row['RELATIONTYPE']}'
                """
                self.run_query(query)
            except Exception as e:
                print(f"Failed to process row: {row}, Error: {e}")
                failed_rows.append(row)
        return failed_rows


In [None]:
## ingesting Occupations
neo4j_handler = Neo4jHandler(uri, user, password)

failed_rows_occupation = neo4j_handler.create_occupation_nodes(occupations)

failed_rows_isco_group = neo4j_handler.create_isco_group_nodes(isco_groups) 

failed_rows_isco_group_hierarchy = neo4j_handler.create_isco_hierarchy(occupations_hierarchy) 
failed_rows_occupation_isco_hierarchy = neo4j_handler.create_occupation_isco_hierarchy(occupations_hierarchy)
failed_rows_occupation_hierarchy = neo4j_handler.create_occupation_hierarchy(occupations_hierarchy)

failed_row_skill = neo4j_handler.create_skill_nodes(skills_data)
failed_rows_skill_occupation_relation = neo4j_handler.create_skill_occupation_relation(skills_occupations_data)
neo4j_handler.close()

## Vector index creation on skill 

In [3]:
from langchain.vectorstores.neo4j_vector import Neo4jVector

## If OPENAI key is present
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = "text-embedding-3-small" 

## If Google API key is present 
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [7]:
graph_vector_index = Neo4jVector.from_existing_graph(
    embedding = embeddings,
    url=uri,
    username=user,
    password=password,
    index_name='skills',
    node_label="Skill",
    text_node_properties=['skill_name','description'],
    embedding_node_property='embedding',
)

In [59]:
hr_topics = [
    "Employee relations management",
    "Recruitment and selection processes",
    "Performance management systems",
    "Compensation and benefits administration",
    "Labor law and employment regulations",
    "Training and development programs",
    "Conflict resolution and negotiation skills"
]


sales_skills = [
    "Customer relationship management (CRM)",
    "Sales strategies and techniques",
    "Negotiation and persuasion",
    "Product knowledge and presentation",
    "Market and competitor analysis",
    "Closing sales and follow-ups",
    "Communication and interpersonal skills"
]

topics = [
    "Financial analysis",
    "Investment management",
    "Portfolio management",
    "Risk management",
    "Asset allocation",
    "Equity research",
    "Fixed income analysis",
    "Quantitative analysis",
    "Performance measurement",
    "Investment strategy development",
    "Market analysis",
    "Financial modeling",
    "Valuation techniques",
    "Investment selection",
    "Asset pricing"
]

# topics = [
#     "Financial analysis"]

In [62]:
total_skills = []
for skill in topics:
    skill_lst = []
    result = vector_index.similarity_search(skill, k=3)
    skill_lst = [val.page_content.split('\n')[1].split(':')[-1].strip(' ') for val in result]
    total_skills.extend(skill_lst)

In [63]:
total_skills = list(set(total_skills))
total_skills

# topics = [
#     "Financial analysis",
#     "Investment management",
#     "Portfolio management",
#     "Risk management",
#     "Asset allocation",
#     "Equity research",
#     "Fixed income analysis",
#     "Quantitative analysis",
#     "Performance measurement",
#     "Investment strategy development",
#     "Market analysis",
#     "Financial modeling",
#     "Valuation techniques",
#     "Investment selection",
#     "Asset pricing"
# ]


['economics',
 'follow a brief',
 'job market offers',
 'risk management',
 'manage financial risk',
 'manage several projects',
 'implement hedging strategies for clients',
 'parts pricing',
 'monitor stock market',
 'make investment decisions',
 'monitor bond market',
 'market analysis',
 'manage commercial risks',
 'perform market research',
 'market participants',
 'meet productivity targets',
 'games rules',
 'manage securities trading',
 'cloud monitoring and reporting',
 'operate financial instruments',
 'assist in fund management',
 'manage profitability',
 'advise on investment',
 'review investment portfolios',
 'perform financial analysis on price strategies',
 'analyse experimental laboratory data']

In [49]:
neo4j_handler = Neo4jHandler(uri, user, password)
occupation_dict = {}
non_matched_skills = []
for skill in total_skills:
    
    try:
        query = f'''MATCH (s:Skill {{skill_name:'{skill}'}})<-[:HAS_SKILL]-(o:Occupation) RETURN DISTINCT(o.occupation_name)'''
        result = neo4j_handler.run_query(query)
        occupation_lst = []
        occupation_lst = [val['(o.occupation_name)'] for val in result]
        
        for occupation in occupation_lst:
            if occupation in occupation_dict.keys():
                occupation_dict[occupation] += 1
            elif occupation not in occupation_dict.keys():
                occupation_dict[occupation] = 1
    except:
        non_matched_skills.append(skill)


In [50]:
top_3_jobs = sorted(occupation_dict.items(), key=lambda x: x[1], reverse=True)[:5]

# Print the top 3 keys and their values
for job, count in top_3_jobs:
    print(f"{job}: {count}")

human resources manager: 7
service manager: 6
corporate training manager: 5
marketing manager: 5
special-interest groups' official: 4


In [51]:
query = f'''MATCH (o:Occupation{{occupation_name:"human resources manager"}}) RETURN DISTINCT(o.description)'''
result = neo4j_handler.run_query(query)

In [52]:
result

[{'(o.description)': "human resources managers plan, design and implement processes related to the human capital of companies. they develop programs for recruiting, interviewing, and selecting employees based on a previous assessment of the profile and skills required in the company. moreover, they manage compensation and development programs for the company's employees comprising trainings, skill assessment and yearly evaluations, promotion, expat programs, and general assurance of the well-being of the employees in the workplace."}]

## Comparing Non-Tech skills

In [16]:
import os
import faiss
from uuid import uuid4
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "AIzaSyDi1T5Lwry7oKpl2htCsfoqfWwEJt4A1BE"

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

faiss_vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [17]:
documents = [Document(page_content=val) for val in total_skills]
uuids = [str(uuid4()) for _ in range(len(documents))]

faiss_vector_store.add_documents(documents=documents, ids=uuids)

['5fceef78-3ac5-47c6-b8c2-8542b1661cd8',
 '117a9e1b-1a81-4561-bc29-e5a05fad31f5',
 '39412f6e-e51f-40a1-9a0e-b631c3aa7698',
 'a39ca2d1-f061-43db-8fa9-14feb0a1c084',
 'bedf65e8-4a47-4f1b-8613-752507f245e6',
 'f85950a3-cdbc-4cb2-a606-823d350180e6',
 'e0a73e10-7709-4c6e-b8b6-e4aa9d060983',
 'aaf33c22-4877-4bdf-bb97-55a20f43bfa7',
 'cdd07443-859b-4ed5-b36c-e0ad8dedbb9e',
 '6381e1c6-e604-4bda-a91f-86e2315ea64e',
 '88307d49-4c2d-46dd-9907-54ca5a428171',
 '2335f78d-d8f2-45b2-bd80-82e57d22b736',
 '54ace2bf-919e-4f05-8723-b5319fe6d88a',
 '93db8c89-ccbc-4bec-98af-3a90a606314d',
 'e9e995db-e048-44f0-ad6c-e92260ad152b',
 '0bdf895f-16e2-4073-9956-e90173609f6f',
 'e47b3037-703f-455c-9253-109c5f3117ce',
 'c8049fb9-fec8-43f2-a08b-7dff766afcee',
 '8d83d16c-6e38-41cb-8065-7bafaed2febf',
 '4e60471e-c781-4e22-b227-9758f2ecfbd2',
 '3dc4ae99-3e31-4db4-8415-67ea9688c879',
 'bab56166-e5b1-4060-92d7-1f4073c8504e',
 '95d73d65-5892-4259-8717-28876f763f02',
 '0e413d13-55b2-4695-bae6-e46085bedf29',
 '8439a0f3-e9ae-

In [20]:
results = faiss_vector_store.similarity_search_with_score(
    "perform market ", k=2
)
for res, score in results:
    print(f" {res.page_content} , Score: {score:3f}")

 market analysis , Score: 0.126280
 market participants , Score: 0.136373


In [24]:
 skills_lst_1  = [
    "Financial analysis",
    "Investment management",
    "Portfolio management",
    "Risk management",
    "Asset allocation",
    "Equity research",
    "Fixed income analysis",
    "Quantitative analysis",
    "Performance measurement",
    "Investment strategy development",
    "Market analysis",
    "Financial modeling",
    "Valuation techniques",
    "Investment selection",
    "Asset pricing"
]

skills_lst_2 = [
    "Financial analysis",
    "Investment management",
    "Risk management skills",
    "Asset allocation",
    "Equity research",
    "Quantitative analysis expert",
    "Investment strategy development",
    "Market analysis",
    "expert in Financial modeling",
    "Good in Valuation techniques",
    "Asset pricing"
]


In [25]:
import os
import faiss
from uuid import uuid4
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

In [26]:


def expand_skill_lst(skill_lst):
    expanded_skill_lst = []
    for skill in skills_lst_1:
        skill_lst = []
        result = vector_index.similarity_search(skill, k=4)
        skill_lst = [val.page_content.split('\n')[1].split(':')[-1].strip(' ') for val in result]
        total_skills.extend(skill_lst)

    expanded_skill_lst = list(set(total_skills))
    return expanded_skill_lst

expanded_skill_lst_1 = expand_skill_lst(skills_lst_1)
expanded_skill_lst_2 = expand_skill_lst(skills_lst_2)

In [27]:
numerator_count = 0
denominator_count = 0
for skill_1 in expanded_skill_lst_1:
    if skill_1 in expanded_skill_lst_2:
        numerator_count += 1
        denominator_count += 1
        expanded_skill_lst_1.remove(skill_1)
        expanded_skill_lst_2.remove(skill_1)
        continue

print(numerator_count,' ',denominator_count)

17   17


In [29]:
def score_normalizer(val: float) -> float:
    return 1 - 1 / (1 + np.exp(val))

##using google_api
def faiss_index_constructor(skill_lst):
    
    if "GOOGLE_API_KEY" not in os.environ:
        os.environ["GOOGLE_API_KEY"] = "AIzaSyDi1T5Lwry7oKpl2htCsfoqfWwEJt4A1BE"
    
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    
    # index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
    
    faiss_vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    documents = [Document(page_content=val) for val in total_skills]
    uuids = [str(uuid4()) for _ in range(len(documents))]
    print(documents)
    faiss_vector_store.add_documents(documents=documents, ids=uuids)

    return faiss_vector_store

##using openAI
def faiss_index_constructor(skill_lst):
    
    if "OPENAI_API_KEY" not in os.environ:
        os.environ["OPENAI_API_KEY"] = "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"
    
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    
    index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
    
    faiss_vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        relevance_score_fn=score_normalizer,
    )

    documents = [Document(page_content=val) for val in total_skills]
    uuids = [str(uuid4()) for _ in range(len(documents))]
    faiss_vector_store.add_documents(documents=documents, ids=uuids)

    return faiss_vector_store

In [30]:
faiss_vector_store = faiss_index_constructor(expanded_skill_lst_1)

In [36]:
def retrieve_from_faiss(skill_name):
    def score_normalizer(val: float) -> float:
        return 1 - 1 / (1 + np.exp(val))

    
    results = faiss_vector_store.similarity_search_with_score(
        skill_name, k=1
    )
    for res, score in results:
        matched_skill = res.page_content
        score = score

    return matched_skill,score

retrieve_from_faiss('manage several projects')

('manage several projects', 1.8191947e-06)

In [33]:
expanded_skill_lst_1

['financial engineering',
 'job market offers',
 'risk management',
 'manage several projects',
 'implement hedging strategies for clients',
 'monitor stock market',
 'monitor bond market',
 'market analysis',
 'training  subject expertise',
 'market participants',
 'manage securities trading',
 'provide support in financial calculation',
 'operate financial instruments',
 'advise on investment',
 'perform financial analysis on price strategies',
 'similitude',
 'stock market']

## Using FAISS

In [42]:
import faiss
import numpy as np
from sklearn.preprocessing import normalize
from langchain.embeddings import OpenAIEmbeddings
import pickle 

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")

def get_openai_embeddings(texts):
    embeddings = embeddings_model.embed_documents(texts)
    return np.array(embeddings)


def build_faiss_index(index_list):
    index_vectors = get_openai_embeddings(index_list)
    
    # Normalize vectors to use cosine similarity
    index_vectors = normalize(index_vectors, norm='l2')
    
    dimension = index_vectors.shape[1]  
    index = faiss.IndexFlatIP(dimension)  # Using Inner Product (IP) for cosine similarity
    index.add(index_vectors)  # Add index vectors to the FAISS index
    
    return index

def save_index_to_memory(faiss_index):
    return faiss.serialize_index(faiss_index)

def load_index_from_memory(serialized_index):
    return faiss.deserialize_index(serialized_index)

def fetch_top_k_results(faiss_index, index_list, query, k=5):
    query_vector = get_openai_embeddings([query])[0].reshape(1, -1)  
    
    query_vector = normalize(query_vector, norm='l2')
    
    distances, top_indices = faiss_index.search(query_vector, k)  
    top_results = [(index_list[i], float(dist)) for i, dist in zip(top_indices[0], distances[0])]
    
    return top_results

index_list = skills_lst_1

faiss_index = build_faiss_index(index_list)
serialized_index = save_index_to_memory(faiss_index)

loaded_faiss_index = load_index_from_memory(serialized_index)

In [44]:
query = "Risk management skills"
k = 3  

results = fetch_top_k_results(loaded_faiss_index, index_list, query, k)
print(results)


[('Risk management', 0.8069169521331787), ('Portfolio management', 0.4653838574886322), ('Investment management', 0.4617082476615906)]


In [None]:
skills_company  = [
    "Financial analysis",
    "Investment management",
    "Portfolio management",
    "Risk management",
    "Asset allocation",
    "Equity research",
    "Fixed income analysis",
    "Quantitative analysis",
    "Performance measurement",
    "Investment strategy development",
    "Market analysis",
    "Financial modeling",
    "Valuation techniques",
    "Investment selection",
    "Asset pricing"
]

skills_candidate_1 = [
    "Financial analysis",
    "Investment management",
    "Risk management skills",
    "Asset allocation",
    "Equity research",
    "Quantitative analysis expert",
    "Investment strategy development",
    "Market analysis",
    "expert in Financial modeling",
    "Good in Valuation techniques",
    "Asset pricing"
]

skills_candidate_2 = [
    "Financial analysis",
    "Investment management",
    "Risk management skills",
    "Asset allocation",
    "Quantitative analysis expert",
    "Investment strategy development",
    "Market analysis",
    "Good in Valuation techniques",
    "Asset pricing"
]
