In [136]:
import pandas as pd
from tqdm import tqdm
from neo4j import GraphDatabase
import os
from dotenv import load_dotenv

load_dotenv()
uri = os.getenv('uri')
user = os.getenv('user')
password = os.getenv('password')

driver= GraphDatabase.driver(uri, auth=(user, password))

In [59]:
## sample skills to test

hr_topics = [
    "Employee relations management",
    "Recruitment and selection processes",
    "Performance management systems",
    "Compensation and benefits administration",
    "Labor law and employment regulations",
    "Training and development programs",
    "Conflict resolution and negotiation skills"
]


sales_skills = [
    "Customer relationship management (CRM)",
    "Sales strategies and techniques",
    "Negotiation and persuasion",
    "Product knowledge and presentation",
    "Market and competitor analysis",
    "Closing sales and follow-ups",
    "Communication and interpersonal skills"
]

topics = [
    "Financial analysis",
    "Investment management",
    "Portfolio management",
    "Risk management",
    "Asset allocation",
    "Equity research",
    "Fixed income analysis",
    "Quantitative analysis",
    "Performance measurement",
    "Investment strategy development",
    "Market analysis",
    "Financial modeling",
    "Valuation techniques",
    "Investment selection",
    "Asset pricing"
]


In [None]:
from langchain.vectorstores.neo4j_vector import Neo4jVector

## If OPENAI key is present
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = "text-embedding-3-small" 

## If Google API key is present 
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

graph_vector_index = Neo4jVector.from_existing_graph(
    embedding = embeddings,
    url=uri,
    username=user,
    password=password,
    index_name='skills',
    node_label="Skill",
    text_node_properties=['skill_name','description'],
    embedding_node_property='embedding',
)

In [62]:
total_skills = []
for skill in topics:
    skill_lst = []
    result = graph_vector_index.similarity_search(skill, k=3)
    skill_lst = [val.page_content.split('\n')[1].split(':')[-1].strip(' ') for val in result]
    total_skills.extend(skill_lst)

total_skills = list(set(total_skills))

#### Appropriate Occupation extraction from skills

In [49]:
neo4j_handler = Neo4jHandler(uri, user, password)
occupation_dict = {}
non_matched_skills = []
for skill in total_skills:
    
    try:
        query = f'''MATCH (s:Skill {{skill_name:'{skill}'}})<-[:HAS_SKILL]-(o:Occupation) RETURN DISTINCT(o.occupation_name)'''
        result = neo4j_handler.run_query(query)
        occupation_lst = []
        occupation_lst = [val['(o.occupation_name)'] for val in result]
        
        for occupation in occupation_lst:
            if occupation in occupation_dict.keys():
                occupation_dict[occupation] += 1
            elif occupation not in occupation_dict.keys():
                occupation_dict[occupation] = 1
    except:
        non_matched_skills.append(skill)


top_k_jobs = sorted(occupation_dict.items(), key=lambda x: x[1], reverse=True)[:5]

# Print the top k keys and their values
for job, count in top_k_jobs:
    print(f"{job}: {count}")


In [51]:
## Fetching of description for occupation

query = f'''MATCH (o:Occupation{{occupation_name:"human resources manager"}}) RETURN DISTINCT(o.description)'''
result = neo4j_handler.run_query(query)
print(result)

## Comparing Non-Tech skills

In [24]:
 skills_lst_1  = [
    "Financial analysis",
    "Investment management",
    "Portfolio management",
    "Risk management",
    "Asset allocation",
    "Equity research",
    "Fixed income analysis",
    "Quantitative analysis",
    "Performance measurement",
    "Investment strategy development",
    "Market analysis",
    "Financial modeling",
    "Valuation techniques",
    "Investment selection",
    "Asset pricing"
]

skills_lst_2 = [
    "Financial analysis",
    "Investment management",
    "Risk management skills",
    "Asset allocation",
    "Equity research",
    "Quantitative analysis expert",
    "Investment strategy development",
    "Market analysis",
    "expert in Financial modeling",
    "Good in Valuation techniques",
    "Asset pricing"
]


In [25]:
import os
import faiss
from uuid import uuid4
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

In [26]:


def expand_skill_lst(skill_lst):
    expanded_skill_lst = []
    for skill in skills_lst_1:
        skill_lst = []
        result = vector_index.similarity_search(skill, k=4)
        skill_lst = [val.page_content.split('\n')[1].split(':')[-1].strip(' ') for val in result]
        total_skills.extend(skill_lst)

    expanded_skill_lst = list(set(total_skills))
    return expanded_skill_lst

expanded_skill_lst_1 = expand_skill_lst(skills_lst_1)
expanded_skill_lst_2 = expand_skill_lst(skills_lst_2)

In [27]:
numerator_count = 0
denominator_count = 0
for skill_1 in expanded_skill_lst_1:
    if skill_1 in expanded_skill_lst_2:
        numerator_count += 1
        denominator_count += 1
        expanded_skill_lst_1.remove(skill_1)
        expanded_skill_lst_2.remove(skill_1)
        continue

print(numerator_count,' ',denominator_count)

17   17


## Using FAISS

In [42]:
import faiss
import numpy as np
from sklearn.preprocessing import normalize
from langchain.embeddings import OpenAIEmbeddings
import pickle 

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")

def get_openai_embeddings(texts):
    embeddings = embeddings_model.embed_documents(texts)
    return np.array(embeddings)


def build_faiss_index(index_list):
    index_vectors = get_openai_embeddings(index_list)
    
    # Normalize vectors to use cosine similarity
    index_vectors = normalize(index_vectors, norm='l2')
    
    dimension = index_vectors.shape[1]  
    index = faiss.IndexFlatIP(dimension)  # Using Inner Product (IP) for cosine similarity
    index.add(index_vectors)  # Add index vectors to the FAISS index
    
    return index

def save_index_to_memory(faiss_index):
    return faiss.serialize_index(faiss_index)

def load_index_from_memory(serialized_index):
    return faiss.deserialize_index(serialized_index)

def fetch_top_k_results(faiss_index, index_list, query, k=5):
    query_vector = get_openai_embeddings([query])[0].reshape(1, -1)  
    
    query_vector = normalize(query_vector, norm='l2')
    
    distances, top_indices = faiss_index.search(query_vector, k)  
    top_results = [(index_list[i], float(dist)) for i, dist in zip(top_indices[0], distances[0])]
    
    return top_results

index_list = skills_lst_1

faiss_index = build_faiss_index(index_list)
serialized_index = save_index_to_memory(faiss_index)

loaded_faiss_index = load_index_from_memory(serialized_index)

In [44]:
query = "Risk management skills"
k = 3  

results = fetch_top_k_results(loaded_faiss_index, index_list, query, k)
print(results)


[('Risk management', 0.8069169521331787), ('Portfolio management', 0.4653838574886322), ('Investment management', 0.4617082476615906)]
