In [None]:
from langchain_community.graphs import Neo4jGraph
import os
import dotenv
# Read connection settings from a local .env file into the process environment.
dotenv.load_dotenv()

# Single Neo4j connection shared by all query cells in this notebook.
# os.getenv yields None for an unset variable, so a missing NEO4J_URI /
# NEO4J_NAME / NEO4J_PASSWORD entry is handed to Neo4jGraph as None.
neo4j_graph = Neo4jGraph(url=os.getenv('NEO4J_URI'), username=os.getenv('NEO4J_NAME'), password=os.getenv('NEO4J_PASSWORD'))

In [None]:
from database.ChromaDB import ChromaDB
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

In [None]:
# Embedding model shared by every similarity cache in this notebook.
model_name = "BAAI/bge-small-en-v1.5"
model = SentenceTransformerEmbeddings(model_name=model_name)

# Folder holding one ChromaDB store per node label.
directory = "temp"

# The caches are opened in the order listed; each one lives at
# "<directory>/<stem>.db" and embeds with the shared model.
(
    db_cache_city,
    db_cache_role,
    db_cache_language,
    db_cache_institution,
    db_cache_education,
    db_cache_major,
    db_cache_skills,
    db_cache_programming,
) = (
    ChromaDB(data_path=f"{directory}/{stem}.db", model=model)
    for stem in (
        "city",
        "role",
        "language",
        "institution",
        "education",
        "major",
        "skills",
        "programming",
    )
)

In [None]:
class UnionFind:
    """Disjoint-set forest over the integers ``0 .. num_nodes - 1``.

    Uses path compression in ``find`` and union by rank in ``union``.
    Every node starts out as the sole member of its own set.
    """

    def __init__(self, num_nodes):
        self.parent = list(range(num_nodes))
        self.rank = [0] * num_nodes

    def find(self, x):
        """Return the representative of the set containing ``x``.

        Every node visited on the way is re-pointed directly at the root.
        """
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # Second pass: compress the path that was just walked.
        node = x
        while self.parent[node] != root:
            above = self.parent[node]
            self.parent[node] = root
            node = above
        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (no-op if already joined)."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return
        if self.rank[root_x] > self.rank[root_y]:
            self.parent[root_y] = root_x
            return
        # Equal or lower rank: hang x's tree under y's root.
        self.parent[root_x] = root_y
        if self.rank[root_x] == self.rank[root_y]:
            self.rank[root_y] += 1

    def add_node(self, x, y):
        """Alias of ``union``: record that ``x`` and ``y`` belong together."""
        self.union(x, y)

    def get_clusters(self):
        """Return ``{root: [members...]}`` with members in ascending order."""
        clusters = {}
        for node in range(len(self.parent)):
            clusters.setdefault(self.find(node), []).append(node)
        return clusters
        
        

In [None]:
# Load every City name from the graph as a plain list of strings.
# DISTINCT matches the Institution/Skill queries and guarantees unique texts:
# analyze_similarity maps each text to a single matrix index, so a repeated
# name would otherwise share one column and be compared against itself.
cities = [
    record["name"]
    for record in neo4j_graph.query(
        """
        MATCH (c:City)
        RETURN DISTINCT c.name AS name
        """
    )
]

In [None]:
# Show the loaded city names for a quick visual check.
cities

In [None]:
# Load every Language name from the graph as a plain list of strings.
# DISTINCT matches the Institution/Skill queries and guarantees unique texts:
# analyze_similarity maps each text to a single matrix index, so a repeated
# name would otherwise share one column and be compared against itself.
languages = [
    record["name"]
    for record in neo4j_graph.query(
        """
        MATCH (c:Language)
        RETURN DISTINCT c.name AS name
        """
    )
]

In [None]:
# Fetch the distinct Institution names, then flatten the result records
# (dicts with a "name" key) into a list of strings.
institution = neo4j_graph.query(
    """
    MATCH (c:Institution)
    RETURN distinct(c.name) as name
    """
)
institution = list(map(lambda record: record["name"], institution))

In [None]:
# Number of distinct institution names loaded from the graph.
len(institution)

In [None]:
# Fetch the distinct Skill names, then flatten the result records
# (dicts with a "name" key) into a list of strings.
skills = neo4j_graph.query(
    """
    MATCH (c:Skill)
    RETURN Distinct(c.name) as name
    """
)
skills = list(map(lambda record: record["name"], skills))

In [None]:
# Number of distinct skill names loaded from the graph.
len(skills)

In [None]:
# Load every Academic (education level) name from the graph as a list of strings.
# DISTINCT matches the Institution/Skill queries and guarantees unique texts:
# analyze_similarity maps each text to a single matrix index, so a repeated
# name would otherwise share one column and be compared against itself.
education = [
    record["name"]
    for record in neo4j_graph.query(
        """
        MATCH (c:Academic)
        RETURN DISTINCT c.name AS name
        """
    )
]

In [None]:
# Load every ProgrammingLanguage name from the graph as a list of strings.
# DISTINCT matches the Institution/Skill queries and guarantees unique texts:
# analyze_similarity maps each text to a single matrix index, so a repeated
# name would otherwise share one column and be compared against itself.
programming = [
    record["name"]
    for record in neo4j_graph.query(
        """
        MATCH (p:ProgrammingLanguage)
        RETURN DISTINCT p.name AS name
        """
    )
]

In [None]:
# Number of programming-language names loaded from the graph.
len(programming)

In [None]:
# Load every Role name from the graph as a list of strings.
# DISTINCT matches the Institution/Skill queries and guarantees unique texts:
# analyze_similarity maps each text to a single matrix index, so a repeated
# name would otherwise share one column and be compared against itself.
roles = [
    record["name"]
    for record in neo4j_graph.query(
        """
        MATCH (p:Role)
        RETURN DISTINCT p.name AS name
        """
    )
]

In [None]:
# Number of role names loaded from the graph.
len(roles)

In [None]:
# Load every Major name from the graph as a list of strings.
# DISTINCT matches the Institution/Skill queries and guarantees unique texts:
# analyze_similarity maps each text to a single matrix index, so a repeated
# name would otherwise share one column and be compared against itself.
majors = [
    record["name"]
    for record in neo4j_graph.query(
        """
        MATCH (p:Major)
        RETURN DISTINCT p.name AS name
        """
    )
]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_similarity(texts, db_cache, threshold = 0.9):
    """Score every text against every other text and collect near-duplicates.

    Args:
        texts: Sequence of strings to compare. Texts are expected to be unique;
            if one is repeated, its scores land in the column of its last
            occurrence.
        db_cache: Vector store exposing
            ``similarity_search_with_relevance_scores(query, k)`` and returning
            ``(document, score)`` pairs whose document has ``page_content``.
        threshold: A pair is reported as similar when its relevance score is
            strictly greater than this value.

    Returns:
        A tuple ``(score, similarity)``:
        ``score`` is a ``len(texts) x len(texts)`` float array where
        ``score[i, j]`` is the relevance of ``texts[j]`` for query ``texts[i]``
        (0 where the store returned nothing);
        ``similarity`` maps a text to the list of *other* texts scoring above
        ``threshold``. Texts without any such match are absent from the dict.
    """
    text_id = {text: index for index, text in enumerate(texts)}
    similarity = dict()
    score = np.zeros((len(texts), len(texts)))

    for i, text in enumerate(texts):
        result = db_cache.similarity_search_with_relevance_scores(text, len(texts))
        for document, relevance in result:
            candidate = document.page_content
            j = text_id.get(candidate)
            if j is None:
                # The store can hold entries that are not in `texts` (for
                # example a name whose node was merged away after the store
                # was populated); skip them instead of raising KeyError.
                continue
            score[i, j] = relevance
            # Compare by value so a text is never reported as similar to itself.
            if candidate != text and relevance > threshold:
                similarity.setdefault(text, []).append(candidate)

    # Optional visual check of the score matrix:
    # sns.heatmap(score, annot=True, cmap="viridis", cbar=True)
    # plt.xticks(np.arange(len(texts)), texts, rotation=90)
    # plt.yticks(np.arange(len(texts)), texts, rotation=0)
    # plt.show()
    return score, similarity
    

In [None]:
def cluster_changes(role_similar):
    """Group mutually similar names into clusters.

    Args:
        role_similar: Mapping ``name -> [similar names]`` as produced by
            ``analyze_similarity``.

    Returns:
        Dict mapping one representative name per cluster (the union-find root)
        to the list of every name in that cluster, the representative included.
    """
    index_of = dict()
    names = []

    def _register(name):
        # Hand out consecutive integer ids in first-seen order.
        if name not in index_of:
            index_of[name] = len(names)
            names.append(name)

    # Pass 1: number every name, sources and their neighbours alike.
    for source, neighbours in role_similar.items():
        _register(source)
        for neighbour in neighbours:
            _register(neighbour)

    # Pass 2: join each source with its neighbours, in input order.
    uf = UnionFind(len(index_of))
    for source, neighbours in role_similar.items():
        source_index = index_of[source]
        for neighbour in neighbours:
            uf.add_node(source_index, index_of[neighbour])

    # Translate integer clusters back into names.
    return {
        names[root]: [names[member] for member in members]
        for root, members in uf.get_clusters().items()
    }


In [None]:
# Pairwise similarity of city names; pairs scoring above 0.8 count as duplicates.
city_score, city_similar = analyze_similarity(
    texts=cities,
    db_cache=db_cache_city,
    threshold=0.8,
)

In [None]:
# Group near-duplicate city names into clusters keyed by one representative
# name, then display the result.
city_change = cluster_changes(city_similar)
city_change

In [None]:
# Pairwise similarity of education levels, using the function's default threshold.
education_score, education_similarity = analyze_similarity(
    texts=education,
    db_cache=db_cache_education,
)

In [None]:
# Show which education levels were flagged as near-duplicates.
education_similarity

In [None]:
# Pairwise similarity of role names; pairs scoring above 0.86 count as duplicates.
role_score, role_similar = analyze_similarity(
    texts=roles,
    db_cache=db_cache_role,
    threshold=0.86,
)

In [None]:
# Group near-duplicate role names into clusters keyed by one representative
# name, then display the result.
role_change = cluster_changes(role_similar)
role_change



In [None]:
# Pairwise similarity of programming-language names; pairs scoring above 0.80
# count as duplicates.
programming_score, programming_similar = analyze_similarity(
    texts=programming,
    db_cache=db_cache_programming,
    threshold=0.80,
)

In [None]:
# Group near-duplicate programming-language names into clusters keyed by one
# representative name.
change_programming = cluster_changes(programming_similar)

In [None]:
# Inspect the programming-language clusters before merging them in the graph.
change_programming

In [None]:
# Pairwise similarity of spoken-language names; pairs scoring above 0.9 count
# as duplicates.
languages_score, languages_similar = analyze_similarity(
    texts=languages,
    db_cache=db_cache_language,
    threshold=0.9,
)

In [None]:
# Pairwise similarity of institution names; pairs scoring above 0.92 count as
# duplicates.
institution_score, institution_similar = analyze_similarity(
    texts=institution,
    db_cache=db_cache_institution,
    threshold=0.92,
)

In [None]:
# Show which institution names were flagged as near-duplicates.
institution_similar

In [None]:
# Hand-maintained similarity map for institutions. Assigning it here replaces
# whatever analyze_similarity produced for `institution_similar`, so the
# clustering step works from this fixed list. Key order is significant: it
# determines the order in which cluster_changes numbers and joins the names.
institution_similar = {
    "fpt university ho chi minh city": [
        "fpt university ho chi minh",
    ],
    "ho chi minh city open university": [
        "open university ho chi minh city",
    ],
    "national economics university": [
        "national economic university",
        "national economics university (neu)",
    ],
    "national economic university": [
        "national economics university",
    ],
    "fpt polytechnic": [
        "fpt polytechnic college",
    ],
    "finance academy": [
        "academy of finance",
    ],
    "ho chi minh city university of technology": [
        "ho chi minh university of technology",
    ],
    "fpt polytechnic college": [
        "fpt polytechnic",
        "polytechnic fpt colleges",
    ],
    "university of information technology - vietnam national university": [
        "university of information technology - vietnam national university ho chi minh city",
    ],
    "national economics university (neu)": [
        "national economics university",
    ],
    "fpt university ho chi minh": [
        "fpt university ho chi minh city",
    ],
    "polytechnic fpt colleges": [
        "fpt polytechnic college",
    ],
    "university of information technology - vietnam national university ho chi minh city": [
        "university of information technology - vietnam national university",
    ],
    "ho chi minh university of technology": [
        "ho chi minh city university of technology",
    ],
    "academy of finance": [
        "finance academy",
    ],
    "open university ho chi minh city": [
        "ho chi minh city open university",
    ],
}

In [None]:
# Group near-duplicate institution names into clusters keyed by one
# representative name.
change_institution = cluster_changes(institution_similar)

In [None]:
# Inspect the institution clusters before merging them in the graph.
change_institution

In [None]:
# Pairwise similarity of major names; pairs scoring above 0.83 count as duplicates.
majors_score, majors_similar = analyze_similarity(
    texts=majors,
    db_cache=db_cache_major,
    threshold=0.83,
)

In [None]:
# Group near-duplicate major names into clusters keyed by one representative name.
change_major = cluster_changes(majors_similar)

In [None]:
# Inspect the major clusters before merging them in the graph.
change_major

In [None]:
# Pairwise similarity of skill names; pairs scoring above 0.81 count as duplicates.
skills_score, skills_similar = analyze_similarity(
    texts=skills,
    db_cache=db_cache_skills,
    threshold=0.81,
)

In [None]:
# Group near-duplicate skill names into clusters keyed by one representative name.
change_skills = cluster_changes(skills_similar)

In [None]:
# Inspect the skill clusters before merging them in the graph.
change_skills

In [None]:
# Cypher for merging one ProgrammingLanguage node into another:
# $form.node_val1 names the node to keep (a), $form.node_val2 the node to
# remove (b). Each PROGRAMMING relationship pointing at b is re-created on a
# with the same properties, then b is deleted together with its relationships.
# The query is only defined here; the call below is commented out and, as
# written, passes no `form` parameter.
query = """
        MATCH (a: ProgrammingLanguage {name: $form.node_val1}), (b: ProgrammingLanguage {name: $form.node_val2})

            MATCH (other)-[r:PROGRAMMING]->(b)
            MERGE (other)-[r2:PROGRAMMING]->(a)
            SET r2 = r
        
        DETACH delete b
        return r2.exp
        """
# neo4j_graph.query(query)

In [None]:
# form = {
#     # "node_type": "Award",
#     "node_val1": "spring boot",
#     "node_val2": "java spring boot"
#     # "relationship": "AWARD"
# }
# neo4j_graph.query(query, params={"form": form})

In [None]:
# Re-display the institution clusters right before they are applied to the graph.
change_institution

In [None]:
# Merge every institution cluster into its representative node.
#
# For each (keep, change) pair the query
#   1. re-creates every STUDY relationship that points at `change` so that it
#      points at `keep`, copying the relationship properties,
#   2. aggregates, so all relationships are rewired before anything is deleted,
#   3. deletes the `change` node exactly once.
# OPTIONAL MATCH + FOREACH makes step 3 run even when `change` has no incoming
# STUDY relationship; with a plain MATCH such a duplicate would produce no rows
# and would never be deleted.
# The query is loop-invariant, so it is built once.
query = """
        MATCH (a:Institution {name: $form.node_val1}), (b:Institution {name: $form.node_val2})
        OPTIONAL MATCH (other)-[r:STUDY]->(b)
        FOREACH (rel IN CASE WHEN r IS NULL THEN [] ELSE [r] END |
            MERGE (other)-[r2:STUDY]->(a)
            SET r2 = properties(rel)
            DELETE rel
        )
        WITH a, b, count(other) AS moved
        DETACH DELETE b
        RETURN a, moved
        """

i = 0  # number of merge queries issued
for keep, changes in change_institution.items():
    for change in changes:
        # Each cluster lists its own representative among its members; skip it.
        if keep == change:
            continue
        form = {
            "node_val1": keep,
            "node_val2": change,
        }
        result = neo4j_graph.query(query, params={"form": form})
        print(result)
        i += 1

In [None]:
# Merge every programming-language cluster into its representative node.
#
# For each (keep, change) pair the query
#   1. re-creates every PROGRAMMING relationship that points at `change` so
#      that it points at `keep`, copying the relationship properties,
#   2. aggregates, so all relationships are rewired before anything is deleted,
#   3. deletes the `change` node exactly once.
# OPTIONAL MATCH + FOREACH makes step 3 run even when `change` has no incoming
# PROGRAMMING relationship; with a plain MATCH such a duplicate would produce
# no rows and would never be deleted.
# `exp` is read before the relationship is deleted so it can still be reported.
# The query is loop-invariant, so it is built once.
query = """
        MATCH (a:ProgrammingLanguage {name: $form.node_val1}), (b:ProgrammingLanguage {name: $form.node_val2})
        OPTIONAL MATCH (other)-[r:PROGRAMMING]->(b)
        WITH a, b, other, r, r.exp AS exp
        FOREACH (rel IN CASE WHEN r IS NULL THEN [] ELSE [r] END |
            MERGE (other)-[r2:PROGRAMMING]->(a)
            SET r2 = properties(rel)
            DELETE rel
        )
        WITH a, b, count(other) AS moved, collect(exp) AS exps
        DETACH DELETE b
        RETURN a.name, moved, exps
        """

i = 0  # number of merge queries issued
for keep, changes in change_programming.items():
    for change in changes:
        # Each cluster lists its own representative among its members; skip it.
        if keep == change:
            continue
        form = {
            "node_val1": keep,
            "node_val2": change,
        }
        result = neo4j_graph.query(query, params={"form": form})
        print(result)
        i += 1

In [None]:
# Merge every major cluster into its representative node.
#
# For each (keep, change) pair the query
#   1. re-creates every MAJOR relationship that points at `change` so that it
#      points at `keep`, copying the relationship properties,
#   2. aggregates, so all relationships are rewired before anything is deleted,
#   3. deletes the `change` node exactly once.
# OPTIONAL MATCH + FOREACH makes step 3 run even when `change` has no incoming
# MAJOR relationship; with a plain MATCH such a duplicate would produce no rows
# and would never be deleted.
# `level` is read before the relationship is deleted so it can still be reported.
# The query is loop-invariant, so it is built once.
query = """
        MATCH (a:Major {name: $form.node_val1}), (b:Major {name: $form.node_val2})
        OPTIONAL MATCH (other)-[r:MAJOR]->(b)
        WITH a, b, other, r, r.level AS level
        FOREACH (rel IN CASE WHEN r IS NULL THEN [] ELSE [r] END |
            MERGE (other)-[r2:MAJOR]->(a)
            SET r2 = properties(rel)
            DELETE rel
        )
        WITH a, b, count(other) AS moved, collect(level) AS levels
        DETACH DELETE b
        RETURN a.name, moved, levels
        """

i = 0  # number of merge queries issued
for keep, changes in change_major.items():
    for change in changes:
        # Each cluster lists its own representative among its members; skip it.
        if keep == change:
            continue
        form = {
            "node_val1": keep,
            "node_val2": change,
        }
        result = neo4j_graph.query(query, params={"form": form})
        print(result)
        i += 1

In [None]:
# Merge every skill cluster into its representative node.
#
# For each (keep, change) pair the query
#   1. re-creates every SKILL relationship that points at `change` so that it
#      points at `keep`, copying the relationship properties,
#   2. aggregates, so all relationships are rewired before anything is deleted,
#   3. deletes the `change` node exactly once.
# OPTIONAL MATCH + FOREACH makes step 3 run even when `change` has no incoming
# SKILL relationship; with a plain MATCH such a duplicate would produce no rows
# and would never be deleted.
# The query is loop-invariant, so it is built once.
query = """
        MATCH (a:Skill {name: $form.node_val1}), (b:Skill {name: $form.node_val2})
        OPTIONAL MATCH (other)-[r:SKILL]->(b)
        FOREACH (rel IN CASE WHEN r IS NULL THEN [] ELSE [r] END |
            MERGE (other)-[r2:SKILL]->(a)
            SET r2 = properties(rel)
            DELETE rel
        )
        WITH a, b, count(other) AS moved
        DETACH DELETE b
        RETURN a, moved
        """

i = 0  # number of merge queries issued
for keep, changes in change_skills.items():
    for change in changes:
        # Each cluster lists its own representative among its members; skip it.
        if keep == change:
            continue
        form = {
            "node_val1": keep,
            "node_val2": change,
        }
        result = neo4j_graph.query(query, params={"form": form})
        print(result)
        i += 1

In [None]:
# Merge every role cluster into its representative node.
#
# Role nodes are the target of two relationship types, ROLE and SUITABLE, so
# the query rewires them in two stages and then deletes the duplicate:
#   1. move every ROLE relationship from `change` to `keep`,
#   2. collapse back to a single (a, b) row by aggregating; without this the
#      SUITABLE stage would repeat once per ROLE row and revisit relationships
#      it had already deleted,
#   3. move every SUITABLE relationship from `change` to `keep`,
#   4. aggregate again and delete the `change` node exactly once.
# OPTIONAL MATCH + FOREACH keeps the row alive when a stage has nothing to
# move. The FOREACH variable is named `rel` so it does not shadow r / r3.
# The query is loop-invariant, so it is built once.
query = """
        MATCH (a:Role {name: $form.node_val1}), (b:Role {name: $form.node_val2})
        OPTIONAL MATCH (other)-[r:ROLE]->(b)
        FOREACH (rel IN CASE WHEN r IS NULL THEN [] ELSE [r] END |
            MERGE (other)-[r2:ROLE]->(a)
            SET r2 = properties(rel)
            DELETE rel
        )
        WITH a, b, count(other) AS moved_role
        OPTIONAL MATCH (other2)-[r3:SUITABLE]->(b)
        FOREACH (rel IN CASE WHEN r3 IS NULL THEN [] ELSE [r3] END |
            MERGE (other2)-[r4:SUITABLE]->(a)
            SET r4 = properties(rel)
            DELETE rel
        )
        WITH a, b, moved_role, count(other2) AS moved_suitable
        DETACH DELETE b
        RETURN a, moved_role, moved_suitable
        """

i = 0  # number of merge queries issued
for keep, changes in role_change.items():
    for change in changes:
        # Each cluster lists its own representative among its members; skip it.
        if keep == change:
            continue
        form = {
            "node_val1": keep,
            "node_val2": change,
        }
        result = neo4j_graph.query(query, params={"form": form})
        print(result)
        i += 1

In [None]:
# Scratch value: three spellings of one institution that appear together in
# the hard-coded `institution_similar` map. Presumably kept as a reference for
# the spot-check query on "national economic university" — TODO confirm or drop.
['national economics university',
  'national economic university',
  'national economics university (neu)']

In [None]:
# Spot check: list the ids of Application nodes that have a STUDY relationship
# to the institution named "national economic university".
query = """
MATCH (n:Application )-[r:STUDY]-> (a:Institution  {name: "national economic university"})
return id(n)
"""
neo4j_graph.query(query)

In [None]:
# Spot check: Application ids and the `exp` relationship property for
# applications linked to the programming language "spring boot".
query = """
MATCH (n:Application)-[r:PROGRAMMING]-> (a:ProgrammingLanguage {name: "spring boot"})
return id(n), r.exp
"""
neo4j_graph.query(query)

In [None]:
# Spot check: Application ids and the `exp` relationship property for
# applications linked to the programming language "java spring boot".
query = """
MATCH  (n:Application)-[r:PROGRAMMING]->(a:ProgrammingLanguage {name: "java spring boot"})
return id(n), r.exp

"""
neo4j_graph.query(query)

In [None]:
# Load the Certification and Award names from the graph as lists of strings.
# DISTINCT matches the Institution/Skill queries and guarantees unique texts:
# analyze_similarity maps each text to a single matrix index, so a repeated
# name would otherwise share one column and be compared against itself.
certifications = [
    record["name"]
    for record in neo4j_graph.query(
        """
        MATCH (p:Certification)
        RETURN DISTINCT p.name AS name
        """
    )
]

awards = [
    record["name"]
    for record in neo4j_graph.query(
        """
        MATCH (p:Award)
        RETURN DISTINCT p.name AS name
        """
    )
]

In [None]:
# Show the loaded award names for a quick visual check.
awards

In [None]:
# Similarity caches for certifications and awards, stored under `directory`
# and embedding with the shared `model`.
db_cache_certifications = ChromaDB(
    data_path=f"{directory}/certifications.db",
    model=model,
)
db_cache_awards = ChromaDB(
    data_path=f"{directory}/awards.db",
    model=model,
)

In [None]:
# Re-read the graph schema and print it, to see the current labels and
# relationship types.
neo4j_graph.refresh_schema()
print(neo4j_graph.schema)

In [None]:
# db_cache_certifications.add_texts(certifications)
# db_cache_awards.add_texts(awards)

In [None]:
# Pairwise similarity of award names; pairs scoring above 0.9 count as duplicates.
awards_score, awards_similar = analyze_similarity(
    texts=awards,
    db_cache=db_cache_awards,
    threshold=0.9,
)

In [None]:
# Show which award names were flagged as near-duplicates.
awards_similar

In [None]:
# Pairwise similarity of certification names; pairs scoring above 0.9 count as
# duplicates.
certifications_score, certifications_similar = analyze_similarity(
    texts=certifications,
    db_cache=db_cache_certifications,
    threshold=0.9,
)

In [None]:
# Show which certification names were flagged as near-duplicates.
certifications_similar