In [29]:
import pandas as pd
import json
import os
from langchain_community.graphs import Neo4jGraph
from langchain_community.chat_models import ChatOllama
from langchain_community.llms import Ollama
from langchain.document_loaders import WikipediaLoader
from langchain.chains import LLMChain
from langchain.prompts.chat import (ChatPromptTemplate,HumanMessagePromptTemplate,SystemMessagePromptTemplate)
from langchain import PromptTemplate
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.schema import (SystemMessage,HumanMessage,AIMessage)
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.chat_models import ChatOpenAI

In [2]:
# Neo4j connection settings.
# Each value can be overridden with an environment variable of the same name;
# the literals are kept only as fallbacks so the notebook still runs unchanged
# against the local development instance.
NEO4J_URL = os.environ.get("NEO4J_URL", "neo4j://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
# NOTE(review): a password stored in a notebook is visible to everyone who can
# read the file. Set NEO4J_PASSWORD in the environment and drop this fallback
# before sharing.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "fireinthehole")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

# https://api.python.langchain.com/en/latest/graphs/langchain_community.graphs.neo4j_graph.Neo4jGraph.html
graph = Neo4jGraph(url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE)

# How to Load Any Text?

1. Text Loader (.txt, .md)

In [3]:
from langchain.document_loaders import TextLoader
# Load a plain-text file; load() returns a list of Document objects whose
# metadata records the source path and whose page_content holds the file text.
loader = TextLoader('raw_data/raw_summary.txt')
documents = loader.load()
print(documents)

[Document(metadata={'source': 'raw_data/raw_summary.txt'}, page_content='= FIRST ITERATION = \nTimothy D. Cook is the CEO of Apple Inc., who joined the company in 1998 and took over as CEO in 2009. \nUnder his leadership, Apple became the world\'s largest company by market capitalization and revenue, thanks to cost-saving measures such as long-term deals for flash memory that led to popular devices like the iPod Nano, iPhone, and iPad. \nApple was founded in 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, with the Macintosh computer, introduced in 1984, being a revolutionary graphical user interface-based system designed for the masses. \nThe Macintosh team, led by Jef Raskin and later Steve Jobs, faced challenges in bringing the revolutionary design to life but generated cult enthusiasm with new programs like PageMaker, MORE, and Excel. \nApple released improved versions of the Macintosh, like the Macintosh 512K, to address initial limitations. \nApple bought NeXT in 1997, bringi

2. PDF Loader

In [7]:
from langchain_community.document_loaders import PyPDFLoader
# Load a PDF and split it into Document chunks; each chunk's metadata carries
# the source path and a page number.
loader = PyPDFLoader("raw_data/2404.14047v1.pdf")
pages = loader.load_and_split()
print(pages)

[Document(metadata={'source': 'raw_data/2404.14047v1.pdf', 'page': 0}, page_content='How Good Are Low-bit Quantized LLAMA3 Models?\nAn Empirical Study\nWei Huang∗\nThe University of Hong Kong\nweih@connect.hku.hkXudong Ma∗\nBeihang University\nmacaronlin@buaa.edu.cn\nHaotong Qin†\nETH Zurich\nhaotong.qin@pbl.ee.ethz.chXingyu Zheng\nBeihang University\nxingyuzheng@buaa.edu.cn\nChengtao Lv\nBeihang University\nlvchengtao@buaa.edu.cnHong Chen\nBeihang University\n18373205@buaa.edu.cnJie Luo\nBeihang University\nluojie@buaa.edu.cn\nXiaojuan Qi\nThe University of Hong Kong\nxjqi@eee.hku.hkXianglong Liu\nBeihang University\nxlliu@buaa.edu.cnMichele Magno\nETH Zurich\nmichele.magno@pbl.ee.ethz.ch\nAbstract\nMeta’s LLAMA family has become one of the most powerful open-source Large\nLanguage Model (LLM) series. Notably, LLAMA3 models have recently been\nreleased and achieve impressive performance across various with super-large scale\npre-training on over 15T tokens of data. Given the wide appl

3. Website Loader

In [8]:
from langchain_community.document_loaders import WebBaseLoader
# Fetch a web page as Document objects. The extracted page_content still
# contains the page's navigation text and long runs of newlines, so it needs
# cleaning before further use.
loader = WebBaseLoader("https://supertype.ai/notes/unveiling-youtube-insights-part-1/")
pages = loader.load()
print(pages)

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://supertype.ai/notes/unveiling-youtube-insights-part-1/', 'title': 'Unveiling YouTube Insights - Introduction, Data Collection, Data Processing, and Database (Part 1) • Supertype', 'description': 'In this post, we will develop a website that integrates sentiment analysis techniques and a Large Language Model to provide a comprehensive understanding of YouTube comments, enabling users to extract meaningful information effortlessly.', 'language': 'en-US'}, page_content='\n\n\n\n\n\n\n\n\nUnveiling YouTube Insights - Introduction, Data Collection, Data Processing, and Database (Part 1) • Supertype\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n \n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n \n \nSupertype\nProduct & Services\n\nPortfolio Computer Vision Custom BI Development Manag

4. Wikipedia Loader

In [15]:
from langchain.document_loaders import WikipediaLoader
# Search Wikipedia for the query and load the matching articles;
# load_max_docs=2 asks for at most two documents.
query = "Sam Altman"
pages = WikipediaLoader(query=query, load_max_docs=2).load()
print(pages)



5. YouTube Transcript Loader

In [16]:
from langchain_community.document_loaders import YoutubeLoader
# Load the transcript of a YouTube video. With add_video_info=False the
# resulting metadata holds only the video id under 'source'.
loader = YoutubeLoader.from_youtube_url("https://youtu.be/KMXQ4SVLwmo", add_video_info=False)
pages = loader.load()
print(pages)

[Document(metadata={'source': 'KMXQ4SVLwmo'}, page_content="hey wison yeah what is knowledge craft do we need it to enhance our LM performance oh and also do you know how to integrate it with L chain okay guys hold on take it easy I will explain to you in detail step by step stuff from the per and how to set up our Na 4y databas and then how to integrate it using L chain and of course all of that we will use an open source all app so without further ado let get started what is no squas Once Upon a Time way back in 1736 there was a sweet M named Leonard eer who faced a mindbending challenge the seven breach of kbur problem is there a way to walk across all bries ones starting and ending at the same place eer heis something more crucial what matter was how things were connected so you turn the city's lanmark into dots or nodes and its preaches into lives for ages creating a neat little Network known as the origin of the graph Theory story time is both for SP hold on instead of just buing

# Load & Summarize Data

In [19]:
# Pull up to 20 Wikipedia articles matching the query; a subset of them is
# hand-picked by position in a later cell.
query = "Tim Cook"
raw_documents = WikipediaLoader(query=query, load_max_docs=20).load()
# raw_documents



  lis = BeautifulSoup(html).find_all('li')


In [23]:
# Keep only the hand-picked search results and flatten them into one string.
# Original index notes — 0: Tim Cook (person), 1: Apple (company), 4: Mac (product), 10: Research,
# 11: Apple Maps, 13: App Store, 7: Apple TV, 8: Steve Jobs, 13: iPhone.
# NOTE(review): these notes do not match the index list below (11 is labelled but not selected,
# 9 and 12 are selected but unlabelled, 13 is labelled twice), and the positions depend on what
# the Wikipedia search returns at run time — confirm the intended pages.
filtered_raw_documents = [raw_documents[i] for i in [0,1,4,7,8,9,10,12,13]]
# Newlines become spaces (not empty strings) so the last word of one line is not fused
# with the first word of the next; "==" heading markers are dropped.
docs = " ".join([d.page_content for d in filtered_raw_documents]).replace("\n", " ").replace("==", "")
# print(docs)

In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the concatenated article text into overlapping chunks for the map step.
# Sizes are measured by the tiktoken encoder (presumably in tokens — confirm
# against the splitter documentation): 500 per chunk with an overlap of 30.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=30
)
# create_documents takes a list of strings; here the single flattened string.
split_docs = text_splitter.create_documents([docs])

In [30]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Map-reduce summarisation: every chunk is first condensed on its own (map),
# then the per-chunk results are merged into one paragraph (reduce).

# llm = Ollama(model="mistral") # Define the mistral model
llm = ChatOpenAI(model='gpt-3.5-turbo')

# Define the map prompt template; {all_data} is filled with one chunk's text.
map_template = """The following is a set of documents
{all_data}
Based on this list of docs, please find the important information from it (focus on entities and relationship)
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)

# Define the map_chain (applied to each chunk)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce prompt; {all_data} is filled with the collected map outputs.
reduce_template = """The following is set of summaries:
{all_data}
Take these and distill it into a final, consolidated summary of the main themes. In one final paragraph
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Stuffs a list of documents into the reduce prompt's single variable.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="all_data"  # This should match the variable name in reduce_prompt
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    # (the same chain is reused here for the intermediate collapse step)
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=1024,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="all_data",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)


# Run the MapReduce Chain over all chunks; the result is written to disk as
# text by the next cell.
summarization_results = map_reduce_chain.run(split_docs)

  warn_deprecated(
  warn_deprecated(


In [31]:
# Destination file for the summary text held in summarization_results.
file_path = "./clean_data/clean_summary.txt"

# NOTE(review): mode 'a' appends without any separator, so every re-run adds
# another summary directly after the previous one — confirm this accumulation
# is intended.
with open(file_path, 'a') as file:
    file.write(summarization_results)

# Extract Information

In [32]:
# Entity types the extractor may assign to the head or tail of a relation.
# 'project' is included so that the 'hasProject' relation has a valid tail type;
# without it the model has to invent a type outside this list.
entity_types = ['person','school','award','company','product','characteristic','project']
# Relation types the extractor may assign between a head and a tail entity.
relation_types = ['alumniOf','worksFor','hasAward','isProducedBy','hasCharacteristic','acquired','hasProject','isFounderOf']

# System prompt for the extraction model. It asks for a JSON list of objects
# with the keys head / head_type / relation / tail / tail_type and restricts
# the types to the values substituted for {entity_types} and {relation_types}.
system_prompt = PromptTemplate(
    template = """
    You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
    Your task is to identify the entities and relations requested with the user prompt, from a given text.
    You must generate the output in a JSON containing a list with JSON objects having the following keys: "head", "head_type", "relation", "tail", and "tail_type".
    The "head" key must contain the text of the extracted entity with one of the types from the provided list in the user prompt. 
    The "head_type" key must contain the type of the extracted head entity which must be one of the types from {entity_types}.
    The "relation" key must contain the type of relation between the "head" and the "tail" which must be one of the relations from {relation_types}.
    The "tail" key must represent the text of an extracted entity which is the tail of the relation, and the "tail_type" key must contain the type of the tail entity from {entity_types}. 
    Attempt to extract as many entities and relations as you can. 
    
    IMPORTANT NOTES:
    - Don't add any explanation and text. 
    """,
    input_variables=["entity_types","relation_types"],
)


# Wrap the template so it can be used as the system message of a chat prompt.
system_message_prompt = SystemMessagePromptTemplate(prompt = system_prompt)

# Few-shot examples shown to the model: each entry pairs a source text with one
# triple extracted from it. Every "relation" must be spelled exactly as in
# relation_types, and every "head"/"tail" must be a span of its own "text";
# otherwise the examples teach the model values that break the schema.
examples = [
    {
        "text": "Adam is a software engineer in Microsoft since 2009, and last year he got an award as the Best Talent",
        "head": "Adam",
        "head_type": "person",
        "relation": "worksFor",
        "tail": "Microsoft",
        "tail_type": "company",
    },
    {
        "text": "Adam is a software engineer in Microsoft since 2009, and last year he got an award as the Best Talent",
        "head": "Adam",
        "head_type": "person",
        "relation": "hasAward",
        "tail": "Best Talent",
        "tail_type": "award",
    },
    {
        "text": "Microsoft is a tech company that provide several products such as Microsoft Word",
        "head": "Microsoft Word",
        "head_type": "product",
        # Was "isproducedBy", which is not one of the declared relation types.
        "relation": "isProducedBy",
        "tail": "Microsoft",
        "tail_type": "company",
    },
    {
        "text": "Microsoft Word is a lightweight app that accessible offline",
        "head": "Microsoft Word",
        "head_type": "product",
        "relation": "hasCharacteristic",
        "tail": "lightweight app",
        "tail_type": "characteristic",
    },
    {
        "text": "Microsoft Word is a lightweight app that accessible offline",
        "head": "Microsoft Word",
        "head_type": "product",
        "relation": "hasCharacteristic",
        # Was "accesible offline", which does not occur in the example text.
        "tail": "accessible offline",
        "tail_type": "characteristic",
    },
]

# Schema of one extracted triple (head entity, relation, tail entity); it is
# handed to JsonOutputParser to generate the format instructions.
# NOTE(review): documented with comments on purpose — a class docstring would
# likely surface in the generated JSON schema / format instructions; verify
# before adding one.
class ExtractedInfo(BaseModel):
    # Text and type of the entity the relation starts from.
    head: str = Field(description="extracted first or head entity like Microsoft, Apple, John")
    head_type: str = Field(description="type of the extracted head entity like person, company, etc")
    # Relation linking head to tail.
    relation: str = Field(description="relation between the head and the tail entities")
    # Text and type of the entity the relation points to.
    tail: str = Field(description="extracted second or tail entity like Microsoft, Apple, John")
    tail_type: str = Field(description="type of the extracted tail entity like person, company, etc")
    
# In this cell the parser is used only for its format instructions; the model
# response is not passed through it here.
# NOTE(review): the instructions describe a single ExtractedInfo object while
# the system prompt asks for a JSON list of such objects — confirm the model
# is not confused by the mismatch.
parser = JsonOutputParser(pydantic_object=ExtractedInfo)

# Human message: allowed types, the few-shot examples, the format instructions
# (pre-filled through partial_variables) and finally the text to analyse.
human_prompt = PromptTemplate(
    template = """ Based on the following example, extract entities and relations from the provided text.\n\n

    Use the following entity types, don't use other entity that is not defined below:
    # ENTITY TYPES:
    {entity_types}

    Use the following relation types, don't use other relation that is not defined below:
    # RELATION TYPES:
    {relation_types}

    Below are a number of examples of text and their extracted entities and relationshhips.
    {examples}

    For the following text, generate extract entitites and relations as in the provided example.\n{format_instructions}\nText: {text}""",
    input_variables=["entity_types","relation_types","examples","text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

human_message_prompt = HumanMessagePromptTemplate(prompt=human_prompt)

# Full chat prompt: system instructions followed by the human message.
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

# model = ChatOllama(model = "mistral",temperature=0)
# model = ChatOllama(model = "llama3",temperature=0)
# temperature=0 is set for the extraction model.
model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
chain = LLMChain(llm=model, prompt=chat_prompt)

In [33]:
# Inspect the text that is substituted for {format_instructions} in the human prompt.
parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"head": {"title": "Head", "description": "extracted first or head entity like Microsoft, Apple, John", "type": "string"}, "head_type": {"title": "Head Type", "description": "type of the extracted head entity like person, company, etc", "type": "string"}, "relation": {"title": "Relation", "description": "relation between the head and the tail entities", "type": "string"}, "tail": {"title": "Tail", "description": "extracted second or tail entity like Microsoft, Apple, John", "type": "string"}, "tail_type": {"title": "Tail

In [34]:
# Run the extraction chain sentence by sentence over the saved summary and
# collect every returned triple in `result`.
file_path = "./clean_data/clean_summary.txt"
with open(file_path, 'r') as file:
    # Read the entire file contents into a string
    file_contents = file.read()

# Split the file contents into sentences.
# NOTE(review): splitting on '. ' also cuts after abbreviations such as "Inc. ".
sentences = file_contents.split('. ')

result = []
# Iterate over each sentence
for sentence in sentences:
    # Skip blank fragments so no model call is spent on empty text.
    if not sentence.strip():
        continue

    response = chain.run(entity_types = entity_types, relation_types = relation_types, examples = examples, text = sentence)
    print(response)

    # The prompt requests JSON, so parse it as JSON. eval() would execute
    # whatever the model returned and fails on JSON literals such as null/true.
    try:
        parsed = json.loads(response)
    except json.JSONDecodeError as err:
        # Best effort: report the failure and carry on with the next sentence
        # instead of silently discarding it.
        print(f"Skipping response that is not valid JSON: {err}")
        continue

    # A single object is accepted as a one-element list; extending `result`
    # with a dict would add its keys rather than the triple.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        print(f"Skipping response with unexpected JSON type: {type(parsed).__name__}")
        continue

    result.extend(triple for triple in parsed if isinstance(triple, dict))

[
    {
        "head": "Tim Cook",
        "head_type": "person",
        "relation": "worksFor",
        "tail": "Apple Inc.",
        "tail_type": "company"
    },
    {
        "head": "Steve Jobs",
        "head_type": "person",
        "relation": "isFounderOf",
        "tail": "Apple Inc.",
        "tail_type": "company"
    },
    {
        "head": "Scott Forstall",
        "head_type": "person",
        "relation": "worksFor",
        "tail": "Apple Inc.",
        "tail_type": "company"
    }
]
[
    {
        "head": "Tim Cook",
        "head_type": "person",
        "relation": "isFounderOf",
        "tail": "Apple",
        "tail_type": "company"
    },
    {
        "head": "Tim Cook",
        "head_type": "person",
        "relation": "worksFor",
        "tail": "Apple",
        "tail_type": "company"
    },
    {
        "head": "Tim Cook",
        "head_type": "person",
        "relation": "hasAward",
        "tail": "Canadian military historian",
        "tail_type": "

In [38]:
# Persist the collected triples; the file holds JSON despite its .txt extension.
with open("clean_data/clean_result.txt", "w") as out_file:
    out_file.write(json.dumps(result))

# Convert to Cypher Query

In [39]:
# Reload the triples saved by the extraction step.
with open("./clean_data/clean_result.txt", "r") as file:
    content = file.read()
# The file was written with json.dump, so it is decoded with json.loads.
# eval() would execute the file's contents as Python and breaks on JSON
# literals such as null, true and false.
entity_relations = json.loads(content)
print(entity_relations)

[{'head': 'Tim Cook', 'head_type': 'person', 'relation': 'worksFor', 'tail': 'Apple Inc.', 'tail_type': 'company'}, {'head': 'Steve Jobs', 'head_type': 'person', 'relation': 'isFounderOf', 'tail': 'Apple Inc.', 'tail_type': 'company'}, {'head': 'Scott Forstall', 'head_type': 'person', 'relation': 'worksFor', 'tail': 'Apple Inc.', 'tail_type': 'company'}, {'head': 'Tim Cook', 'head_type': 'person', 'relation': 'isFounderOf', 'tail': 'Apple', 'tail_type': 'company'}, {'head': 'Tim Cook', 'head_type': 'person', 'relation': 'worksFor', 'tail': 'Apple', 'tail_type': 'company'}, {'head': 'Tim Cook', 'head_type': 'person', 'relation': 'hasAward', 'tail': 'Canadian military historian', 'tail_type': 'award'}, {'head': 'Tim Cook', 'head_type': 'person', 'relation': 'hasCharacteristic', 'tail': 'leadership', 'tail_type': 'characteristic'}, {'head': 'Tim Cook', 'head_type': 'person', 'relation': 'hasCharacteristic', 'tail': 'background in operations', 'tail_type': 'characteristic'}, {'head': 'Tim 

In [40]:
# Tabulate the triples for inspection: one row per triple, one column per key
# (head, head_type, relation, tail, tail_type).
df = pd.DataFrame(entity_relations)
df

Unnamed: 0,head,head_type,relation,tail,tail_type
0,Tim Cook,person,worksFor,Apple Inc.,company
1,Steve Jobs,person,isFounderOf,Apple Inc.,company
2,Scott Forstall,person,worksFor,Apple Inc.,company
3,Tim Cook,person,isFounderOf,Apple,company
4,Tim Cook,person,worksFor,Apple,company
5,Tim Cook,person,hasAward,Canadian military historian,award
6,Tim Cook,person,hasCharacteristic,leadership,characteristic
7,Tim Cook,person,hasCharacteristic,background in operations,characteristic
8,Tim Cook,person,hasCharacteristic,accolades,characteristic
9,Tim Cook,person,hasProject,diversity and treatment of women in the workplace,project


In [81]:
# Gather every distinct (name, type) pair that occurs at either end of a triple.
unique_entities = {
    (item[side], item[f"{side}_type"])
    for item in entity_relations
    for side in ("head", "tail")
}

unique_entities_list = [*unique_entities]
print(unique_entities_list)

[('Steve Jobs', 'person'), ('background in operations', 'characteristic'), ('transition to new processors', 'project'), ('Tim Cook', 'person'), ('leadership', 'characteristic'), ('Apple Inc.', 'company'), ('Apple Maps', 'product'), ('diversity and treatment of women in the workplace', 'project'), ('mapping services', 'product'), ('ongoing efforts to enhance user experience and innovation', 'characteristic'), ('Canadian military historian', 'award'), ('accolades', 'characteristic'), ('development of iconic devices', 'project'), ('Apple', 'company'), ('technological advancements', 'characteristic'), ('improve mapping services', 'product'), ('Scott Forstall', 'person')]


In [82]:
# Write one MERGE statement per distinct entity:
#   MERGE (<variable>:<entity type> {id: "<entity name>"})
# Opened with "w" so every run starts from an empty file. In append mode a
# re-run repeats the node statements of earlier runs, which redeclares the
# same variables inside the single query that is later sent to Neo4j.
with open("cypher_query.txt", "w") as file:
    # Each item is a (name, type) pair, in that order.
    for name, entity_type in unique_entities_list:
        # Variable form of the name: spaces to underscores, hyphens and
        # apostrophes dropped, lower-cased. Named node_var rather than `id` so
        # the builtin is not shadowed for the rest of the notebook.
        # NOTE(review): other punctuation (e.g. the '.' in "Apple Inc.") is not
        # stripped and yields an invalid Cypher variable.
        node_var = name.replace(" ","_").replace("-","").replace("'","").lower()
        merge_statement = f"""MERGE ({node_var}:{entity_type} {{id: "{name}"}})\n"""
        file.write(merge_statement)

In [83]:
def _cypher_var(entity_name):
    """Return the variable form of an entity name: spaces become underscores,
    hyphens and apostrophes are dropped, and the result is lower-cased."""
    return entity_name.replace(" ","_").replace("-","").replace("'","").lower()


# Append one relationship MERGE per triple, connecting the head and tail variables.
with open("cypher_query.txt", "a") as query_file:
    for triple in entity_relations:
        source, target = (_cypher_var(triple[end]) for end in ("head", "tail"))
        query_file.write(f"MERGE ({source})-[:{triple['relation']}]->({target})\n")

In [84]:
# Neo4j connection settings for loading the generated graph.
# Each value can be overridden with an environment variable of the same name;
# the literals are kept only as fallbacks for the local development instance.
NEO4J_URL = os.environ.get("NEO4J_URL", "neo4j://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
# NOTE(review): set NEO4J_PASSWORD in the environment and drop this fallback
# before sharing the notebook.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "fireinthehole")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

graph = Neo4jGraph(url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE)

In [85]:
# Delete everything in a database
cypher = """
MATCH (n)
DETACH DELETE n
"""
graph.query(cypher)

graph.refresh_schema()
print(graph.schema)

Node properties:

Relationship properties:

The relationships:



In [86]:
# Read all generated MERGE statements and send them to Neo4j as one query string.
# NOTE(review): with an entity named "Apple Inc." the generated variable
# `apple_inc.` is not a valid Cypher identifier, so this call raises a syntax
# error until the file is corrected.
with open("cypher_query.txt", "r") as file:
    queries = file.read()
graph.query(queries)

ValueError: Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input '.': expected a graph pattern, a parameter, ')', ':', 'IS', 'WHERE' or '{' (line 6, column 17 (offset: 318))
"MERGE (apple_inc.:company {id: "Apple Inc."})"
                 ^}

In [87]:
# Replace string to fix
# Remove the dot from the `apple_inc.` variable, which is not a valid Cypher identifier.
# The dot is escaped so sed matches a literal '.'. Unescaped it matches any
# character (e.g. the ':' in `apple_inc:company`), which corrupts the file when
# this cell is run a second time.
!sed -i 's/apple_inc\./apple_inc/g' cypher_query.txt

In [88]:
# Re-read the corrected file and submit all MERGE statements as one query
# string; the call returns an empty list.
with open("cypher_query.txt", "r") as file:
    queries = file.read()
graph.query(queries)

[]

In [89]:
# Refresh the cached schema and print it to check which node labels and
# relationship types were created by the import.
graph.refresh_schema()
print(graph.schema)

Node properties:
person {id: STRING}
characteristic {id: STRING}
project {id: STRING}
company {id: STRING}
product {id: STRING}
award {id: STRING}
Relationship properties:

The relationships:
(:person)-[:isFounderOf]->(:company)
(:person)-[:hasProject]->(:project)
(:person)-[:hasCharacteristic]->(:characteristic)
(:person)-[:worksFor]->(:company)
(:person)-[:hasAward]->(:award)
(:company)-[:hasCharacteristic]->(:characteristic)
(:company)-[:hasProject]->(:characteristic)
(:company)-[:hasProject]->(:product)
(:company)-[:hasProject]->(:project)
(:product)-[:isProducedBy]->(:company)
(:product)-[:acquired]->(:product)
(:product)-[:acquired]->(:characteristic)
