# Diffbot Graph Transformer from LangChain

In [4]:
# Load the modified Diffbot Graph Transformer by executing its notebook in this kernel.
# NOTE(review): a later cell imports DiffbotGraphTransformer from
# langchain_experimental; if the notebook run here defines a class with the same
# name, a top-to-bottom run would rebind it to the library class — TODO confirm
# the intended cell order.
%run modified_diffbot_graph_transformer.ipynb

In [2]:
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
import os

# Prefer a key already exported as DIFFBOT_API_KEY so the real token never has to
# be written into the notebook; the placeholder is only used when it is unset.
diffbot_api_key = os.environ.get("DIFFBOT_API_KEY", "Diffbot-token")
# Export the key for the rest of the session: later cells call
# DiffbotGraphTransformer() with no arguments, presumably relying on this
# variable — TODO confirm.
os.environ["DIFFBOT_API_KEY"] = diffbot_api_key

diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)

In [7]:
import pandas as pd
# Load the speaker records from the CSV in the working directory.
# NOTE(review): the recorded head() output shows an 'Unnamed: 0' column, which
# suggests the file was saved together with its index — TODO confirm, and
# consider index_col=0 if so.
speaker_data = pd.read_csv('diffbot_txt_data.csv')

In [40]:
# Preview the first rows to check which columns were loaded
speaker_data.head()

Unnamed: 0,summary,image,link,title
0,With eight patents issued and receiving awards...,pioneering IBM’s early advances in Enterprise ...,https://datadaytexas.com/sites/default/files/i...,Plenary Keynote
1,". Mikiko has worked as an engineer, data scien...",https://datadaytexas.com/sites/default/files/i...,https://datadaytexas.com/2024/sessions#bazeley,MLOps Keynote
2,Susan Shu Chang (Linkedin) is currently Princi...,https://datadaytexas.com/sites/default/files/i...,https://datadaytexas.com/2024/sessions#chang,Machine Learning Keynote
3,"Jessica Talisman is a taxonomist, ontologist, ...",https://datadaytexas.com/sites/default/files/i...,https://datadaytexas.com/2024/sessions#talisman,Data Architecture Keynote
4,"One of our perennially requested speakers, \nJ...",https://datadaytexas.com/sites/default/files/i...,https://datadaytexas.com/2024/sessions#anderson,Data Engineering Keynote


In [8]:
# Bio text of Dr. Clair Sullivan, stored in the 'summary' column at row label 5
speaker_data.loc[5, 'summary']

"is currently the Founder and CEO of Clair Sullivan and Associates, a company dedicated to providing data science consulting services. Prior to starting her company, she was the Director of Data Science at Vail Resorts leading a team of data scientists and machine learning engineers providing production models for operations and marketing. Previously she was a data science advocate at Neo4j, working to expand the community of data scientists and machine learning engineers using graphs to solve challenging problems. She received her doctorate degree in nuclear engineering from the University of Michigan in 2002. After that, she began her career in nuclear emergency response at Los Alamos National Laboratory where her research involved signal processing of spectroscopic data. She spent 4 years working in the federal government on related subjects and returned to academic research in 2012 as an assistant professor in the Department of Nuclear, Plasma, and Radiological Engineering at the U

In [5]:
# Sample abstract sent to the Diffbot NLP endpoint. The literal is split across
# lines for readability only; the hyphenation artifacts (e.g. 'abil- ity') are
# kept verbatim so the submitted text is unchanged.
text = (
    'While recent language models have the abil- ity to take long contexts as input, '
    'relatively little is known about how well they use longer context. '
    'We analyze the performance of language models on two tasks that require '
    'identifying relevant information in their in- put contexts: '
    'multi-document question an- swering and key-value retrieval. '
    'We find that performance can degrade significantly when changing the position '
    'of relevant informa- tion, indicating that current language models do not '
    'robustly make use of information in long input contexts. '
    'In particular, we observe that performance is often highest when rele- vant '
    'information occurs at the beginning or end of the input context, and '
    'significantly degrades when models must access relevant information in the '
    'middle of long contexts, even for explicitly long-context models. '
    'Our analysis provides a better understanding of how language models use their '
    'input context and provides new evaluation protocols for future long-context '
    'language models.'
)
result = diffbot_nlp.nlp_request(text)

In [10]:
# Check what kind of object nlp_request returned
type(result)

dict

In [7]:
# Show the raw response payload for the sample text
result

{'facts': []}

In [15]:
# Re-create the transformer; with no arguments it presumably falls back to the
# DIFFBOT_API_KEY environment variable set earlier — TODO confirm.
diffbot_nlp = DiffbotGraphTransformer()
# Request the bio explicitly in this cell. The graph displayed afterwards is
# built from Dr. Clair Sullivan's bio (row 5), whereas `result` from the
# sample-text cell holds no facts, so reusing it on a top-to-bottom run would
# not reproduce that graph.
bio_result = diffbot_nlp.nlp_request(speaker_data['summary'][5])
speaker = diffbot_nlp.process_response(bio_result)

In [16]:
# Display the graph document produced by process_response
speaker

GraphDocument(nodes=[Node(id='Sullivan', type='Person', properties={'name': 'Sullivan', 'academicDegree': 'doctorate', 'positionHeld': 'machine learning engineer'}), Node(id='University of Illinois at Urbana-Champaign', type='Organization', properties={'name': 'University of Illinois at Urbana-Champaign'}), Node(id='La Neige Analytics', type='Organization', properties={'name': 'La Neige Analytics'}), Node(id='ski industry', type='Skill', properties={'name': 'ski industry'}), Node(id='http://www.wikidata.org/entity/Q83504', type='Skill', properties={'name': 'nuclear engineering'}), Node(id='DARPA Young Faculty Award', type='Award', properties={'name': 'DARPA Young Faculty Award'}), Node(id='Department of Nuclear, Plasma, and Radiological Engineering', type='Organization', properties={'name': 'Department of Nuclear, Plasma, and Radiological Engineering'}), Node(id='Clair Sullivan and Associates', type='Organization', properties={'name': 'Clair Sullivan and Associates'}), Node(id='data sc

In [18]:
# List the extracted nodes (entities) of the graph document
speaker.nodes

[Node(id='Sullivan', type='Person', properties={'name': 'Sullivan', 'academicDegree': 'doctorate', 'positionHeld': 'machine learning engineer'}),
 Node(id='University of Illinois at Urbana-Champaign', type='Organization', properties={'name': 'University of Illinois at Urbana-Champaign'}),
 Node(id='La Neige Analytics', type='Organization', properties={'name': 'La Neige Analytics'}),
 Node(id='ski industry', type='Skill', properties={'name': 'ski industry'}),
 Node(id='http://www.wikidata.org/entity/Q83504', type='Skill', properties={'name': 'nuclear engineering'}),
 Node(id='DARPA Young Faculty Award', type='Award', properties={'name': 'DARPA Young Faculty Award'}),
 Node(id='Department of Nuclear, Plasma, and Radiological Engineering', type='Organization', properties={'name': 'Department of Nuclear, Plasma, and Radiological Engineering'}),
 Node(id='Clair Sullivan and Associates', type='Organization', properties={'name': 'Clair Sullivan and Associates'}),
 Node(id='data science consul

In [19]:
# List the extracted relationships between the nodes
speaker.relationships

[Relationship(source=Node(id='Sullivan', type='Person'), target=Node(id='University of Illinois at Urbana-Champaign', type='Organization'), type='EMPLOYEE_OR_MEMBER_OF', properties={'evidence': 'She spent 4 years working in the federal government on related subjects and returned to academic research in 2012 as an assistant professor in the Department of Nuclear, Plasma, and Radiological Engineering at the University of Illinois at Urbana-Champaign.', 'isNotCurrent': 'true'}),
 Relationship(source=Node(id='La Neige Analytics', type='Organization'), target=Node(id='ski industry', type='Skill'), type='INDUSTRY', properties={'evidence': 'Additionally, she founded a company, La Neige Analytics, whose purpose is to provide data science expertise to the ski industry.'}),
 Relationship(source=Node(id='Sullivan', type='Person'), target=Node(id='http://www.wikidata.org/entity/Q83504', type='Skill'), type='FIELD_OF_WORK', properties={'evidence': 'She received her doctorate degree in nuclear engin

# Load the Knowledge Graph Constructed with Diffbot into a Neo4j Graph Database

In [14]:
from langchain_community.graphs import Neo4jGraph

# Read the connection settings from the environment when available so real
# credentials are not stored in the notebook; the original placeholders remain
# as defaults when a variable is unset.
url = os.environ.get("NEO4J_URI", "url")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "your-instance-password")

# Open the connection used to store the extracted graph documents
graph = Neo4jGraph(url=url, username=username, password=password)


In [61]:
# Extract a graph document from every speaker summary and store it in Neo4j.
diffbot_nlp = DiffbotGraphTransformer()
# Iterate over the values with a positional counter instead of looking up
# speaker_data['summary'][i]: that lookup is label-based and only works when
# the DataFrame carries a default 0..n-1 integer index.
for i, summary in enumerate(speaker_data['summary']):
    result = diffbot_nlp.nlp_request(summary)
    speaker = diffbot_nlp.process_response(result)
    # NOTE(review): LangChain documents add_graph_documents as taking a list of
    # GraphDocument objects — confirm a single document is accepted here.
    graph.add_graph_documents(speaker)
    print(i) # ensure no index is skipped
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
