In [2]:
# ! pip install openai
# ! pip install google
# ! pip install llama-index-readers-web
# ! pip install llama-index-llms-openai
# ! pip install llama-index-program-openai
# ! pip install llama-index-llms-llama-api
# ! pip install llama-index-embeddings-openai

Collecting openai
  Downloading openai-1.35.7-py3-none-any.whl (327 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.5/327.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [1]:
import os
from openai import OpenAI as open_ai
import re
import numpy as np
from googlesearch import search
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core import SummaryIndex, Document
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.llms.llama_api import LlamaAPI
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.embeddings.openai import OpenAIEmbedding

In [2]:
# Never store a real key in the notebook: reuse the key already exported in the
# environment and only prompt (without echoing) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
  from getpass import getpass  # local import: only needed for the interactive fallback

  os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Built without arguments, so the key is expected to come from OPENAI_API_KEY.
client = open_ai()

In [3]:
# Description of the overall goal. Defined once and reused as the closing
# paragraph of the dork-generation prompt below.
# TODO: add osint flowchart
setup_string_two = """
The end goal is to create an OSINT (open source intelligence) report about
the given subject. For example, if the user wants to find more about a person,
you would help them find out about their location, age, education, associated
institutions, relevant connections, and so forth.
"""

# System prompt asking the model for Google Dork queries as a bullet list.
# The leading newline of `setup_string_two` supplies the blank line between
# the two paragraphs.
# TODO: add osint flowchart
setup_string = """
Your job is to return Google search queries, specifically Google Dorks, to find more
about a subject given a dumped text file of all of the information we currently know
about the subject. Not all of this information should be used in every query.
You can output up to twenty queries, and they should be formatted in a bullet point list
format, where every bullet point contains only the Google Dork search query and nothing
else (without quotes).
""" + setup_string_two

In [4]:
# Seed description of the subject to investigate. Free text works; a bullet list
# of already-known facts (schools, affiliations, co-authors, ...) may be appended.
prompt = (
    "\n"
    "I want to find out more about the person Ilya Sutskever. He is related to OpenAI and is a Machine Learning researcher."
    "\n"
)

In [5]:
# Register the LLM and the embedding model on llama-index's global Settings object.
Settings.llm = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    model="gpt-4o",
)
Settings.embed_model = OpenAIEmbedding(
    api_key=os.environ["OPENAI_API_KEY"],
    model="text-embedding-3-large",
    embed_batch_size=100,
)

In [6]:
def _parse_queries(raw_text):
  """Turn the model's bullet-list reply into a list of non-empty search queries."""
  queries = []
  for line in raw_text.splitlines():
    # Drop a leading bullet ("- ", "* ", "• ") or list number ("1. ", "2) "). The marker
    # must be followed by whitespace so a dork that starts with an operator such as
    # "-site:example.com" keeps its leading "-".
    query = re.sub(r'^\s*(?:[-*•]|\d+[.)])\s+', '', line)
    # Also discard Markdown code-fence/backtick wrapping around the query.
    query = query.strip().strip('`').strip()
    if query:
      queries.append(query)
  return queries


def _clean_text(text):
  """Apply the whitespace clean-up substitutions to one page's text, in order."""
  # NOTE(review): r'\\s+' and r'\\n' match a literal backslash followed by "s"/"n"
  # (escaped sequences inside the page text), not real whitespace — confirm intended.
  text = re.sub(r'\\s+', ' ', text)
  text = re.sub(r' +', ' ', text)
  text = re.sub(r'\\n', '\n', text)
  text = re.sub(r'\n +', '\n', text)
  text = re.sub(r'\n+', '\n', text)
  return text


def iteration(prompt, results_per_query=5, top_k=10, min_node_len=0):
  """Run one enrichment pass: generate dorks, search, download, index and retrieve.

  Args:
    prompt: text describing the subject and what is already known about it.
    results_per_query: number of search results requested for each generated query.
    top_k: number of nodes retrieved from the vector index.
    min_node_len: nodes whose text is not longer than this many characters are dropped.

  Returns:
    A tuple `(result, nodes)`: `result` is `prompt` followed by the text of the
    retrieved nodes (joined by blank lines), `nodes` is the list of all parsed nodes.
    When no nodes could be produced, `result` is `prompt` unchanged.
  """
  # Find dorks
  completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {"role": "system", "content": setup_string},
      {"role": "user", "content": prompt}
    ],
  )

  # `or ""` guards against a reply without text content.
  search_list = _parse_queries(completion.choices[0].message.content or "")
  print(search_list)

  # Collect result URLs; different queries often return the same page, so keep the
  # first occurrence only to avoid downloading and indexing it several times.
  urls = []
  seen_urls = set()
  for query in search_list:
    for url in search(query, tld="co.in", num=results_per_query, stop=results_per_query, pause=2):
      print(url)
      if url not in seen_urls:
        seen_urls.add(url)
        urls.append(url)

  # Kept from the original: seeds NumPy's global RNG. Nothing in this function
  # draws random numbers directly.
  np.random.seed(0)
  documents = SimpleWebPageReader().load_data(urls)
  print(len(documents))
  for document in documents:
    document.text = _clean_text(document.text)

  # Split each page into nodes built from its <p> elements.
  parser = HTMLNodeParser(
      tags=["p"],
      chunk_lines=100,
      chunk_lines_overlap=50,
      )

  nodes = parser.get_nodes_from_documents(documents)
  nodes = [node for node in nodes if len(node.text) > min_node_len]

  for node in nodes:
    print(node.text)
    print(node.metadata)

  if not nodes:
    # Nothing usable was found, so there is nothing to index or retrieve.
    return prompt, nodes

  index = VectorStoreIndex(nodes=nodes)

  retriever = VectorIndexRetriever(
      index=index,
      similarity_top_k=top_k,
  )

  relevant_nodes = retriever.retrieve("System Role: " + setup_string_two + "\n\n\n" + "User Query: " + prompt)
  added_information = "\n\n\n".join([node.text for node in relevant_nodes])
  print("THE RELEVANT NODES: ")
  print(added_information)

  result = prompt + added_information

  return result, nodes

In [7]:
# Run one enrichment pass: `result` is the prompt plus retrieved text, `nodes` the parsed page nodes.
result, nodes = iteration(prompt)

['"Ilya Sutskever" site:linkedin.com', '"Ilya Sutskever" site:twitter.com', '"Ilya Sutskever" site:facebook.com', '"Ilya Sutskever" site:researchgate.net', '"Ilya Sutskever" site:scholar.google.com', '"Ilya Sutskever" site:openai.com', '"Ilya Sutskever" site:medium.com', '"Ilya Sutskever" site:github.com', '"Ilya Sutskever" site:arxiv.org', '"Ilya Sutskever" AND OpenAI', '"Ilya Sutskever" AND "machine learning"', '"Ilya Sutskever" AND "deep learning"', '"Ilya Sutskever" AND "artificial intelligence"', '"Ilya Sutskever" AND "Neural Networks"', '"Ilya Sutskever" AND "PhD"', '"Ilya Sutskever" AND "education"', '"Ilya Sutskever" AND "biography"', '"Ilya Sutskever" AND "contact information"', '"Ilya Sutskever" AND "Google Scholar"', '"Ilya Sutskever" AND "publications"']
https://www.linkedin.com/in/ilya-sutskever
https://www.linkedin.com/posts/ristouuk_so-ilya-sutskever-a-co-founder-and-former-activity-7212032000373702657-6uz2
https://www.linkedin.com/posts/shellypalmer_ilya-sutskever-opena

In [8]:
# Show the enriched prompt (original prompt followed by the retrieved text).
print(result)


I want to find out more about the person Ilya Sutskever. He is related to OpenAI and is a Machine Learning researcher.
SarkariExam.com

News: In the dynamic realm of artificial intelligence (AI), Ilya Sutskever, Co-founder, and Chief Scientist at OpenAI, has garnered attention for his remarkable journey. His visionary outlook, diverse linguistic background, and substantial contributions have propelled him into the spotlight within the AI community.
Ilya Sutskever, born in 1985 in Soviet Russia, spent his early years in Israel before eventually making Canada his home. Although details about his personal life remain relatively scarce, his professional achievements have been the focus of considerable attention. Sutskever is a distinguished computer scientist specializing in machine learning, and he holds citizenships in Russia, Israel, and Canada.
Sutskever’s educational journey commenced at the Open University of Israel, where he pursued studies from 2000 to 2002. Subsequently, he reloc

In [9]:
def generate(prompt, sinput, model="gpt-4o", temperature=0.0, client=None):
  """Send one system + user message pair to a chat model and return the reply text.

  Args:
    prompt: content of the user message.
    sinput: content of the system message.
    model: chat model name passed to the API.
    temperature: sampling temperature passed to the API.
    client: object exposing `chat.completions.create(...)`. When None, a new
      `open_ai()` client is created for this call, as before.

  Returns:
    The `content` of the first choice's message in the completion.
  """
  if client is None:
    client = open_ai()

  completion = client.chat.completions.create(
      model=model,
      messages=[
          {"role": "system", "content": sinput},
          {"role": "user", "content": prompt}
      ],
      temperature=temperature,
  )

  return completion.choices[0].message.content

In [10]:
# System prompt for the connection-extraction step; it is sent to the model verbatim.
# NOTE(review): the prompt shows two different separators for a connection
# (`"Input name".   "Output name"` vs. `Jonathan Iverson ; Benjamin Doverson`) and asks
# for a "number of nodes" although n * (n - 1) / 2 is the number of pairs. Consider
# tightening the wording; the text is left unchanged here because changing it changes
# the model's output format.
system_input = '''You are an expert at finding connections between people. The input will be a paragraph from an online article or a description of a person and their
connections with other people which was found through Google. Given common traits such research projects worked on together, similar research interests, similar education,
such as school or research group both people are in, etc., give me connections between people in this format:
First output all of the names found in the text inputted.
1. <NAME>
- Description of person with name <NAME>
Then follow the format below to show connections between the people in the input text:
"Input name".   "Output name"
This is an example of how the format:
```Jonathan Iverson ; Benjamin Doverson```
- description of relationship
The number of nodes should be (number of people * (number of people - 1)) / 2
'''

# Ask the model for the names and pairwise connections in the enriched prompt.
edges = generate(result, system_input)
print(edges)

### Names Found in the Text:
1. Ilya Sutskever
2. Geoffrey Hinton
3. Andrew Ng
4. Alex Krizhevsky
5. Elon Musk
6. Sam Altman
7. Greg Brockman
8. Daniel Gross
9. Daniel Levy
10. Oriol Vinyals
11. Quoc Viet Le
12. Jan Leike

### Descriptions of People:
1. **Ilya Sutskever**
   - Co-founder and Chief Scientist at OpenAI, notable for his work in machine learning and AI. He co-invented AlexNet and has collaborated with prominent figures like Geoffrey Hinton and Andrew Ng. He has also been involved in the development of GPT models and TensorFlow.

2. **Geoffrey Hinton**
   - A prominent figure in the AI domain, known for his work in deep learning. He was Ilya Sutskever's doctoral supervisor and collaborated with him on AlexNet.

3. **Andrew Ng**
   - A luminary in the AI field who mentored Ilya Sutskever during his postdoctoral position at Stanford University.

4. **Alex Krizhevsky**
   - Collaborated with Ilya Sutskever and Geoffrey Hinton on the development of AlexNet.

5. **Elon Musk**
  