In [1]:
import sys, os
import json

# sys.version, sys.path

from gpt_index import GPTSimpleVectorIndex
from gpt_index import SimpleDirectoryReader
from IPython.display import Markdown, display

In [2]:
def load_json(path):
    """Read the JSON file at ``path`` and return the parsed object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def get_auth(path=''):
    """Return the parsed credentials JSON.

    When ``path`` is empty (or otherwise falsy) the default location
    ``./config/auth.json`` is used. The file is read via ``load_json``.
    """
    return load_json(path or './config/auth.json')

In [4]:
auth = get_auth()

In [5]:
# Reuse the credentials already loaded into `auth` by the previous cell
# instead of reading the auth file from disk a second time.
os.environ['OPENAI_API_KEY'] = auth['token']

# index

## create index

In [6]:
documents = SimpleDirectoryReader('data').load_data()

In [7]:
type(documents), len(documents)

(list, 6)

In [8]:
# Build a vector index from `documents` and persist it to disk.
# NOTE(review): the later cells load './index_file/index_my.json' and
# './index_file/index.json', not the file written here — confirm which saved
# index the queries further down are meant to run against.
index = GPTSimpleVectorIndex(documents)
index.save_to_disk('index_file/the_merge.json')

> Adding chunk: There are two types of Ethereum nodes: nodes th...
> Adding chunk: Gas fees are a product of network demand relati...
> Adding chunk: A transaction's "speed" can be measured in a fe...
> Adding chunk: Staked ETH and staking rewards continue to be l...
> Adding chunk: This may seem counterintuitive to the above not...
> Adding chunk: After the Shanghai upgrade enables withdrawals,...
> [build_index_from_documents] Total LLM token usage: 0 tokens
> [build_index_from_documents] Total embedding token usage: 1244 tokens


## load index

In [8]:
index = GPTSimpleVectorIndex.load_from_disk('./index_file/index_my.json')

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

# predict

In [16]:
len(documents[0].text)

9494

In [13]:
len(index.index_struct.nodes_dict)

5

In [61]:
from langchain import OpenAI
from gpt_index import LLMPredictor

In [78]:
# Predictor wrapping text-davinci-003 (the `_high` name suggests the
# higher-quality tier — used for the "davinci" section).
llm_predictor_high = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003"))
# Predictor for the smaller-model comparison. To reproduce the "ada" and
# "babbage" sections, change model_name to "text-ada-001" or
# "text-babbage-001" and re-run this cell and the index-loading cell.
llm_predictor_cheep = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-curie-001"))

In [79]:
index = GPTSimpleVectorIndex.load_from_disk('./index_file/index.json', llm_predictor=llm_predictor_cheep)

In [80]:
index.llm_predictor.get_llm_metadata()

LLMMetadata(max_input_size=2048, num_output=256)

In [17]:
import datetime

In [18]:
n = datetime.datetime.now()

In [20]:
n.strftime('%Y%m%d %H%M%S')

'20230208 035521'

### curie

In [81]:
index._llm_predictor._llm.model_name

'text-curie-001'

In [82]:
response = index.query("What did the author do growing up?")
print(response)

> [query] Total LLM token usage: 4221 tokens
> [query] Total embedding token usage: 8 tokens



The author attended RISD in the US and then applied to the Accademia di Belli Arti in Florence, Italy. The Accademia accepted the author, and they never heard back from RISD. The author then attended the Accademia di Belli Arti in Florence, Italy and passed the entrance exam.


### babbage

In [75]:
index._llm_predictor._llm.model_name

'text-babbage-001'

In [76]:
response = index.query("What did the author do growing up?")
print(response)

> [query] Total LLM token usage: 4430 tokens
> [query] Total embedding token usage: 8 tokens


The author was accepted to the Accademia di Belli Arti in Florence, which is where they learned about art and applied it to their own work.


### ada

In [68]:
index._llm_predictor._llm.model_name

'text-ada-001'

In [69]:
response = index.query("What did the author do growing up?")
print(response)

> [query] Total LLM token usage: 4059 tokens
> [query] Total embedding token usage: 8 tokens


I never heard back from the Accademia, so off to Providence I went.


In [71]:
response.source_nodes[0].node_info

{'start': 0, 'end': 14461}

In [46]:
n_info = response.source_nodes[0].node_info

In [47]:
# Show the text span behind the first source node: fetch the document
# referenced by the node's doc_id from the index docstore, then slice its text
# with the 'start'/'end' offsets stored in `n_info` (set in the previous cell).
index.docstore.get_document(response.source_nodes[0].doc_id).text[n_info['start']: n_info['end']]

'\t\t\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran. You had to type programs on punch cards, th

### davinci

In [None]:
response = index.query("What did the author do growing up?")
print(response)

Token indices sequence length is longer than the specified maximum sequence length for this model (3221 > 1024). Running this sequence through the model will result in indexing errors


> [query] Total LLM token usage: 3841 tokens
> [query] Total embedding token usage: 8 tokens


The author grew up writing short stories, programming on an IBM 1401, and building a computer kit from Heathkit. He also wrote simple games, a program to predict how high his model rockets would fly, and a word processor. He studied philosophy in college, but switched to AI and taught himself Lisp. He wrote a book about Lisp hacking and reverse-engineered SHRDLU. He also took art classes at Harvard and applied to art schools, but was disappointed by the lack of teaching and learning in the painting department at the Accademia. He also had experience with 19th century studio painting conventions, such as having a little stove fed with kindling and a nude model sitting as close to it as possible.


In [None]:
response = index.query("When does the author's mother die?")
print(response)

> [query] Total LLM token usage: 3583 tokens
> [query] Total embedding token usage: 8 tokens


The author's mother does not die in the context information provided.


In [None]:
response = index.query("Who dead in 2014?")
print(response)

> [query] Total LLM token usage: 1422 tokens
> [query] Total embedding token usage: 5 tokens

No one is mentioned as having died in 2014 in the given context information.


In [None]:
response = index.query("What is YC?")
print(response)

> [query] Total LLM token usage: 3796 tokens
> [query] Total embedding token usage: 5 tokens


YC is a startup accelerator founded by Paul Graham, Robert Morris, Jessica Livingston, and Trevor Blackwell in 2005. It provides seed funding, advice, and connections to help startups get their initial set of customers almost entirely from among their batchmates, with the goal of helping them succeed in an increasingly competitive market where Moore's Law has made it difficult for many companies to survive. YC's signature style of investing has made it a popular choice for entrepreneurs looking to make their mark in the art world, where money and coolness are tightly coupled. YC also offers rent-stabilized apartments to its members, providing them with a cost-effective way to live and work in the city. Finally, YC's accelerator program is designed to help startups launch their software quickly and efficiently.


In [None]:
response.get_formatted_sources()

'> Source (Doc id: d0867305-c29f-475a-ab5d-60de01ae6f1c): get their initial set of customers almost entirely from among their batchmates.\n\nI had not origin...'

In [None]:
len(response.source_nodes)

1

In [None]:
response.source_nodes

[SourceNode(source_text='get their initial set of customers almost entirely from among their batchmates.\n\nI had not originally intended YC to be a full-time job. I was going to do three things: hack, write essays, and work on YC. As YC grew, and I grew more excited about it, it started to take up a lot more than a third of my attention. But for the first few years I was still able to work on other things.\n\nIn the summer of 2006, Robert and I started working on a new version of Arc. This one was reasonably fast, because it was compiled into Scheme. To test this new Arc, I wrote Hacker News in it. It was originally meant to be a news aggregator for startup founders and was called Startup News, but after a few months I got tired of reading about nothing but startups. Plus it wasn\'t startup founders we wanted to reach. It was future startup founders. So I changed the name to Hacker News and the topic to whatever engaged one\'s intellectual curiosity.\n\nHN was no doubt good for YC, bu

In [None]:
response = index.query("What did the author do growing up?", verbose=True)
print(response)

> Top 1 nodes:
> [Node 12d686d7-d58d-49c4-8c42-a7eaa8b8a8e0] [Similarity score:                     0.819917] 		

What I Worked On

February 2021

Before college the two main things I worked on, outside of s...
> Searching in chunk: 		

What I Worked On

February 2021

Before col...
> Initial response: 
The author grew up writing short stories, programming on an IBM 1401, and building a computer kit with a friend. He also wrote simple games, a program to predict how high his model rockets would fly, and a word processor. He studied philosophy in college, but switched to AI and taught himself Lisp. He wrote a book about Lisp hacking and reverse-engineered SHRDLU. He also took art classes at Harvard and applied to art schools.
> Refine context: limited vocabulary. [2]

I'm only up to age 25 ...
> Refined response: 

The author grew up writing short stories, programming on an IBM 1401, and building a computer kit with a friend. He also wrote simple games, a program to predict how high his

In [None]:
??index

In [None]:
index._embed_model

<gpt_index.embeddings.openai.OpenAIEmbedding at 0x7f10efabda90>

In [18]:
a = '''
as doing great. But if there was one thing rarer than Rtm offering advice, it was Rtm being wrong. So this set me thinking. It was true that on my current trajectory, YC would be the last thing I did, because it was only taking up more of my attention. It had already eaten Arc, and was in the process of eating essays too. Either YC was my life\'s work or I\'d have to leave eventually. And it wasn\'t, so I would.\n\nIn the summer of 2012 my mother had a stroke, and the cause turned out to be a blood clot caused by colon cancer. The stroke destroyed her balance, and she was put in a nursing home, but she really wanted to get out of it and back to her house, and my sister and I were determined to help her do it. I used to fly up to Oregon to visit her regularly, and I had a lot of time to think on those flights. On one of them I realized I was ready to hand YC over to someone else.\n\nI asked Jessica if she wanted to be president, but she didn\'t, so we decided we\'d try to recruit Sam Altman. We talked to Robert and Trevor and we agreed to make it a complete changing of the guard. Up till that point YC had been controlled by the original LLC we four had started. But we wanted YC to last for a long time, and to do that it couldn\'t be controlled by the founders. So if Sam said yes, we\'d let him reorganize YC. Robert and I would retire, and Jessica and Trevor would become ordinary partners.\n\nWhen we asked Sam if he wanted to be president of YC, initially he said no. He wanted to start a startup to make nuclear reactors. But I kept at it, and in October 2013 he finally agreed. We decided he\'d take over starting with the winter 2014 batch. For the rest of 2013 I left running YC more and more to Sam, partly so he could learn the job, and partly because I was focused on my mother, whose cancer had returned.\n\nShe died on January 15, 2014. We knew this was coming, but it was still hard when it did.
'''

In [20]:
print(a)


as doing great. But if there was one thing rarer than Rtm offering advice, it was Rtm being wrong. So this set me thinking. It was true that on my current trajectory, YC would be the last thing I did, because it was only taking up more of my attention. It had already eaten Arc, and was in the process of eating essays too. Either YC was my life's work or I'd have to leave eventually. And it wasn't, so I would.

In the summer of 2012 my mother had a stroke, and the cause turned out to be a blood clot caused by colon cancer. The stroke destroyed her balance, and she was put in a nursing home, but she really wanted to get out of it and back to her house, and my sister and I were determined to help her do it. I used to fly up to Oregon to visit her regularly, and I had a lot of time to think on those flights. On one of them I realized I was ready to hand YC over to someone else.

I asked Jessica if she wanted to be president, but she didn't, so we decided we'd try to recruit Sam Altman. We