In [13]:
import os 
import sys
import csv

from langchain.llms import OpenAI
# from langchain.chat_models import ChatOpenAI
# from langchain.agents.agent_types import AgentType

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

csv.field_size_limit(sys.maxsize)

131072

In [73]:
def load_api_key(file_path):
    with open(file_path, 'r') as file:
        api_key = file.read().strip()
    #set as environ variable 
    os.environ['OPENAI_API_KEY'] = api_key
    print('Loaded the key to environment successfully')
    # return api_key

load_api_key('openai_api_key.csv')

Loaded the key to environment successfully


In [11]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)
from langchain.document_loaders.csv_loader import CSVLoader

## Load Data

In [62]:
loader = CSVLoader(
                file_path='data/DAO_db.csv', 
                csv_args={
                      # "delimiter":",",
                      # "fieldnames":['has_tokens'],
                      },
                   source_column="updated_at"
    
                  )
data = loader.load()

## TODO
Create a parser to convert CSV row to a doc. Extract following fields - 
1. date of creation / updation 
2. name
3. description
4. any other meta ?

In [67]:
import json 
import ast 

# ast.literal_eval(data[0].page_content)
data[1].metadata

{'source': '2023-06-13T11:09:57.805Z', 'row': 1}

In [38]:
import pandas as pd 

df = pd.read_csv("data/DAO_db.csv")

In [81]:
# df['description']
# df[:100]

In [69]:
import pandas as pd
from llama_index.query_engine import PandasQueryEngine

In [75]:
query_engine = PandasQueryEngine(df=df[:10], verbose=True)

In [79]:
response = query_engine.query(
    "Which dao is related to governance improvement?",
)

> Pandas Instructions:
```

df[df['governance'].str.contains('improvement')]['_id']
```
> Pandas Output: Series([], Name: _id, dtype: object)


In [78]:
df[:10]['votes']

0      NaN
1      2.0
2      7.0
3      NaN
4     90.0
5    152.0
6     54.0
7      NaN
8      NaN
9    347.0
Name: votes, dtype: float64

## Convert data into chunks

In [None]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
pages = text_splitter.split_text(data[5].page_content)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.create_documents(pages)

len(texts)

In [None]:
texts

## Embedd the documents in the vectorstor 

In [None]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model="text-embedding-ada-002")
vectorstore = Chroma.from_documents(data,embedding=embeddings, persist_directory="./chroma_db_pdf")

In [None]:
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":2})

# create a chain to answer questions 
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=OPENAI_API_KEY), 
    chain_type="stuff", 
    retriever=retriever, 
    return_source_documents=True,
    verbose=True)

In [None]:
def print_res(res):
    print(res['result'])
    print("Docs::")
    for i, doc in enumerate(res["source_documents"]):
        print(f'{i}')
        print(doc.page_content)

In [None]:
query = "What all are covered in hospitalization expenses?"
result = qa({"query": query})

print_res(result)

In [None]:
result

In [None]:
query = "How to use neural networks to train Alexnet?"
result_02 = qa({"query": query})


## Create custom document

A custom document class is defined as 

```python
document = Document(
    text='text', 
    metadata={
        'filename': '<doc_file_name>', 
        'category': '<category>'
    }
)
```

In [86]:
filtered_df = df[['_id','created_at','name','description']]

In [121]:
def cleanup(s):
    if isinstance(s,int) or isinstance(s,float): 
        return False 
    
    s = s.replace('DeepDAO research coming soon','')
    
    return len(s)>0


filtered_df['valid'] = filtered_df['description'].apply(cleanup)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['valid'] = filtered_df['description'].apply(cleanup)


In [124]:
filtered_df = filtered_df[filtered_df['valid']]

In [85]:
from llama_index import ListIndex, Document

In [138]:
index = ListIndex([])
# text_chunks = 

doc_chunks = []
for i, row in filtered_df[filtered_df['description'].notnull()].iterrows():
    
    doc = Document(
        page_content=row['description'], 
        metadata= {
            'id_':row['_id'], 
            'created_at':row['created_at'],
            'name':str(row['name']).lower()
            }
        )
    doc_chunks.append(doc)

# # insert
# for doc_chunk in doc_chunks:
#     index.insert(doc_chunk)

In [101]:
from llama_index.node_parser import SimpleNodeParser

In [102]:
parser = SimpleNodeParser()

nodes = parser.get_nodes_from_documents(doc_chunks)

In [106]:
for i in range(10):
    print(nodes[i].text)

10,000 Zoo Gang on the Ethereum blockchain. With 20% of the Mint going to towards investing in Blue Chip NFTs
DeepDAO research coming soon
New Decentralized Hedge Fund
NFTorbit.io is NFT marketplace on binance smart chain, Govern by ORBI token for DAO
DeepDAO research coming soon
DeepDAO research coming soon
DeepDAO research coming soon
DeepDAO research coming soon
DeepDAO research coming soon
DeepDAO research coming soon


In [109]:
# from llama_index import LLMPredictor, VectorStoreIndex, ServiceContext
# from langchain import OpenAI


# # define LLM
# llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003"))

# # configure service context
# service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

# # build index
# # index = VectorStoreIndex.from_documents(
# #     documents, service_context=service_context
# # )

## Create the embedding and service context

In [133]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

from llama_index import LangchainEmbedding, VectorStoreIndex, ServiceContext

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load in HF embedding model from langchain
embed_model = LangchainEmbedding(embedding_function)
service_context = ServiceContext.from_defaults(embed_model=embed_model )

In [134]:
# os.environ['OPENAI_API_KEY'] = "random"
# SentenceTransformerEmbeddings

langchain.embeddings.huggingface.HuggingFaceEmbeddings

In [None]:
new_index = ListIndex.from_documents(doc_chunks)

# query with embed_model specified
query_engine = new_index.as_query_engine(
    retriever_mode="embedding", 
    verbose=True, 
    service_context=service_context
)

In [None]:
response = query_engine.query("Which DAO is related to finance?")

In [141]:
from langchain.document_loaders import DataFrameLoader

loader = DataFrameLoader(filtered_df, page_content_column="description")

data = loader.load()

In [142]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

# load the document and split it into chunks

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(data, embedding_function)

# retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":2})

# create a chain to answer questions 
# qa = RetrievalQA.from_chain_type(
#     llm=OpenAI(openai_api_key=OPENAI_API_KEY), chain_type="stuff", retriever=retriever, return_source_documents=True, verbose=True)

In [144]:
# query it
query = "Zoo Gang"
resp = vectorstore.similarity_search(query)

resp
# print results
# print(resp[0].page_content)

[Document(page_content='BROWN', metadata={'_id': 'H4U7lDU6QEW/UdVpzcM8fQ==', 'created_at': '2023-06-13T11:10:01.138Z', 'name': 'Matt', 'valid': True}),
 Document(page_content='About Junjie', metadata={'_id': 'HDGZodcPRTuKD7Ia4C9IcQ==', 'created_at': '2023-06-13T11:10:51.572Z', 'name': "Junjie's DAO", 'valid': True}),
 Document(page_content='about shinj', metadata={'_id': 'WGLPp8cPSDypiadhkdmwSQ==', 'created_at': '2023-06-13T11:11:02.977Z', 'name': 'shinj project', 'valid': True}),
 Document(page_content='Primo Kimchi governance', metadata={'_id': '56E5QoweSKGh4ZMTHawIbw==', 'created_at': '2023-06-13T11:11:05.360Z', 'name': 'Kimchi Premium', 'valid': True})]

In [145]:
data[0]

Document(page_content='10,000 Zoo Gang on the Ethereum blockchain. With 20% of the Mint going to towards investing in Blue Chip NFTs', metadata={'_id': 'i5F0UWsNT9+dEHOQvHBlgw==', 'created_at': '2023-06-13T11:09:57.803Z', 'name': 'ZooGang Dao', 'valid': True})

In [150]:
vectorstore.search

AttributeError: 'Chroma' object has no attribute 'query'