In [None]:
!pip install --upgrade langchain deeplake openai tiktoken

In [1]:
import os
from langchain.document_loaders import TextLoader

root_dir = './langchain/langchain/'
docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        try: 
            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception as e: 
            pass

print(len(docs))

944


In [2]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

print(len(texts))

Created a chunk of size 1620, which is longer than the specified 1000
Created a chunk of size 1273, which is longer than the specified 1000
Created a chunk of size 1452, which is longer than the specified 1000
Created a chunk of size 1213, which is longer than the specified 1000
Created a chunk of size 1263, which is longer than the specified 1000
Created a chunk of size 1197, which is longer than the specified 1000
Created a chunk of size 1375, which is longer than the specified 1000
Created a chunk of size 1364, which is longer than the specified 1000
Created a chunk of size 1133, which is longer than the specified 1000
Created a chunk of size 1825, which is longer than the specified 1000
Created a chunk of size 1336, which is longer than the specified 1000
Created a chunk of size 1525, which is longer than the specified 1000
Created a chunk of size 1611, which is longer than the specified 1000
Created a chunk of size 1268, which is longer than the specified 1000
Created a chunk of s

2957


In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

embeddings = OpenAIEmbeddings()
username = "kun" # replace with your username from app.activeloop.ai
db = DeepLake(dataset_path=f"hub://{username}/langchain", embedding_function=embeddings, public=True)
db.add_documents(texts)


Your Deep Lake dataset has been successfully created!


-

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/kun/langchain


 

hub://kun/langchain loaded successfully.


Evaluating ingest: 100%|██████████| 3/3 [00:50<00:00
 

Dataset(path='hub://kun/langchain', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape       dtype  compression
  -------   -------    -------     -------  ------- 
 embedding  generic  (2957, 1536)  float32   None   
    ids      text     (2957, 1)      str     None   
 metadata    json     (2957, 1)      str     None   
   text      text     (2957, 1)      str     None   


['94988f62-ec28-11ed-855e-52cef717896e',
 '94989142-ec28-11ed-855e-52cef717896e',
 '9498919c-ec28-11ed-855e-52cef717896e',
 '949891e2-ec28-11ed-855e-52cef717896e',
 '94989228-ec28-11ed-855e-52cef717896e',
 '94989264-ec28-11ed-855e-52cef717896e',
 '949892be-ec28-11ed-855e-52cef717896e',
 '949892fa-ec28-11ed-855e-52cef717896e',
 '9498932c-ec28-11ed-855e-52cef717896e',
 '94989368-ec28-11ed-855e-52cef717896e',
 '949893a4-ec28-11ed-855e-52cef717896e',
 '949893e0-ec28-11ed-855e-52cef717896e',
 '9498941c-ec28-11ed-855e-52cef717896e',
 '94989458-ec28-11ed-855e-52cef717896e',
 '94989494-ec28-11ed-855e-52cef717896e',
 '949894d0-ec28-11ed-855e-52cef717896e',
 '9498950c-ec28-11ed-855e-52cef717896e',
 '94989548-ec28-11ed-855e-52cef717896e',
 '94989584-ec28-11ed-855e-52cef717896e',
 '949895c0-ec28-11ed-855e-52cef717896e',
 '949895fc-ec28-11ed-855e-52cef717896e',
 '9498962e-ec28-11ed-855e-52cef717896e',
 '9498966a-ec28-11ed-855e-52cef717896e',
 '949896a6-ec28-11ed-855e-52cef717896e',
 '949896e2-ec28-

In [6]:
retriever = db.as_retriever()
retriever.search_kwargs['distance_metric'] = 'cos'
retriever.search_kwargs['fetch_k'] = 100
retriever.search_kwargs['maximal_marginal_relevance'] = True
retriever.search_kwargs['k'] = 10

def filter(x):
    if 'com.google' in x['text'].data()['value']:
        return False
    metadata = x['metadata'].data()['value']
    return 'scala' in metadata['source'] or 'py' in metadata['source']

# Uncomment the following line to apply custom filtering
# retriever.search_kwargs['filter'] = filter


In [12]:
from langchain.chat_models import PromptLayerChatOpenAI
from langchain.chains import ConversationalRetrievalChain

model = PromptLayerChatOpenAI(pl_tags=['source-code-qa'], temperature=0, model='gpt-3.5-turbo') # switch to 'gpt-4'
qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)

In [15]:
questions = [
    # "如何获取OpenAI调用消耗？",
    # "get_openai_token_cost_for_model 如何使用？",
    "支持的LLMs有哪些？",
] 
chat_history = []

for question in questions:  
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result['answer']))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")


-> **Question**: 支持的LLMs有哪些？ 

**Answer**: 以下是支持的LLMs列表：

- AI21
- AlephAlpha
- Anthropic
- Banana
- BaseLLM
- CerebriumAI
- Cohere
- DeepInfra
- ForefrontAI
- GooglePalm
- GooseAI
- GPT4All
- HuggingFaceEndpoint
- HuggingFaceHub
- HuggingFacePipeline
- LlamaCpp
- Modal
- NLPCloud
- AzureOpenAI
- OpenAI
- OpenAIChat
- Petals
- PipelineAI
- PredictionGuard
- PromptLayerOpenAI
- PromptLayerOpenAIChat
- Replicate
- RWKV
- SagemakerEndpoint
- SelfHostedPipeline
- SelfHostedHuggingFaceLLM
- StochasticAI
- Writer 



In [10]:
from langchain.llms import get_openai_token_cost_for_model

model_name = "text-davinci-003"
num_tokens = 1024
is_completion = False

cost = get_openai_token_cost_for_model(model_name, num_tokens, is_completion)

print(f"Cost for {num_tokens} tokens with model {model_name}: ${cost:.2f}")

ImportError: cannot import name 'get_openai_token_cost_for_model' from 'langchain.llms' (/Users/guokun/code/python/p38/lib/python3.8/site-packages/langchain/llms/__init__.py)

## 小节 2023年05月07日
AI满嘴胡话，问三个问题，只回答对了0.8个。

让AI辅助开发，我能想到几个方向：
1. 生成代码
2. 生成注释、文档
3. 生成单元测试、辅助定位问题

至于使用AI辅助理解项目代码，我想主要就是做第2个方向，辅助总结代码、生成文档，AI还没有智能到扔一个代码库给它，就能开始写代码的地步。

除了以上的方向，辅助设计技术方案，可能也是一个方向