In [1]:
import os
import git
import pathlib
import subprocess

import meerkat as mk
from rich import print

# Remember the notebook's working directory; the save/restore around
# `mk.gui.start` suggests startup may change it (NOTE(review): confirm).
cwd = os.getcwd()
try:
    # Start the Meerkat GUI on fixed API / frontend ports.
    mk.gui.start(api_port=5032, frontend_port=8010, skip_build=True)
finally:
    # Restore the working directory even when startup raises, so relative
    # paths used later (e.g. "./meerkat/") still resolve from the notebook dir.
    os.chdir(cwd)

In [2]:
REPO = "https://github.com/hazyresearch/meerkat"
DIR = "./meerkat/"

# Clone only when the checkout is missing, so re-running this cell does not
# attempt to clone into an existing directory. `check=True` turns a failed
# clone into an exception instead of letting later cells fail on a missing
# or partial checkout.
if not os.path.isdir(DIR):
    subprocess.run(["git", "clone", REPO, DIR], check=True)

Cloning into 'meerkat'...


In [3]:
# Create a Git repository object
repo = git.Repo(DIR)

# List tracked (--cached), modified and untracked-but-not-ignored files.
# A modified file is reported under both --cached and --modified, so
# de-duplicate while keeping git's order.
files = repo.git.ls_files("--exclude-standard", "--cached", "--modified", "--other").splitlines()
files = list(dict.fromkeys(files))

# Keep only entries that are regular files on disk: a tracked file deleted
# from the working tree is still listed, and os.path.getsize would raise on it.
paths = [os.path.join(DIR, f) for f in files]
paths = [p for p in paths if os.path.isfile(p)]

# For each file, record {'filename', 'nchars', 'extension'}.
# Note: 'nchars' holds the on-disk size in bytes (os.path.getsize), not a
# character count.
files = [{'filename': p, 'nchars': os.path.getsize(p), 'extension': pathlib.Path(p).suffix} for p in paths]

In [5]:
# Build the per-file table from the list of {'filename', 'nchars', 'extension'} dicts
project = mk.DataFrame(files)
# Add a column that contains the actual file
# (presumably loaded lazily: the loop below calls each cell to trigger the load — TODO confirm)
project['files'] = mk.files(project['filename'], type="code")

# Go through all the files in order, keep track of failed loads and their extensions
# NOTE(review): any exception marks the file's whole extension for removal below,
# so one unreadable file excludes every file that shares its extension.
failed_extensions = set()
for i in range(len(project)):
    try:
        project["files"][i]()
    except Exception:
        failed_extensions.add(project["extension"][i])

# Extensions to drop: images, PDFs and the other extensions listed here,
# files without any extension (""), plus every extension that failed to load above
remove_extensions = [
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    ".pdf",
    ".ico",
    "",
    ".dia",
    ".odg",
    ".pkl",
    ".npz",
    ".fits",
    ".mod",
    ".swg",
    ".star",
    ".npy",
] + list(failed_extensions)
# Kept as a list on purpose: the author noted that the set conversion below did not work (cause not recorded)
# remove_extensions = set(remove_extensions) # this doesn't work?!

# Exclude files whose extension is in remove_extensions
# NOTE(review): the lambda parameter name `extension` presumably selects the
# 'extension' column — confirm before renaming it
project = project.filter(lambda extension: extension not in remove_extensions, pbar=True)

# Exclude files that are empty ('nchars' is the size reported by os.path.getsize)
project = project.filter(lambda row: row['nchars'] > 0, materialize=False, pbar=True)

100%|██████████| 760/760 [00:01<00:00, 455.87it/s]
100%|██████████| 735/735 [00:04<00:00, 174.52it/s]


In [6]:
import tiktoken
def get_token_count(files):
    """Return the number of GPT-2 tokens in each string of the batch ``files``."""
    encoder = tiktoken.get_encoding("gpt2")
    token_lists = encoder.encode_batch(files)
    return [len(tokens) for tokens in token_lists]
# Count tokens for every file, handing batches of 128 file contents to get_token_count
project['ntokens'] = project['files'].map(get_token_count, batch_size=128, is_batched_fn=True, pbar=True)
# Report the token total over all files that survived the filters
print("Total Tokens: {}".format(sum(project['ntokens'])))

100%|██████████| 6/6 [00:00<00:00,  9.43it/s]


In [7]:
# Create the primary key "file_id"; explode() later copies 'file_id' onto each chunk row
project.create_primary_key("file_id")
# Show the table as the cell output
project

In [152]:
from functools import partial
from typing import Callable, List

def explode(df: mk.DataFrame, chunk_col: str, chunker: Callable, batch_size: int = 1) -> mk.DataFrame:
    """Chunk each row of a DataFrame into multiple rows, and concatenate the results.

    Args:
        df: DataFrame to chunk. A 'chunks' column is written onto it in place.
        chunk_col: Name of the column whose values are handed to ``chunker``.
        chunker: Callable producing the chunks for the given values. It is
            called with the column bound to a keyword named ``files``
            (presumably what ``inputs={chunk_col: 'files'}`` means — TODO confirm).
        batch_size: Batch size passed to ``df.map``; the call is flagged as
            batched only when this is greater than 1.

    Returns:
        The concatenation of one DataFrame per input row, with columns
        'chunk', 'chunk_idx' (1-based) and whichever of 'filename' /
        'file_id' the row contains.
    """
    # Chunk each row of the DataFrame
    chunks = df.map(chunker, batch_size=batch_size, is_batched_fn=batch_size > 1, pbar=True, inputs={chunk_col: 'files'})
    # Side effect: the caller's DataFrame gains a 'chunks' column
    df['chunks'] = chunks
    
    # Make a df on each row, propagate the other columns
    # NOTE(review): the lambda parameter name `row` presumably makes meerkat
    # pass the whole row — confirm before renaming it
    chunk_dfs = df.map(
        lambda row: mk.DataFrame({
            'chunk': row['chunks'], 
            'chunk_idx': list(range(1, len(row['chunks']) + 1)),
            # Repeat the file-level identifiers once per chunk
            **{k: [v] * len(row['chunks']) for k, v in row.items() if k in ['filename', 'file_id'] }
        }), 
        pbar=True,
    )
    
    # Concatenate the results
    return mk.concat(chunk_dfs)

def chunker(files: List[str], toksize: int = 2048) -> List[str]:
    """Cut every text in ``files`` into pieces of at most ``toksize`` GPT-2 tokens.

    The texts are tokenized as one batch, each token sequence is sliced into
    consecutive windows of ``toksize`` tokens, and every window is decoded
    back to text. The result holds one list of chunk strings per input text
    (an empty token sequence gives an empty list).
    """
    enc = tiktoken.get_encoding("gpt2")
    chunks_per_file = []
    for token_ids in enc.encode_batch(files):
        window_starts = range(0, len(token_ids), toksize)
        chunks_per_file.append(
            [enc.decode(token_ids[start:start + toksize]) for start in window_starts]
        )
    return chunks_per_file

# Split every file into chunks of at most 2048 tokens, mapping with batch_size=16.
# Note: explode() also writes a 'chunks' column onto `project` itself.
chunk_df = explode(project, 'files', partial(chunker, toksize=2048), batch_size=16)

100%|██████████| 44/44 [00:01<00:00, 36.92it/s]
100%|██████████| 695/695 [00:11<00:00, 60.53it/s]


In [153]:
# Attach a code formatter group to the 'chunk' column
# (presumably controls how chunks are rendered in the Meerkat GUI — TODO confirm)
chunk_df['chunk'] = chunk_df['chunk'].format(mk.format.CodeFormatterGroup())
# Create the primary key "chunk_id" for the chunk table
chunk_df.create_primary_key("chunk_id")
# Show the table as the cell output
chunk_df

In [None]:
import openai
import cohere
# Read API credentials from the environment so real keys are never saved in
# the notebook. The placeholders remain only as fallbacks, so this cell still
# runs when one of the two providers is not used; do not replace them with
# real keys here.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-xxx")
co = cohere.Client(os.environ.get("CO_API_KEY", "xxx"))

In [187]:
# Embedding model id in "<provider>/<model name>" form; embed() and
# embed_many() pick the API from the "openai" / "cohere" prefix.
# Alternative:
# model = "cohere/large"
model = "openai/text-embedding-ada-002"

In [180]:
def embed(text, model="openai/text-embedding-ada-002"):
    """Embed a single string with the OpenAI or Cohere embeddings API.

    Args:
        text: String to embed. Newlines are replaced by spaces before sending.
        model: Model id prefixed by its provider, e.g.
            "openai/text-embedding-ada-002" or "cohere/large". The provider
            prefix is stripped before the id is passed to the API.

    Returns:
        The embedding of ``text`` as returned by the provider.

    Raises:
        ValueError: If ``model`` starts with neither "openai" nor "cohere".
            (Previously this case fell through and silently returned None.)
    """
    text = text.replace("\n", " ")
    if model.startswith("openai"):
        response = openai.Embedding.create(input=[text], model=model.replace("openai/", ""))
        return response['data'][0]['embedding']
    if model.startswith("cohere"):
        response = co.embed(texts=[text], model=model.replace("cohere/", ""))
        return response.embeddings[0]
    raise ValueError(
        f"Unsupported embedding model {model!r}: expected an 'openai/' or 'cohere/' prefix."
    )

def embed_many(texts, model="openai/text-embedding-ada-002"):
    """Embed a batch of strings with the OpenAI or Cohere embeddings API.

    Args:
        texts: Iterable of strings. Newlines in each string are replaced by
            spaces before sending.
        model: Model id prefixed by its provider, e.g.
            "openai/text-embedding-ada-002" or "cohere/large". The provider
            prefix is stripped before the id is passed to the API.

    Returns:
        One embedding per input text, in input order. An empty batch returns
        an empty list without calling the API.

    Raises:
        ValueError: If ``model`` starts with neither "openai" nor "cohere".
            (Previously this case fell through and silently returned None.)
    """
    texts = [t.replace("\n", " ") for t in texts]
    use_openai = model.startswith("openai")
    if not use_openai and not model.startswith("cohere"):
        raise ValueError(
            f"Unsupported embedding model {model!r}: expected an 'openai/' or 'cohere/' prefix."
        )
    # Nothing to embed: skip the network round trip.
    if not texts:
        return []
    if use_openai:
        response = openai.Embedding.create(input=texts, model=model.replace("openai/", ""))
        return [response['data'][i]['embedding'] for i in range(len(texts))]
    response = co.embed(texts=texts, model=model.replace("cohere/", ""))
    return response.embeddings

# Embed each chunk for retrieval
# Batches of 128 chunks go through embed_many(); the result is requested as a
# TensorColumn and stored under a column named after the model.
# NOTE(review): the lambda parameter name `chunk` presumably selects the
# 'chunk' column — confirm before renaming it
chunk_df[f'embeddings/{model}'] = chunk_df.map(lambda chunk: embed_many(chunk, model=model), pbar=True, batch_size=128, is_batched_fn=True, output_type=mk.TensorColumn)

100%|██████████| 11/11 [01:07<00:00,  6.13s/it]


In [155]:
# Add token counts
# Same get_token_count helper as for whole files, applied to the 'chunk' column in batches of 16
chunk_df['ntokens'] = chunk_df['chunk'].map(get_token_count, batch_size=16, is_batched_fn=True, pbar=True)

100%|██████████| 87/87 [00:00<00:00, 129.18it/s]


In [185]:
# Directory the DataFrames are written to. Set the MEERKAT_CHATBOT_DIR
# environment variable to run this on another machine; the default is the
# original author's local path, kept so existing behavior is unchanged.
OUTPUT_DIR = os.environ.get(
    "MEERKAT_CHATBOT_DIR",
    "/Users/krandiash/Desktop/workspace/projects/meerkat-dev/mkdev/scratch/karan/chatbot",
)
chunk_df.write(os.path.join(OUTPUT_DIR, "meerkat-chunks.mk"))
project.write(os.path.join(OUTPUT_DIR, "meerkat-project.mk"))

In [176]:
# Show the chunk table as the cell output
chunk_df

In [191]:
def search(df, query, n=10, embedding_col: str = None):
    """Return the ``n`` rows of ``df`` whose embeddings best match ``query``.

    Args:
        df: DataFrame holding an embedding column. A 'similarity' column is
            written onto it in place as a side effect.
        query: Text to search for; it is embedded with the notebook-level
            ``model``.
        n: Number of rows to return.
        embedding_col: Column holding the embeddings to compare against.
            Defaults to f'embeddings/{model}', resolved when the function is
            called. (Previously the default was frozen when the function was
            defined, so changing ``model`` afterwards embedded the query with
            the new model but compared it against the old model's column.)

    Returns:
        The first ``n`` rows of ``df`` sorted by descending similarity.
    """
    if embedding_col is None:
        embedding_col = f'embeddings/{model}'
    # Embed the query
    query_embedding = embed(query, model=model)
    # Dot product between the query and each chunk embedding. This equals
    # cosine similarity only if the embeddings are unit-normalized —
    # NOTE(review): confirm for the model in use.
    similarities = df[embedding_col].dot(query_embedding)
    # Sort the chunks by similarity
    df['similarity'] = similarities
    df = df.sort('similarity', ascending=False)
    # Return the top n results
    return df.head(n)

In [192]:
def truncate(text: str, ntokens: int):
    """Keep only the leading ``ntokens`` GPT-2 tokens of ``text``.

    Prints how many tokens were kept and returns the decoded text.
    """
    enc = tiktoken.get_encoding("gpt2")
    kept = enc.encode(text)
    kept = kept[:ntokens]
    print(f"Truncated to {len(kept)} tokens")
    return enc.decode(kept)

In [193]:
def template(instruction, query, context):
    """Lay out the prompt text from an instruction, a query and context.

    The prompt starts with a blank line and ends right after
    "Helpful Response:" with no trailing newline.
    """
    sections = (
        "",  # the prompt opens with a newline
        f"{instruction}",
        "",
        f"Query: {query}",
        "",
        "Relevant Context:",
        f"{context}",
        "",
        "Helpful Response:",
    )
    return "\n".join(sections)

def create_prompt(df, query, n=2, max_tokens=6144):
    """Build a prompt for ``query`` from the ``n`` best-matching chunks of ``df``.

    The matching chunks are joined with blank lines, truncated to
    ``max_tokens`` tokens and placed into the prompt template together with
    a fixed instruction.
    """
    instruction = "Please provide a helpful response to the following query using the provided context. Your response should be well formatted, and can include code snippets."
    # Retrieve the chunks closest to the query
    hits = search(df, query, n=n)
    # Merge them into one context string that fits the token budget
    joined_chunks = "\n\n".join(hits['chunk'])
    context = truncate(joined_chunks, max_tokens)
    return template(instruction, query, context)

In [194]:
# Example question to answer from the retrieved source chunks
query = "How do I create an interface that contains a scatter plot and a table in Python?"
# Build the prompt from the 8 best-matching chunks
prompt = create_prompt(chunk_df, query, n=8)
# Cell output: a column layout with a header and a copy button holding the prompt
mk.gui.html.flexcol([
    mk.gui.Header("Copy the prompt and paste it into GPT-4!"), 
    mk.gui.CopyButton(value=prompt),
    # mk.gui.Markdown(prompt),
], classes="items-center")