# Objectives

1. **Search for candidate repos** (containing prompts in .py/.txt files) using:
    - LangChain
    - Guidance (by Microsoft)
    - LlamaIndex
2. **Find Prompts**:
    - 2.1. Filter down via dirs and files
        - Look at dirs for 'template' or 'prompt' folders (or files)
    - 2.2. Filter down via code search:    
        - imports some library like openai, huggingface, etc.
        - (are they in files? Strings?
        - How many use variables?
        - Do they concat, use f-strings, use format? Etc.)

<br />

**Possible Next Steps**:
- Skim through a random sample set of files manually
- Run professor's **sllim check tool** on them
    - Professor's tool: https://github.com/kpister/sllim (semantic analysis to detect errors in prompt files)

## 📚 **Candidate Repos**

In [39]:
import requests, json
from pprint import pprint

def fetch_data(query="langchain+OR+GUIDANCE+OR+LlamaIndex", sort="stars", order="asc", per_page=100, language="python"):
    """
    Search GitHub repositories and collect the results of up to 10 pages.

    GitHub API URL for searching repositories
    DOCS: https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-repositories

    Params (defaults):
        query = "langchain+OR+GUIDANCE+OR+LlamaIndex"  (inserted into the URL as-is)
        sort = "stars"
        order = "asc"
        per_page = 100  # Max 100
        language = "python"

    Returns a results dict with the following structure:
    {
        total_count: int,
        items: [{repo1_info}, {repo2_info}, ...]
    }

    Raises:
        Exception: if a response has a non-200 status code or its JSON body
            contains a "message" key; the API's message is used as the text.
    """
    # Accumulator for the items of every fetched page
    result = {"total_count": 0, "items": []}

    # NOTE: Only the first 1000 search results are available through this API
    print("Fetching all 10 pages (assuming there're >= 1000 results)")
    for page in range(1, 11):
        url = f"https://api.github.com/search/repositories?q={query}+language:{language}&sort={sort}&order={order}&per_page={per_page}&page={page}"
        # Make the API request and get the JSON response
        response = requests.get(url)
        data = response.json()

        # Check if the request was successful
        if response.status_code != 200:
            raise Exception(data.get("message", "Unknown error"))

        # Check if the API returned an error
        if "message" in data:
            raise Exception(data["message"])

        # Keep the items and the most recently reported total count
        result["items"].extend(data["items"])
        result["total_count"] = data["total_count"]
        print(f"Page {page} done")

        # Stop early once the search is exhausted so that no rate-limited
        # requests are spent on pages that cannot contain anything new.
        if not data["items"] or len(result["items"]) >= data["total_count"]:
            if page < 10:
                print(f"No more results after page {page}")
            break

    return result

# WARNING: AVOID EXCEEDING THE API rate limit (10 times per minute).
# More Details in the DOCS.
# STORING RESULT (Uncomment when needed)
##################################################
# repos = fetch_data()
# with open("repos.json", "w") as file:
#     json.dump(repos, file, indent=4)
# pprint(repos)

## 🔎 **Find Prompts**

In [40]:
from utils import get_api_key_github  # custom function for privacy. Get your own API key from
import time

def search_repo(repo_name, username="DJPAUL2001", api_key=get_api_key_github()):
    """
    Search a single repo for prompt or template files. HELPER FUNCTION for search_all_repos()
    Github API URL: https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code

    The query looks for the words "prompt" or "template" in file contents,
    restricted to text and Python files of the repository `repo_name`.

    Note: The "Search code" endpoint requires you to authenticate and limits you to 10 requests per minute

    Returns a list of files. e.g. [{file1_info}, {file2_info}, ...]

    Raises an Exception carrying the API's message when the status code is
    not 200 or when the JSON body contains a "message" key.
    """
    search_terms = f"prompt+OR+template+in:file+language:text+OR+language:python+repo:{repo_name}"
    credentials = (username, api_key)
    response = requests.get("https://api.github.com/search/code?q=" + search_terms, auth=credentials)
    payload = response.json()

    # A non-200 status means the request itself failed
    request_ok = response.status_code == 200
    if not request_ok:
        raise Exception(payload.get("message", "Unknown error"))

    # A "message" key in the body signals an API-level error
    if "message" in payload:
        raise Exception(payload["message"])

    return payload["items"]

def search_all_repos(repos, username="DJPAUL2001", api_key=get_api_key_github(), max_retries=1, retry_wait=60):
    """
    Running search_repo() on all candidate repos in repos.json. Approx runtime: > 1 hr 40 mins (took 115 mins)

    Params:
        repos: dict with an "items" list; each item must have a "full_name" key
        username, api_key: credentials forwarded to search_repo()
        max_retries: how many times a failed search is retried per repo (default 1)
        retry_wait: seconds to sleep before each retry (default 60)

    Returns a dict with the following structure:
    {
        repo_name1: [{file1_info}, {file2_info}, ...],
        repo_name2: [{file1_info}, {file2_info}, ...],
        ...
    }

    Raises:
        Exception: whatever search_repo() raised on the final attempt for a
            repo, once its retries are used up.
    """
    repos_prompts = {}
    for repo in repos["items"]:
        repo_name = repo["full_name"]
        retries_used = 0
        while True:
            try:
                repos_prompts[repo_name] = search_repo(repo_name, username, api_key)
                break
            except Exception as e:
                # Out of retries: let the error reach the caller
                if retries_used >= max_retries:
                    raise
                retries_used += 1
                print(f"Error: {e}")
                # Assuming the exception is due to rate limit, wait and try again
                time.sleep(retry_wait)

    return repos_prompts

# STORING RESULT (Uncomment when needed)
##################################################
# # Loading repos.json to memory
# with open("repos.json", "r") as file:
#     repos = json.load(file)

# repos_prompts = search_all_repos(repos)
# with open("repos_prompts.json", "w") as file:
#     json.dump(repos_prompts, file, indent=4)
# pprint(repos_prompts)

🗨️ Number of Repos with Prompts

In [41]:
# Loading prompts search results from all repos
# Read back the saved prompt/template search results (repo name -> list of files)
with open("repos_prompts.json", "r") as file:
    repos_prompts = json.load(file)

# A repo counts as "having prompts" when its search returned at least one file
count = sum(1 for files in repos_prompts.values() if len(files) > 0)

print(f"Number of repos with prompts: {count} out of {len(repos_prompts)}")

Number of repos with prompts: 101 out of 1000


📂 Grabbing and Storing all raw file URLs in one place

In [52]:
def _to_raw_url(html_url):
    """Convert one GitHub blob page URL into its raw.githubusercontent.com URL."""
    prefix = "https://github.com/"
    if html_url.startswith(prefix):
        # Expected shape: owner/repo/blob/<ref>/<path...>
        parts = html_url[len(prefix):].split("/", 3)
        if len(parts) == 4 and parts[2] == "blob":
            owner, repo, _, rest = parts
            return f"https://raw.githubusercontent.com/{owner}/{repo}/{rest}"
    # Unexpected shape: rewrite only the first occurrences so that later path
    # segments such as "textblob/" are left untouched.
    return html_url.replace("/blob/", "/", 1).replace("github.com", "raw.githubusercontent.com", 1)


def get_rawFileURL(prompts_by_repo=None, output_path="repo_to_rawFileURL.json"):
    """
    Build a mapping from repo name to the raw-content URLs of its matched files
    and store it as JSON.

    NOTE: To get raw file content:
    1. grab the html_url of a code_search item
    2. remove the blob/ segment that follows owner/repo
    3. replace github.com with raw.githubusercontent.com

    e.g.
    res = requests.get('https://raw.githubusercontent.com/langchain-ai/langchain/98aff29fbda6bcb99ea6af0cfd1532954b504bdc/libs/langchain/langchain/schema/prompt_template.py')
    print(res.text)

    Params:
        prompts_by_repo: dict of repo name -> list of file dicts, each with an
            "html_url" key. Defaults to the module-level `repos_prompts`.
        output_path: file the JSON result is written to.

    Returns a dict with the following structure (Note: Only contains repos with prompt/template files):
    {
        repo_name1: [file_raw_url1, file_raw_url2, ...],
        repo_name2: [file_raw_url1, file_raw_url2, ...],
        ...
    }
    """
    if prompts_by_repo is None:
        prompts_by_repo = repos_prompts

    repo_to_fileURL = {}
    for repo, files in prompts_by_repo.items():
        for file in files:
            # setdefault inside the inner loop keeps repos without files out of the result
            repo_to_fileURL.setdefault(repo, []).append(_to_raw_url(file["html_url"]))

    # STORING RESULT
    with open(output_path, "w") as out_file:
        json.dump(repo_to_fileURL, out_file, indent=4)

    return repo_to_fileURL

# Build the repo -> raw file URL mapping from repos_prompts and write it to repo_to_rawFileURL.json
get_rawFileURL()

{'PankajBhatia06/LangChainBot': ['https://raw.githubusercontent.com/PankajBhatia06/LangChainBot/395ab40a0c975f55e14fef6ae1b23c4079c19f20/app.py'],
 'Data-drone/langchain_tests': ['https://raw.githubusercontent.com/Data-drone/langchain_tests/dfcbf08437a668ff5ddbb76f26e838a39b7fe9ec/app/doc_chatbot.py'],
 'HardyZ1906/evadb-llamaindex-bot': ['https://raw.githubusercontent.com/HardyZ1906/evadb-llamaindex-bot/337f77df4be8eede78a61e961519200f27676f4d/helper_bot.py'],
 'sharmaD91/LlamaIndex-Tutorial': ['https://raw.githubusercontent.com/sharmaD91/LlamaIndex-Tutorial/a44e86949dad1c5403fd2566eb9ed2dbcb68afa9/app.py'],
 'yudai1204/LlamaIndex-LineBot': ['https://raw.githubusercontent.com/yudai1204/LlamaIndex-LineBot/c6813479ada813bff8b351e148d24d2ff68bd0ab/app.py'],
 'Krishnendumondal/Langchain': ['https://raw.githubusercontent.com/Krishnendumondal/Langchain/ba622764cb7ccf4667878289f959857348ef8c19/langchain/chains/query_constructor/prompt.py',
  'https://raw.githubusercontent.com/Krishnendumonda