In [1]:
# What is Nomic GPT4AllEmbeddings? 
# - store to vector database 

# Vector databases are used with large language models (LLMs) to supply additional information that the models were not trained on. 
# - TODO: can we store previously successful sets of (pig_code, pyspark_code, sample_data) in a vector DB and reuse them for future code generation? 
# 

---

## 0. Initial Setup 

* generate LangSmith API key.
* TODO: How to safely save and load API keys
* https://docs.smith.langchain.com/

In [2]:
# Never hardcode credentials in a notebook: they end up in version control and
# in every shared copy of the file. The key that used to be written on this
# line must be treated as compromised and revoked in the LangSmith settings.
#
# The key is read from the LANGCHAIN_API_KEY environment variable when it is
# set; otherwise the user is prompted for it without echoing the input.
import getpass
import os

langchain_api_key = os.environ.get('LANGCHAIN_API_KEY') or getpass.getpass('LangSmith API key: ')

In [3]:
# Turn on LangSmith tracing for this kernel by exporting the tracing flag,
# the API endpoint and the API key as environment variables.

import os

os.environ.update(
    {
        'LANGCHAIN_TRACING_V2': 'true',
        'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
        'LANGCHAIN_API_KEY': langchain_api_key,
    }
)

In [4]:
# --- Run configuration -------------------------------------------------------
# local_llm: model name handed to ChatOllama further down in this notebook.
# Other models that were considered:
#   "mistral": https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_crag_local.ipynb
#   "mixtral": https://scalastic.io/en/mixtral-ollama-llamaindex-llm/
#   "llama3":  https://python.langchain.com/docs/integrations/chat/ollama/
local_llm = "llama3"

# run_local: later cells compare this against the string "Yes" to choose the
# locally hosted embedding / chat model.
run_local = "Yes"


In [5]:
import os

# Show what sits next to the notebook (entries of the current working
# directory) so relative paths used later can be sanity-checked.
files_and_directories = os.listdir(os.curdir)
print(files_and_directories)

['docker-compose.yml', 'Dockerfile', 'file_handler.py', 'faq_generator.ipynb', 'setup-env.sh', 'pig2pyspark_generator.ipynb', '.Trash-0', 'requirements.txt', 'langgraph2.ipynb', 'coding', 'scripts', '.ipynb_checkpoints', '.gitignore', 'output', 'data', '.cache', '.git']


---

## RAG (Index?) - Upload Supporting Documents 
Load data: https://python.langchain.com/docs/integrations/document_loaders/recursive_url/

We will pull AWS EC2 docs: https://docs.aws.amazon.com/ec2/

In [6]:
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup

# from langchain_community.document_loaders import WebBaseLoader # this is for pulling 
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# # from langchain_mistralai import MistralAIEmbeddings

# Load
def _html_to_text(raw_html):
    """Return the text of an HTML page as extracted by BeautifulSoup's html.parser."""
    return Soup(raw_html, "html.parser").text


# Start from the page below and follow links recursively (max_depth=5),
# keeping only the extracted text of every page that is visited.
url = "https://help.netflix.com/en/node/102377"
loader = RecursiveUrlLoader(url=url, max_depth=5, extractor=_html_to_text)
docs = loader.load()

In [7]:
# Sanity check: number of documents returned by loader.load().
len(docs)

1

In [None]:
# Split
# Chunking parameters are kept together so they are easy to tune; the splitter
# is built through its tiktoken-based constructor.
splitter_options = {"chunk_size": 500, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)

# Break every loaded document into overlapping chunks ready for embedding.
all_splits = text_splitter.split_documents(docs)

In [None]:
# Embed and index
# Pick the embedding model. Only the local GPT4All embedding is wired up; the
# hosted alternative is not configured in this notebook, so fail immediately
# with a clear message instead of leaving `embedding` unbound (which would
# surface later as a NameError inside the Chroma call).
if run_local == "Yes":
    embedding = GPT4AllEmbeddings()
else:
    # embedding = MistralAIEmbeddings(mistral_api_key=mistral_api_key)
    raise NotImplementedError(
        'No embedding model is configured for run_local != "Yes"; '
        'set run_local = "Yes" or configure a hosted embedding model here.'
    )

# Index
# Embed the chunks into a Chroma collection and expose it as a retriever.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    collection_name="rag-chroma",
    embedding=embedding,
)
retriever = vectorstore.as_retriever()

In [None]:
# Fetch a sample FAQ web page; its prettified HTML is kept as reference markup.
import requests
from bs4 import BeautifulSoup

# URL of the webpage you want to fetch
url = 'https://network.mobile.rakuten.co.jp/faq/detail/00001238/'

# Bound up front so that later cells which read this variable do not fail with
# a NameError when the download does not succeed. The (misspelled) name is kept
# because other cells refer to it.
reference_webpate_html = ""

try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
except requests.RequestException as exc:
    print(f'Failed to retrieve the webpage: {exc}')
else:
    # Ensure the request was successful
    if response.status_code == 200:
        # Parse the content of the response using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Print out the prettified HTML
        reference_webpate_html = soup.prettify()
        print(reference_webpate_html)
    else:
        print(f'Failed to retrieve the webpage (HTTP {response.status_code})')


In [None]:
from IPython.display import HTML

# Render the fetched reference page inline in the notebook.
# `display` is not imported here; it is provided by the IPython/Jupyter runtime.
display(HTML(reference_webpate_html))

---

## LangGraph - LLMs 

We build two LLMs: 
* Fetch the answer from either the vector DB or the web
* Put result in HTML format 

In [None]:
import time
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
# from langchain_mistralai.chat_models import ChatMistralAI
from langchain_core.output_parsers import JsonOutputParser

# we use locally hosted llm models 
# The model name comes from the notebook-level `local_llm` setting so that the
# grader and the generator always run against the same configured model.
# format="json" asks Ollama for JSON output, which JsonOutputParser then parses.
llm = ChatOllama(model=local_llm, format="json", temperature=0.4)

# reference: https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_crag_local.ipynb

prompt = PromptTemplate(
    template="""You are a grader assessing relevance of a retrieved document to a user question. \n 
    Here is the retrieved document: \n\n {document} \n\n
    Here is the user question: {question} \n
    If the document contains keywords related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
    Provide the binary score as a JSON with a single key 'score' and no premable or explaination.""",
    input_variables=["question", "document"],
)

# prompt -> model -> parsed JSON dict
retrieval_grader = prompt | llm | JsonOutputParser()

question = "How do I get started with Copilot?"

# NOTE: this rebinds `docs` from the loaded pages to the retrieved chunks;
# the generation cell below reads `docs` and expects the retrieved chunks.
docs = retriever.get_relevant_documents(question)
if not docs:
    raise ValueError(f"The retriever returned no documents for question: {question!r}")

# Grade the second hit as before, falling back to the only hit when the
# retriever returned a single document (previously an IndexError).
sample_doc = docs[1] if len(docs) > 1 else docs[0]
doc_txt = sample_doc.page_content
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

In [None]:
### Generate

from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Prompt
# Only {question} and {context} appear in the template, so only those two are
# declared as input variables. To feed the reference HTML fetched earlier into
# the prompt, add a {ref_html} placeholder to the template, declare it here and
# pass it in the invoke() call at the bottom of this cell.
prompt_data_gen = PromptTemplate(
    template="""
    You are a web developer fluent in HTML and knowledgeable software engineer.
    
    Question: \n\n Use document as a reference to craft a response to a question in HTML format. Insert plots or flowcharts whenever needed. Question is "{question}" \n\n
    Context: \n\n {context} \n\n 

    Please output only HTML code.
    """,
    
    input_variables=["context", "question"],
)


# LLM
if run_local == "Yes":
    llm = ChatOllama(model=local_llm, temperature=0.3)
else:
    # The hosted path relies on names this notebook does not define by default
    # (the ChatMistralAI import is commented out). Fail with an actionable
    # message rather than a bare NameError.
    if "ChatMistralAI" not in globals() or "mistral_api_key" not in globals():
        raise RuntimeError(
            'run_local != "Yes" requires `ChatMistralAI` to be imported and '
            '`mistral_api_key` to be defined before running this cell.'
        )
    llm = ChatMistralAI(
        model="mistral-medium", temperature=0, mistral_api_key=mistral_api_key
    )


# Post-processing
def format_docs(docs):
    """Join the page_content of each document into one string, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# Chain
# prompt -> model -> plain string
rag_chain = prompt_data_gen | llm | StrOutputParser()

# Run
# Pass the documents' text (via format_docs) rather than the raw Document
# objects, whose repr would otherwise be pasted into the prompt.
generation = rag_chain.invoke({"context": format_docs(docs), "question": question})
print(generation)

In [None]:
# Hard-coded HTML page kept as a string literal so it can be rendered inline
# below without calling the model.
html_code = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8" />
    <meta name="description" content="GitHub Copilot Chat can help you by providing answers to coding related questions  directly within a supported IDE." />
    <title>About GitHub Copilot Chat in your IDE - GitHub Docs</title>
</head>
<body>
    <h1>About GitHub Copilot Chat in your IDE - GitHub Docs</h1>
    <p>Getting started with GitHub Copilot - GitHub DocsSkip to main contentGitHub DocsVersion: Free, Pro, &amp; TeamSearch GitHub DocsGitHub Copilot/Use GitHub Copilot/Getting startedHomeGitHub CopilotAbout GitHub CopilotQuickstartUse GitHub CopilotGetting startedFinding matching codeCopilot IndividualAbout GitHub Copilot IndividualCopilot Individual feature setCopilot BusinessAbout GitHub Copilot BusinessCopilot Business feature setEnabling GitHub Copilot BusinessCopilot EnterpriseOverviewAbout Copilot EnterpriseCopilot Enterprise feature setCopilot Chat in GitHub.comAbout Copilot Chat in GitHub.comCopilot pull request summariesAbout PR summariesManage Copilot in your organizationManaging accessManaging policiesExcluding contentAudit logsCopilot ChatAbout Copilot Chat (Mobile)Enabling Copilot Chat (Mobile)Use Copilot Chat (Mobile)About Copilot Chat (IDE)Use Copilot Chat (IDE)Copilot in the CLIAbout Copilot in the CLISetting up Copilot in the CLIUsing Copilot in the CLIConfiguring Copilot in the CLIConfigure GitHub CopilotGitHub.comIn your environmentNetwork settingsTroubleshootingCommon issues with GitHub CopilotView logsFirewall settingsNetwork errorsCopilot Chat</p>
    <h1>Getting started with GitHub Copilot</h1>
    <p>You can start using GitHub Copilot by installing the extension in your preferred environment.</p>
    <p>Who can use this feature?</p>
    <p>GitHub Copilot can be managed through personal accounts with GitHub Copilot Individual or through organization accounts with GitHub Copilot Business.</p>
    <h1>About GitHub Copilot Business</h1>
    <p>With GitHub Copilot Business you can manage access to GitHub Copilot for your organization.</p>
    <p>Who can use this feature?</p>
    <p>GitHub Copilot can be managed through personal accounts with GitHub Copilot Individual or through organization accounts with GitHub Copilot Business.</p>
    <p>In this article</p>
    <p>About Copilot Business</p>
    <p>Enabling and setting up Copilot Business</p>
    <p>About billing for Copilot Business</p>
    <p>Requesting or granting access to Copilot</p>
    <p>Further reading</p>
    <p>Get GitHub Copilot Business</p>
    <p>About Copilot Business</p>
    <p>GitHub Copilot is an AI-powered coding assistant that helps developers write code faster.</p>
    <p>With Copilot Business, you can manage access to GitHub Copilot for organizations. Once you grant an organization access to GitHub Copilot, the administrators of that organization can grant access to individuals and teams. For more information, see "Enabling and setting up GitHub Copilot Business."</p>
</body>
</html>
"""

# Render the page inline; HTML is imported from IPython.display in an earlier cell.
display(HTML(html_code))

In [None]:
## Create two templates: 
# 1. pig code to benchmark input data
# Prompt text asking the model for Python code that writes sample CSV input
# for a given PIG script; placeholders: {pig_code}, {sample_input_path}.
_data_gen_template = """
    You are an expert data scientist fluent in PIG and Python coding languages.
    Generate Python code that do the following: 
    1. Generate 20 lines or more CSV data that can be used to test the PIG code. 
       Ensure column names are consistent with the names in PIG code. 
    2. Write Python code that save this CSV data to the directory provided. 
        
    Here is the PIG code: \n\n {pig_code} \n\n
    Here is the directory to save CSV file: \n\n {sample_input_path} \n\n

    Give a string of Python code with correct indentation that can be ran to create and save CSV file to correct path. 
    Provide this as a JSON with a single key 'data_gen_code' and no premable or explaination."""

prompt_data_gen = PromptTemplate(
    input_variables=["pig_code", "sample_input_path"],
    template=_data_gen_template,
)

# prompt -> model -> parsed JSON; the prompt requests the key 'data_gen_code'.
# NOTE(review): `llm` is whatever an earlier cell bound last; the generation
# cell binds a model without format="json" - confirm JSON parsing is reliable.
sample_input_code_generator = prompt_data_gen | llm | JsonOutputParser()

# Repair variant of the data-generation prompt: besides the PIG code and the
# output directory it also receives the failing Python code and its error
# message ({pycode_error}, {pycode_error_message}).
_data_regen_template = """
    You are an expert data scientist fluent in PIG and Python coding languages.
    Generate Python code that do the following: 
    * Debug and share updated Python code to generate 20 lines or more CSV data that can be used to thest the PIG code. 
    * Use the error message and the data that resulted in error as a reference to fix the Python code. 
        
    Here is the PIG code: \n\n {pig_code} \n\n
    Here is the Python code with error: \n\n {pycode_error} \n\n
    Here is the Python code error message: \n\n {pycode_error_message} \n\n
    Here is the directory to save CSV file: \n\n {sample_input_path} \n\n

    Give a string of Python code with correct indentation that can be ran to create and save CSV file to correct path. 
    Provide this as a JSON with a single key 'data_gen_code' and no premable or explaination."""

prompt_data_regen = PromptTemplate(
    input_variables=["pig_code", "pycode_error", "pycode_error_message", "sample_input_path"],
    template=_data_regen_template,
)

# prompt -> model -> parsed JSON; the prompt requests the key 'data_gen_code'.
fix_sample_input_code_generator = prompt_data_regen | llm | JsonOutputParser()


# 2. pig code to pyspark code 
# Prompt text asking the model to translate a PIG script into PySpark;
# single placeholder: {pig_code}.
_pig2pyspark_template = """
    You are an expert data scientist fluent in PIG and PySpark coding languages.
    Generate PySpark code that do the following: 
    * Implement same logic and methods as the provided PIG code. 
    * When ran against a sample input data, outputs identical result as PIG code. 
        
    Here is the PIG code: \n\n {pig_code} \n\n

    Give a string of PySpark code with correct indentation. 
    Provide this as a JSON with a single key 'pyspark_code' and no premable or explaination."""

prompt_pig2pyspark = PromptTemplate(
    input_variables=["pig_code"],
    template=_pig2pyspark_template,
)

# prompt -> model -> parsed JSON; the prompt requests the key 'pyspark_code'.
pig_to_pyspark_converter = prompt_pig2pyspark | llm | JsonOutputParser()

# Repair variant of the PIG -> PySpark prompt: in addition to the PIG code it
# receives the PySpark code that failed and the error message it produced
# ({pycode_error}, {pycode_error_message}).
prompt_pig2pyspark_regen = PromptTemplate(
    template="""
    You are an expert data scientist fluent in PIG and PySpark coding languages.
    Generate PySpark code that do the following: 
    * Implement same logic and methods as the provided PIG code. 
    * Use the PySpark code that returned an error message to update the PySpark code. 
    * Use the PySpark code error message to update the PySpark code. 
    * When ran against a sample input data, outputs identical result as PIG code. 
        
    Here is the PIG code: \n\n {pig_code} \n\n
    Here is the PySpark code with error: \n\n {pycode_error} \n\n
    Here is the PySpark code error message: \n\n {pycode_error_message} \n\n

    Give a string of PySpark code with correct indentation. 
    Provide this as a JSON with a single key 'pyspark_code' and no premable or explaination.""",
    input_variables=["pig_code", "pycode_error", "pycode_error_message"],
)

# The fixer chain must be built from the *regen* prompt. It was previously
# wired to the plain conversion prompt, so the failing code and error message
# were never sent to the model.
fix_pig_to_pyspark_converter = prompt_pig2pyspark_regen | llm | JsonOutputParser()


In [None]:
## test with sample PIG code 
# load PIG code
pig_script_dir = './scripts/pig1.pig'
banner = '*' * 88

# Read the whole PIG script into memory and echo it for reference.
with open(pig_script_dir) as pig_file:
    sample_pig_code = pig_file.read()
print(sample_pig_code)

# Directory handed to the model as the place where the CSV should be written.
data_output_dir = './data'

########################################
# Ask the data-generation chain for Python code that produces sample input,
# then print the generated code between two separator lines.
datagen_inputs = {"sample_input_path": data_output_dir, "pig_code": sample_pig_code}
datagen_code = sample_input_code_generator.invoke(datagen_inputs)
print(banner)
print(datagen_code['data_gen_code'])
print(banner)

**NOTE:** Run `ollama pull <model-name>` (for example `ollama pull llama3`) before using the notebook. 

---