In [1]:
#!pip install langchain

In [20]:
import os
import argparse
import logging
import logging.config
import base64

from dotenv import load_dotenv
from github import Github
import src.github_utils as gu

from langchain.document_loaders.base import BaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.llms import OpenAI


In [21]:
# Load environment variables from .env file
load_dotenv()

GITHUB_API_TOKEN = os.getenv("GITHUB_API_TOKEN")
OPEN_API_TOKEN = os.getenv("OPENAI_API_TOKEN")
PINECONE_API_TOKEN = os.getenv("PINECONE_API_TOKEN")

In [22]:
g = Github(GITHUB_API_TOKEN)
g

<github.MainClass.Github at 0x11eaba010>

In [23]:
repo_name = "hyeniii/auto-readme"
repo = g.get_repo(repo_name)

In [6]:
contents = repo.get_contents("")
contents

[ContentFile(path=".gitignore"),
 ContentFile(path="LICENSE"),
 ContentFile(path="README.md"),
 ContentFile(path="config"),
 ContentFile(path="requirements.txt"),
 ContentFile(path="sandbox.py"),
 ContentFile(path="src")]

In [7]:
all_files = gu.get_all_files(repo)
all_files

{'.gitignore': 'LmVudgoudmVudi8KLnZzY29kZS8KKi5sb2cKKi5weWM=\n',
 'LICENSE': 'TUlUIExpY2Vuc2UKCkNvcHlyaWdodCAoYykgMjAyMyBIeWUgV29uIChOaWNv\nbGUpIEh3YW5nCgpQZXJtaXNzaW9uIGlzIGhlcmVieSBncmFudGVkLCBmcmVl\nIG9mIGNoYXJnZSwgdG8gYW55IHBlcnNvbiBvYnRhaW5pbmcgYSBjb3B5Cm9m\nIHRoaXMgc29mdHdhcmUgYW5kIGFzc29jaWF0ZWQgZG9jdW1lbnRhdGlvbiBm\naWxlcyAodGhlICJTb2Z0d2FyZSIpLCB0byBkZWFsCmluIHRoZSBTb2Z0d2Fy\nZSB3aXRob3V0IHJlc3RyaWN0aW9uLCBpbmNsdWRpbmcgd2l0aG91dCBsaW1p\ndGF0aW9uIHRoZSByaWdodHMKdG8gdXNlLCBjb3B5LCBtb2RpZnksIG1lcmdl\nLCBwdWJsaXNoLCBkaXN0cmlidXRlLCBzdWJsaWNlbnNlLCBhbmQvb3Igc2Vs\nbApjb3BpZXMgb2YgdGhlIFNvZnR3YXJlLCBhbmQgdG8gcGVybWl0IHBlcnNv\nbnMgdG8gd2hvbSB0aGUgU29mdHdhcmUgaXMKZnVybmlzaGVkIHRvIGRvIHNv\nLCBzdWJqZWN0IHRvIHRoZSBmb2xsb3dpbmcgY29uZGl0aW9uczoKClRoZSBh\nYm92ZSBjb3B5cmlnaHQgbm90aWNlIGFuZCB0aGlzIHBlcm1pc3Npb24gbm90\naWNlIHNoYWxsIGJlIGluY2x1ZGVkIGluIGFsbApjb3BpZXMgb3Igc3Vic3Rh\nbnRpYWwgcG9ydGlvbnMgb2YgdGhlIFNvZnR3YXJlLgoKVEhFIFNPRlRXQVJF\nIElTIFBST1ZJREVEICJBUyBJUyIsIFdJVEhPVVQgV0FSUkFOVFkgT0

In [8]:
decoded_files = gu.decode_files(all_files)
decoded_files

{'.gitignore': '.env\n.venv/\n.vscode/\n*.log\n*.pyc',
 'LICENSE': 'MIT License\n\nCopyright (c) 2023 Hye Won (Nicole) Hwang\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nL

In [9]:
decoded_files_flatten = gu.flatten(decoded_files)
decoded_files_flatten

{'.gitignore': '.env\n.venv/\n.vscode/\n*.log\n*.pyc',
 'LICENSE': 'MIT License\n\nCopyright (c) 2023 Hye Won (Nicole) Hwang\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nL

In [10]:
# Initialize the SourceCodeLoader with the decoded files
class CustomCodeLoader(BaseLoader):
    # Mapping of file extensions to their corresponding languages
    extension_to_language = {
        '.py': 'python',
        '.ipynb': 'jupyter notebook',
        '.r': 'R',
        '.js': 'javascript',
        # Add more mappings as needed
    }

    def __init__(self, files):
        self.files = files

    def get_language_from_extension(self, file_path):
        _, extension = os.path.splitext(file_path)
        return self.extension_to_language.get(extension.lower(), 'unknown')

    def load(self):
        for path, content in self.files.items():
            if isinstance(content, str):
                language = self.get_language_from_extension(path)
                document = {
                    'page_content': content,
                    'metadata': {
                        'path': path,
                        'language': language
                    }
                }
                yield document
                
custom_loader = CustomCodeLoader(decoded_files_flatten)

In [11]:
# EXAMPLE OF CUSTOM LOADER
for document in custom_loader.load():
    # Each document's 'content' is the file content
    # and 'metadata' contains the file path and deduced language
    print(document['metadata']['path'], document['metadata']['language'])

.gitignore unknown
LICENSE unknown
README.md unknown
config/logs/local.conf unknown
requirements.txt unknown
sandbox.py python
src/__init__.py python
src/github_utils.py python


In [12]:
documents = list(custom_loader.load())
documents

[{'page_content': '.env\n.venv/\n.vscode/\n*.log\n*.pyc',
  'metadata': {'path': '.gitignore', 'language': 'unknown'}},
 {'page_content': 'MIT License\n\nCopyright (c) 2023 Hye Won (Nicole) Hwang\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAU

In [13]:
texts = [document["page_content"] for document in documents]
metadata = [document["metadata"] for document in documents]

In [14]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 100,
    chunk_overlap  = 20,
    length_function = len,
    add_start_index = True,
)

documents = text_splitter.create_documents(texts, metadata)
documents
chunks = text_splitter.split_documents(documents)
chunks

[Document(page_content='.env\n.venv/\n.vscode/\n*.log\n*.pyc', metadata={'path': '.gitignore', 'language': 'unknown', 'start_index': 0}),
 Document(page_content='MIT License\n\nCopyright (c) 2023 Hye Won (Nicole) Hwang', metadata={'path': 'LICENSE', 'language': 'unknown', 'start_index': 0}),
 Document(page_content='Permission is hereby granted, free of charge, to any person obtaining a copy', metadata={'path': 'LICENSE', 'language': 'unknown', 'start_index': 0}),
 Document(page_content='of this software and associated documentation files (the "Software"), to deal', metadata={'path': 'LICENSE', 'language': 'unknown', 'start_index': 0}),
 Document(page_content='in the Software without restriction, including without limitation the rights', metadata={'path': 'LICENSE', 'language': 'unknown', 'start_index': 0}),
 Document(page_content='to use, copy, modify, merge, publish, distribute, sublicense, and/or sell', metadata={'path': 'LICENSE', 'language': 'unknown', 'start_index': 0}),
 Document

In [15]:
#!pip install pinecone-client
#!pip install tiktoken 
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone


# --- Vector Storage --- 

# Initialize pinecone
pinecone.init(api_key = PINECONE_API_TOKEN, environment = 'gcp-starter')

INDEX_NAME = "final-project"

# Delete exiting index
#pinecone.delete_index("gen-ai-hw7-ali8110")
pinecone.delete_index(INDEX_NAME)

# Create index from zero
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(INDEX_NAME, dimension=1536, metric="cosine")
    
# Connect to pinecone index
index = pinecone.Index(index_name = INDEX_NAME)

# Initialize the embeddings model with OpenAI
embeddings_model = OpenAIEmbeddings(openai_api_key=OPEN_API_TOKEN, disallowed_special=())

# Upload documents 
docs_upload = Pinecone.from_documents(chunks, embeddings_model, index_name = INDEX_NAME)

# --- Set Retrieval ---
retriever = docs_upload.as_retriever()

  from tqdm.autonotebook import tqdm


In [16]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
model = ChatOpenAI(openai_api_key = OPEN_API_TOKEN,
                   model_name = "gpt-4",
                   temperature = 0.7)

# Create ConversationalRetrievalChain
qa_chain = ConversationalRetrievalChain.from_llm(model, retriever = retriever)

In [24]:
question = "In which language is the code written?"
chat_history = []
response = qa_chain({"question": question, "chat_history": chat_history})
response["answer"]

"The text doesn't provide information about the language in which the code is written."

In [25]:
question = "What is the purpose of the code?"
chat_history = []
response = qa_chain({"question": question, "chat_history": chat_history})
response["answer"]

'The purpose of the code is to automatically generate `readme.md` files for repositories by parsing the code and summarizing it.'

In [27]:
question = "What are the files in the repository?"
chat_history = []
response = qa_chain({"question": question, "chat_history": chat_history})
response["answer"]

"The text doesn't provide specific information about what files are in the repository."