In [2]:
#!pip install langchain

In [1]:
import os
import argparse
import logging
import logging.config
import base64
import sys

from dotenv import load_dotenv
from github import Github

import src.github_utils as gu

from langchain.document_loaders.base import BaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.llms import OpenAI

from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema import BaseOutputParser


In [2]:
# Load environment variables from .env file
load_dotenv()

# Credentials are read from the environment; os.getenv returns None when a variable is unset.
GITHUB_API_TOKEN = os.getenv("GITHUB_API_TOKEN")
# Note: the Python name is OPEN_API_TOKEN, while the environment variable read is OPENAI_API_TOKEN.
OPEN_API_TOKEN = os.getenv("OPENAI_API_TOKEN")
# NOTE(review): PINECONE_API_TOKEN is not used by any cell visible in this section.
PINECONE_API_TOKEN = os.getenv("PINECONE_API_TOKEN")

In [3]:
# Create the GitHub API client with the token read from the environment
g = Github(GITHUB_API_TOKEN)
g

<github.MainClass.Github at 0x1072a4f10>

In [4]:
# Repository to document, written as "owner/name"
repo_name = "hyeniii/auto-readme"
# Look the repository up through the GitHub client
repo = g.get_repo(repo_name)

In [None]:
#collabs = repo.get_collaborators()
#for i in collabs:
#    print(i)

In [5]:
# List the entries at the repository root (the empty string is the top-level path)
contents = repo.get_contents("")
contents

[ContentFile(path=".gitignore"),
 ContentFile(path="LICENSE"),
 ContentFile(path="README.md"),
 ContentFile(path="config"),
 ContentFile(path="requirements.txt"),
 ContentFile(path="sandbox.py"),
 ContentFile(path="src")]

In [6]:
# Collect the repository's files with the project helper.
# Judging by the displayed result, values are base64-encoded file contents keyed by path
# -- confirm against src/github_utils.py.
all_files = gu.get_all_files(repo)
all_files

{'.gitignore': 'LmVudgoudmVudi8KLnZzY29kZS8KKi5sb2cKKi5weWM=\n',
 'LICENSE': 'TUlUIExpY2Vuc2UKCkNvcHlyaWdodCAoYykgMjAyMyBIeWUgV29uIChOaWNv\nbGUpIEh3YW5nCgpQZXJtaXNzaW9uIGlzIGhlcmVieSBncmFudGVkLCBmcmVl\nIG9mIGNoYXJnZSwgdG8gYW55IHBlcnNvbiBvYnRhaW5pbmcgYSBjb3B5Cm9m\nIHRoaXMgc29mdHdhcmUgYW5kIGFzc29jaWF0ZWQgZG9jdW1lbnRhdGlvbiBm\naWxlcyAodGhlICJTb2Z0d2FyZSIpLCB0byBkZWFsCmluIHRoZSBTb2Z0d2Fy\nZSB3aXRob3V0IHJlc3RyaWN0aW9uLCBpbmNsdWRpbmcgd2l0aG91dCBsaW1p\ndGF0aW9uIHRoZSByaWdodHMKdG8gdXNlLCBjb3B5LCBtb2RpZnksIG1lcmdl\nLCBwdWJsaXNoLCBkaXN0cmlidXRlLCBzdWJsaWNlbnNlLCBhbmQvb3Igc2Vs\nbApjb3BpZXMgb2YgdGhlIFNvZnR3YXJlLCBhbmQgdG8gcGVybWl0IHBlcnNv\nbnMgdG8gd2hvbSB0aGUgU29mdHdhcmUgaXMKZnVybmlzaGVkIHRvIGRvIHNv\nLCBzdWJqZWN0IHRvIHRoZSBmb2xsb3dpbmcgY29uZGl0aW9uczoKClRoZSBh\nYm92ZSBjb3B5cmlnaHQgbm90aWNlIGFuZCB0aGlzIHBlcm1pc3Npb24gbm90\naWNlIHNoYWxsIGJlIGluY2x1ZGVkIGluIGFsbApjb3BpZXMgb3Igc3Vic3Rh\nbnRpYWwgcG9ydGlvbnMgb2YgdGhlIFNvZnR3YXJlLgoKVEhFIFNPRlRXQVJF\nIElTIFBST1ZJREVEICJBUyBJUyIsIFdJVEhPVVQgV0FSUkFOVFkgT0

In [7]:
# Decode the fetched contents into plain text with the project helper
# (the displayed result shows readable file text in place of the encoded strings).
decoded_files = gu.decode_files(all_files)
decoded_files

{'.gitignore': '.env\n.venv/\n.vscode/\n*.log\n*.pyc',
 'LICENSE': 'MIT License\n\nCopyright (c) 2023 Hye Won (Nicole) Hwang\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nL

In [8]:
# Flatten the decoded mapping with the project helper.
# Presumably nested directory entries become slash-separated path keys
# (a later cell prints 'config/logs/local.conf') -- confirm against src/github_utils.py.
decoded_files_flatten = gu.flatten(decoded_files)
decoded_files_flatten

{'.gitignore': '.env\n.venv/\n.vscode/\n*.log\n*.pyc',
 'LICENSE': 'MIT License\n\nCopyright (c) 2023 Hye Won (Nicole) Hwang\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nL

In [9]:
# Define CustomCodeLoader and initialize it with the decoded, flattened files
class CustomCodeLoader(BaseLoader):
    """Loader that turns an in-memory mapping of file path -> text into document dicts.

    Every emitted document is a plain dict with a 'page_content' string and a
    'metadata' dict holding the file 'path' and the 'language' inferred from
    the file extension.
    """

    # File extension (lower-case, including the leading dot) -> language label
    extension_to_language = {
        '.py': 'python',
        '.ipynb': 'jupyter notebook',
        '.r': 'R',
        '.js': 'javascript',
        # Add more mappings as needed
    }

    def __init__(self, files):
        """Keep a reference to the mapping of file path -> file content."""
        self.files = files

    def get_language_from_extension(self, file_path):
        """Return the language label for ``file_path``; 'unknown' when the extension is not mapped."""
        suffix = os.path.splitext(file_path)[1].lower()
        return self.extension_to_language.get(suffix, 'unknown')

    def load(self):
        """Lazily yield one document dict for each entry whose content is a string."""
        for file_path, text in self.files.items():
            if not isinstance(text, str):
                # Entries whose value is not a string are skipped.
                continue
            metadata = {
                'path': file_path,
                'language': self.get_language_from_extension(file_path),
            }
            yield {'page_content': text, 'metadata': metadata}
                
# Build the loader over the flattened path -> content mapping
custom_loader = CustomCodeLoader(decoded_files_flatten)

In [10]:
# EXAMPLE OF CUSTOM LOADER
for document in custom_loader.load():
    # Each document's 'page_content' is the file content
    # and 'metadata' contains the file path and deduced language
    print(document['metadata']['path'], document['metadata']['language'])

.gitignore unknown
LICENSE unknown
README.md unknown
config/logs/local.conf unknown
requirements.txt unknown
sandbox.py python
src/__init__.py python
src/github_utils.py python


In [11]:
# Materialize the loader's generator into a list so the documents can be indexed and reused
documents = list(custom_loader.load())
documents

[{'page_content': '.env\n.venv/\n.vscode/\n*.log\n*.pyc',
  'metadata': {'path': '.gitignore', 'language': 'unknown'}},
 {'page_content': 'MIT License\n\nCopyright (c) 2023 Hye Won (Nicole) Hwang\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAU

### Get Repo Structure

In [12]:
# --- OUTPUT PARSER ---    
class MarkdownTreeStructureOutputParser(BaseOutputParser):
    """Parse the output of an LLM call to format the repository structure as a tree suitable for Markdown."""

    def parse(self, text: str):
        """Turn the model's reply into a tree wrapped in a Markdown code fence.

        Args:
            text: Raw model output. Expected to hold one file path per line;
                a reply that is already drawn as a tree (box-drawing
                connectors) is accepted as well.

        Returns:
            The tree as a string enclosed in a triple-backtick fence.
        """
        # Drop blank lines and any fence the model added itself; otherwise they
        # would show up as bogus entries inside the fence emitted below.
        lines = [
            line.rstrip()
            for line in text.splitlines()
            if line.strip() and not line.strip().startswith("```")
        ]

        # A reply that already contains tree connectors is kept line by line.
        # Feeding it through the path-based builder would treat every line as a
        # dictionary key and silently merge identical lines (for example two
        # directories that both contain the same file name).
        if any(glyph in line for line in lines for glyph in "├└│"):
            body = "".join(f"{line}\n" for line in lines)
        else:
            body = self.format_tree(self.build_tree_structure(lines))

        return "```\n" + body + "```"

    def build_tree_structure(self, paths):
        """Build a nested dict from slash-separated paths.

        Args:
            paths: Iterable of path strings such as "src/github_utils.py".

        Returns:
            A dict of dicts in which each key is one path component and the
            value holds that component's children (empty dict for leaves).
        """
        tree = {}
        for path in paths:
            node = tree
            for part in path.strip().split('/'):
                if not part:
                    # Leading, trailing or doubled slashes (and blank lines)
                    # yield empty components; skip them instead of creating
                    # nameless nodes.
                    continue
                node = node.setdefault(part, {})
        return tree

    def format_tree(self, tree, indent=0):
        """Recursively render the nested dict, one entry per line.

        Args:
            tree: Nested dict as produced by ``build_tree_structure``.
            indent: Current depth; each level is indented by four spaces.

        Returns:
            The rendered text; every entry is terminated by a newline.
        """
        pieces = []
        for name, children in tree.items():
            pieces.append("    " * indent + f"{name}\n")  # Adjust the indentation if needed
            if isinstance(children, dict):
                pieces.append(self.format_tree(children, indent + 1))
        return "".join(pieces)

# --- PROMPT TEMPLATE: REPO STRUCTURE ---
# System message: sets the assistant's role and asks for a bare tree with no explanation.
template = """
You are a helpful assistant who helps to build a README file. 
Return your answer as a tree generator with no further explanations. 
"""
# Human message: {list_of_paths} is filled in at invoke time.
# NOTE(review): the prompt text contains typos ("This are", "reposoitory") -- consider cleaning up.
human_template = 'This are the list of paths of the reposoitory files: {list_of_paths}. Give me the repo structure.'

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])

# Pipeline: prompt -> chat model -> parser that wraps the result in a Markdown code fence.
# NOTE: `template`, `human_template`, `chat_prompt` and `chain` are rebound by later sections.
chain = chat_prompt | ChatOpenAI(openai_api_key=OPEN_API_TOKEN) | MarkdownTreeStructureOutputParser()
# The paths are taken from each loaded document's metadata and passed as a Python list.
chain.invoke({"list_of_paths": [document["metadata"]["path"] for document in documents]})

'```\n.\n├── .gitignore\n├── LICENSE\n├── README.md\n├── config\n│   └── logs\n│       └── local.conf\n├── requirements.txt\n├── sandbox.py\n└── src\n    ├── __init__.py\n    └── github_utils.py\n```'

### Getting started instructions

In [22]:
class FormattedOutputConverToText(BaseOutputParser):
    """Output parser that hands the LLM reply back as a plain string."""

    def parse(self, text: str):
        """Return the model output rendered as a string, with no extra formatting applied."""
        # format() with an empty spec performs the same conversion as f"{text}".
        return format(text)
    

# --- PROMPT TEMPLATE: GETTING STARTED INSTRUCTIONS ---
# NOTE(review): the system prompt contains typos ("helps built", "how to to") and a stray
# double quote after "dependencies." -- consider cleaning up.
template = """
You are a helpful assistant who helps built a README file for a Github repository. 
Give me instructions on how to to get started. detailing the steps for cloning the repository 
and the steps for installing dependencies. "

"""
# Human message: carries the repo name, the list of file paths and the list of file contents.
human_template = "Repo Name: {repo}. List of files: {file_paths}. Content of files: {contents} "

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])

# Pipeline: prompt -> chat model -> pass-through text parser.
# NOTE: this rebinds the notebook-level `template`, `human_template`, `chat_prompt` and `chain`.
chain = chat_prompt | ChatOpenAI(openai_api_key=OPEN_API_TOKEN) | FormattedOutputConverToText()
# The content of every loaded document is sent in this single request.
chain.invoke({"repo": repo_name, "file_paths": [doc["metadata"]["path"] for doc in documents],"contents": [doc["page_content"] for doc in documents]})

"To get started with the hyeniii/auto-readme repository, you'll need to follow these steps:\n\n## Cloning the Repository\n1. Open your terminal or command prompt.\n2. Change the current working directory to the location where you want to clone the repository.\n3. Run the following command to clone the repository:\n   ```\n   git clone https://github.com/hyeniii/auto-readme.git\n   ```\n4. Once the cloning process is complete, change the current working directory to the cloned repository:\n   ```\n   cd auto-readme\n   ```\n\n## Installing Dependencies\n1. Make sure you have Python 3.8 or higher installed on your system. If not, please install it before proceeding.\n2. Create a virtual environment by running the following command:\n   ```\n   python -m venv .venv\n   ```\n3. Activate the virtual environment:\n   - For Windows:\n     ```\n     .venv\\Scripts\\activate\n     ```\n   - For macOS and Linux:\n     ```\n     source .venv/bin/activate\n     ```\n4. Install the required package

### Get file summary: example for one file

In [None]:
# Example: github_utils.py
# Select the document by its path instead of a fixed list position, so the
# example does not depend on the order in which the repository files were loaded.
d1 = next(doc for doc in documents if doc["metadata"]["path"] == "src/github_utils.py")
d1

In [None]:
# --- OUTPUT PARSER ---    
class FormattedOutputParserSummary(BaseOutputParser):
    """Parse the output of an LLM call to format the file path in bold and the summarization as a paragraph."""

    def parse(self, text: str):
        """Reformat a "File Path: <path> Summary: <summary>" reply for Markdown.

        Args:
            text: Raw model output. The "Summary:" marker may follow the path
                on the same line or start a new line.

        Returns:
            The file path in bold, a blank line, then the summary. When the
            reply has no "Summary:" marker the whole reply is used as the
            path part and the summary is empty.
        """
        # Split on the first "Summary:" marker only. Splitting on
        # "\nSummary: " missed replies that keep both fields on one line and
        # discarded any text following a second marker.
        head, _marker, tail = text.partition("Summary:")
        file_path = head.replace("File Path:", "").strip()
        summary = tail.strip()

        # Format the file path in bold and the summary as a paragraph
        return f"**{file_path}** \n\n{summary}"
    

# --- PROMPT TEMPLATE: SUMMARIZE ONE FILE ---
# System message: asks for the reply in a "File Path: ... Summary: ..." layout,
# which is what FormattedOutputParserSummary looks for.
template = """
You are a helpful assistant who generates summarizations of code to build a README file. 
Return your response in the format File Path: the_file_path Summary: the_summary
"""
# Human message: {path} and {content} are filled in per file at invoke time.
human_template = "The file {path} has this content: {content}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])

# To inspect the rendered messages without calling the model:
# chat_prompt.format_messages(path=d1["metadata"]["path"], content=d1["page_content"])
    

# Pipeline: prompt -> chat model -> summary formatter.
# NOTE: this rebinds the notebook-level `template`, `human_template`, `chat_prompt` and `chain`.
chain = chat_prompt | ChatOpenAI(openai_api_key=OPEN_API_TOKEN) | FormattedOutputParserSummary()
chain.invoke({"path": d1["metadata"]["path"],"content": d1["page_content"]})

### Get file summaries: Example for whole repo

In [None]:
# Summarize every loaded document, making one model call per file.
# `chain` must be the file-summary chain (prompt with {path} and {content}) at this point;
# run the cell that builds it immediately before this one.
# A comprehension keeps loop temporaries out of the notebook's global namespace.
summaries = [
    chain.invoke({"path": doc["metadata"]["path"], "content": doc["page_content"]})
    for doc in documents
]

summaries


### Overview

In [21]:

# --- PROMPT TEMPLATE: REPO OVERVIEW ---
# System message: asks for a one-paragraph overview of the repository.
template = """
You are a helpful assistant who generates summarizations of code to build a README file. 
Give me a one paragraph with a brief overview of what is the repo for. 
"""
# Human message: {documents} is filled in at invoke time.
human_template = "Repo content: {documents}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])

    

# Pipeline: prompt -> chat model -> pass-through text parser.
# NOTE: this rebinds the notebook-level `template`, `human_template`, `chat_prompt` and `chain`.
chain = chat_prompt | ChatOpenAI(openai_api_key=OPEN_API_TOKEN) | FormattedOutputConverToText()
# The whole list of document dicts (content and metadata) is passed as the prompt variable.
chain.invoke({"documents": documents})

AIMessage(content="The repository contains code for an Auto-Readme tool. This tool automatically generates `readme.md` files for repositories by parsing the code and summarizing it using the OpenAI API. It has features such as parsing GitHub repositories without downloading them, summarizing code using Langchain, and generating human-readable summaries with OpenAI's GPT model. The prerequisites for using this tool include Python 3.8+, a GitHub API Token, and an OpenAI API Key. The installation process involves cloning the repository, creating a virtual environment, installing the required packages, and creating a `.env` file with the API tokens. The code files in the repository include a `.gitignore` file, a `LICENSE` file, a `README.md` file with detailed instructions, a logging configuration file, a `requirements.txt` file with the required dependencies, a `sandbox.py` file containing the main functionality of the tool, an `__init__.py` file, and a `github_utils.py` file with utility