In [9]:
import warnings

# Install a blanket "ignore" filter so library warnings do not clutter cell output.
warnings.filterwarnings(action='ignore')

In [10]:
import os
import requests
from dotenv import load_dotenv
from langchain_core.documents import Document

In [None]:
import faiss
import numpy as np
import re  # For text cleaning
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import VectorStore

In [11]:
# Load variables from a local .env file before reading the token; reading it
# first would miss a token that is defined only in .env.
load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")

In [12]:
# Merge variables from a .env file (if one exists) into the process
# environment, then read the token that fetch_github() sends to GitHub.
load_dotenv()

github_token = os.environ.get("GITHUB_TOKEN")

In [13]:
def fetch_github(owner, repo, endpoint):
    """Fetch every page of a GitHub REST collection for one repository.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        endpoint: Path segment appended after ``/repos/{owner}/{repo}/``,
            for example ``"issues"``.

    Returns:
        list: The decoded JSON entries of all pages, concatenated in page
        order. An empty list is returned as soon as any request answers
        with a status code other than 200.

    Raises:
        requests.exceptions.RequestException: On connection problems or
            when a request exceeds the timeout.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
    # Only authenticate when a token is configured: sending the literal
    # "Bearer None" turns requests that would work anonymously into
    # authentication failures.
    headers = {"Authorization": f"Bearer {github_token}"} if github_token else {}
    all_data = []
    page = 1

    while True:
        response = requests.get(
            url,
            headers=headers,
            # Ask for the largest page size to cut the number of round trips.
            params={"page": page, "per_page": 100},
            # Without a timeout a stalled connection blocks the kernel forever.
            timeout=30,
        )
        if response.status_code != 200:
            print("Failed with status code:", response.status_code)
            return []

        data = response.json()
        if not data:  # An empty page means everything has been read
            break
        all_data.extend(data)
        page += 1

    return all_data


def fetch_github_issues(owner, repo, endpoint):
    """Download the ``endpoint`` entries of ``owner/repo`` and wrap them as documents.

    The raw entries come from ``fetch_github`` and are converted by
    ``load_issues``; the resulting list is returned unchanged.
    """
    raw_entries = fetch_github(owner, repo, endpoint)
    documents = load_issues(raw_entries, endpoint, repo)
    return documents


def load_issues(data, endpoint, repo):
    """Convert raw GitHub API entries into ``Document`` objects.

    The page content of each document is the entry title, followed by a
    space and the body when the body is non-empty.

    Args:
        data: Iterable of dict-like entries as decoded from the API response.
        endpoint: Endpoint the entries came from; stored as metadata ``type``.
        repo: Repository name; stored as metadata ``repo``.

    Returns:
        list: One ``Document`` per entry, in input order.

    Fields that are absent from an entry fall back to a default instead of
    raising ``KeyError``: ``author`` becomes ``"unknown_author"``,
    ``comments`` and ``body`` become ``None``, ``labels`` becomes ``[]`` and
    ``created_at`` becomes ``""``.
    """
    docs = []
    for entry in data:
        title = entry.get("title") or ""
        body = entry.get("body")
        # NOTE(review): entries from endpoints other than "issues" may not
        # carry every field, and "user" may be null -- confirm against the API.
        user = entry.get("user") or {}

        metadata = {
            "type": endpoint,
            "repo": repo,
            "author": user.get("login") or "unknown_author",
            "comments": entry.get("comments"),
            "body": body,
            "labels": entry.get("labels", []),
            # Keep only the first 10 characters (presumably the YYYY-MM-DD part).
            "created_at": (entry.get("created_at") or "")[:10],
        }

        page_content = f"{title} {body}" if body else title
        docs.append(Document(page_content=page_content, metadata=metadata))

    return docs

In [None]:
# Repository whose issues are indexed in this notebook.
owner, repo = "microsoft", "DeepSpeed"

# Download the repository's issues as Document objects.
docs = fetch_github_issues(owner=owner, repo=repo, endpoint="issues")

# Debug aid: uncomment to print the creation date of every fetched issue.
# for doc in docs:
#     print(f"Issue created at: {doc.metadata.get('created_at')}")

In [16]:
class FAISStore(VectorStore):
    """In-memory vector store backed by a flat FAISS L2 index.

    Texts are embedded with a SentenceTransformer model. ``self.documents``
    holds ``(label, Document)`` tuples in the same order in which their
    vectors were added to ``self.index``, so a FAISS result position can be
    used directly as an index into that list.
    """

    def __init__(self):
        # Embedding model used for both documents and queries.
        self._embeddings = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        d = 384  # Dimension of the embeddings the index is built for
        self.index = faiss.IndexFlatL2(d)  # Flat index, no clustering/training
        self.documents = []

    @property
    def embeddings(self):
        """Return the SentenceTransformer model used to embed text."""
        return self._embeddings

    def add_docs(self, docs):
        """Embed ``docs`` and append them to the index.

        Args:
            docs: Iterable of ``Document`` objects. An empty iterable is a
                no-op.
        """
        docs = list(docs)
        if not docs:
            # Nothing to encode; an empty array has no embedding dimension
            # and cannot be handed to the index.
            return

        # Encode all texts in one call so the model can batch them.
        vectors = np.asarray(
            self.embeddings.encode([doc.page_content for doc in docs]),
            dtype='float32',
        )

        # Labels are built with an f-string so a non-string metadata value
        # cannot raise during concatenation. They are descriptive only and
        # are not guaranteed to be unique.
        entries = []
        for doc in docs:
            label = (
                f"{doc.metadata.get('author', 'unknown_author')}"
                f"_{doc.metadata.get('type', 'unknown_type')}"
            )
            entries.append((label, doc))

        # Touch the index first and the document list second, so a failure
        # while adding vectors cannot leave the two out of step.
        self.index.add(vectors)
        self.documents.extend(entries)

    def search(self, query, k=1):
        """Return up to ``k`` stored documents closest to ``query``.

        Args:
            query: Query text.
            k: Maximum number of documents to return.

        Returns:
            list: ``Document`` objects in the order reported by the index.
        """
        query_embedding = self.embeddings.encode(query).astype('float32').reshape(1, -1)

        _, indices = self.index.search(query_embedding, k=k)

        # Negative positions are skipped: they do not refer to a stored document.
        return [self.documents[idx][1] for idx in indices[0] if idx >= 0]

    def similarity_search(self, query, k=1, **kwargs):
        """Return up to ``k`` documents closest to ``query``.

        Extra keyword arguments are accepted and ignored so that callers
        passing additional search options do not fail.
        """
        return self.search(query, k)

    def from_texts(self, texts, metadatas=None):
        """Create Documents from ``texts`` and add them to this store.

        Args:
            texts: Iterable of strings.
            metadatas: Optional list of metadata dicts paired with ``texts``
                by position. When omitted or empty, every text gets its own
                fresh empty dict.

        NOTE(review): the base class is understood to declare ``from_texts``
        as a classmethod factory; the instance-method form is kept here for
        backward compatibility -- confirm.
        """
        texts = list(texts)
        if not metadatas:
            # One dict per text; a shared dict would leak edits between documents.
            metadatas = [{} for _ in texts]
        docs = [
            Document(page_content=text, metadata=metadata)
            for text, metadata in zip(texts, metadatas)
        ]
        self.add_docs(docs)

In [None]:
# Build a fresh in-memory store for the sample query below.
store = FAISStore()
owner, repo = "microsoft", "DeepSpeed"

# Fetch the repository's GitHub issues and index them in FAISS.
docs = fetch_github_issues(owner=owner, repo=repo, endpoint="issues")
store.add_docs(docs)

# Look up the closest stored issue for a sample query.
result = store.similarity_search(query="Fix bug with hybrid engine generation")
print(result)

In [19]:
from langchain_groq import ChatGroq  # Assuming you are using Groq for chat
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain import hub
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import initialize_agent
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor
from langchain.prompts import PromptTemplate

In [27]:
# Marker file written after a successful load from GitHub.
FLAG_FILE = "data_loaded.flag"


class Agent:
    """Conversational agent that answers questions from indexed GitHub data.

    Typical use: ``initialize()`` to fill the vector store, ``make_agent()``
    to build the retriever tool and executor, then ``run_query()``.
    """

    def __init__(self):
        # Vector store holding the embedded GitHub documents
        self.vector_store = FAISStore()

        # Conversation memory; messages are kept as a list
        self.conversational_memory = ConversationBufferMemory(
            memory_key='chat_history',
            return_messages=True
        )

        # LLM used by the agent and for formatting the final answer
        self.llm = ChatGroq(
            temperature=0.0,
            model='llama-3.1-70b-versatile',
            api_key=os.getenv('GROQ_API_KEY'),
            verbose=True
        )

        # Both are created by make_agent(); None lets run_query() detect
        # that it was called too early.
        self.retriever_tool = None
        self.agent_executor = None

    def _run(self, response):
        """Ask the LLM to restructure and format ``response``; return its reply."""
        template = '''This is a response from github agent. Make the Response well Structured and formatted!!
        Here is the response from the agent: {response}'''

        prompt = PromptTemplate(template=template, input_variables=['response'])
        formatted_prompt = prompt.format(response=response)
        return self.llm.invoke(formatted_prompt)

    def _fetch_and_index(self, owner, repo, endpoint):
        """Fetch documents from GitHub and make them the store's only content.

        Returns:
            int: Number of documents indexed, or 0 when nothing was fetched
            (in which case the store is left untouched).
        """
        docs = fetch_github_issues(owner, repo, endpoint)
        if not docs:
            print("No documents fetched from GitHub.")
            return 0

        # Drop what is already stored so a re-fetch does not index every
        # document a second time.
        self.vector_store.index.reset()
        self.vector_store.documents.clear()
        self.vector_store.add_docs(docs)
        return len(docs)

    def initialize(self, owner, repo, endpoint):
        """Fill the vector store from GitHub, prompting before a re-fetch.

        Data is fetched without asking when the flag file is missing or the
        store holds no vectors. The store lives in memory only, so the flag
        file alone does not prove that data is available. Otherwise the user
        is asked whether to re-fetch.

        Args:
            owner: Repository owner.
            repo: Repository name.
            endpoint: GitHub endpoint to read, for example ``"issues"``.
        """
        store_is_empty = self.vector_store.index.ntotal == 0
        if store_is_empty or not os.path.exists(FLAG_FILE):
            print("No data found in the FAISS store. Fetching data from GitHub...")
            added = self._fetch_and_index(owner, repo, endpoint)
            if added:
                with open(FLAG_FILE, "w") as f:  # Record that a load succeeded
                    f.write("Data loaded")
                print(f"Added {added} documents to the FAISS store.")
            return

        user_input = input("Data is already loaded. Do you want to re-fetch it from GitHub? (yes/no): ").strip().lower()
        if user_input == 'yes':
            print("Re-fetching data from GitHub...")
            added = self._fetch_and_index(owner, repo, endpoint)
            if added:
                print(f"Added {added} documents to the FAISS store.")
        else:
            print("Using existing data from the FAISS store.")

    def make_agent(self):
        """Build the retriever tool and the agent executor used by run_query()."""
        # Expose the vector store through the retriever interface
        retriever = self.vector_store.as_retriever()

        self.retriever_tool = create_retriever_tool(
            retriever,
            "GitHub Search",
            'The user is asking question which is related to this tool .Use this tool for any question . It will search the GitHub repository for relevant issues and pull requests.'
        )

        tools = [self.retriever_tool]

        self.agent_executor = initialize_agent(
            llm=self.llm,
            agent='conversational-react-description',
            tools=tools,
            verbose=True,
            max_iterations=3,
            memory=self.conversational_memory
        )

    def run_query(self, query):
        """Run a query through the agent and return the formatted response.

        Raises:
            RuntimeError: If make_agent() has not been called yet.
        """
        if self.agent_executor is None:
            raise RuntimeError("Agent is not ready: call make_agent() before run_query().")
        response = self.agent_executor.invoke({"input": query})
        return self._run(response)

In [None]:
# Create the agent object.
agent = Agent()

# Load the DeepSpeed issues into its vector store, then build the agent tools.
agent.initialize("microsoft", "DeepSpeed", "issues")
agent.make_agent()