In [10]:
# POST /sre_query

import os
import json
from datetime import datetime, timedelta

import openai

from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferMemory

# ----- Keys ------ #
# Secrets are read from the environment so real values never have to be typed
# into (and committed with) the notebook. Each one falls back to "" when the
# variable is unset, which is the value this cell used before.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
HASURA_GRAPHQL_URL = os.environ.get("HASURA_GRAPHQL_URL", "")
PAT_TOKEN = os.environ.get("PAT_TOKEN", "")
HASURA_GRAPHQL_ADMIN_SECRET = os.environ.get("HASURA_GRAPHQL_ADMIN_SECRET", "")
openai.api_key = OPENAI_API_KEY

# ----- Additional headers to pass in Hasura request ------ #
# Merged on top of the default auth headers built by HasuraClient.
additional_headers = {}
# additional_headers = {'x-hasura-project-id': 'Project_2',
#                       'x-hasura-role': 'sre'}


# ----- Hasura utility ------ #
class HasuraClient:
    """Small wrapper around a ``gql`` client bound to one Hasura endpoint.

    The constructor builds the request headers (admin secret plus a
    ``pat <token>`` Authorization header), merges any caller-supplied headers
    on top, and stores the resulting ``gql`` ``Client`` on ``self.client``.
    """

    def __init__(self, hasura_graphql_url, hasura_graphql_admin_secret,
                 hasura_graphql_pat_token, additional_headers=None):
        """Create the underlying HTTP transport and GraphQL client.

        Args:
            hasura_graphql_url: URL handed to ``RequestsHTTPTransport``.
            hasura_graphql_admin_secret: Value of the ``x-hasura-admin-secret`` header.
            hasura_graphql_pat_token: Token placed in ``Authorization: pat <token>``.
            additional_headers: Optional mapping of extra headers. Entries here
                override the two default headers on key collision. ``None``
                (the default) means "no extra headers".
        """
        headers = {
            'x-hasura-admin-secret': hasura_graphql_admin_secret,
            'Authorization': 'pat {token}'.format(token=hasura_graphql_pat_token),
        }
        # `or {}` keeps an explicit None from blowing up inside dict.update().
        headers.update(additional_headers or {})

        transport = RequestsHTTPTransport(
            url=hasura_graphql_url,
            headers=headers,
        )
        self.client = Client(
            transport=transport
        )

    def execute_gql(self, query, variable_values=None):
        """Parse ``query`` with ``gql`` and execute it.

        Args:
            query: GraphQL document as a string.
            variable_values: Optional mapping of GraphQL variables; an empty
                dict is sent when omitted.

        Returns:
            Whatever ``self.client.execute`` returns for the query.
        """
        # A None sentinel avoids sharing one mutable default dict across calls.
        if variable_values is None:
            variable_values = {}
        return self.client.execute(gql(query), variable_values=variable_values)
    
# Shared module-level client used by the query helpers further down.
hcl = HasuraClient(
    hasura_graphql_url=HASURA_GRAPHQL_URL,
    hasura_graphql_admin_secret=HASURA_GRAPHQL_ADMIN_SECRET,
    hasura_graphql_pat_token=PAT_TOKEN,
    additional_headers=additional_headers,
)

# ----- Function schema for entity extraction from question ------ #
# OpenAI function-calling schema. The model "calls" fetch_incident_details
# when it decides the user wants an incident analysed; the arguments it
# produces are parsed by call_llm.
functions = [
    {
        "name": "fetch_incident_details",
        "description": "Identifies if the user is asking for analysis of an incident ticket and extracts the incident id",
        "parameters": {
            "type": "object",
            "properties": {
                "analysis_requested": {
                    "type": "boolean",
                    "description": "Boolean flag to indicate whether the user has requested for analysis."
                },
                "incident_id": {
                    "type": "string",
                    "description": "Incident id from the text on which the user has requested for analysis."
                },
            },
            # The consumer of these arguments reads "incident_id"
            # unconditionally, so the schema must not let the model omit it.
            "required": ["incident_id"],
        },
    }
]

# ----- Queries for context ------ #
# GraphQL document that selects the incident whose incident_id equals
# $IncidentId (declared Int64!) and, through incident_request_relationship,
# the metrics recorded for the request linked to that incident.
request_query = """
query MyQuery($IncidentId: Int64!) {
  incident(where: {incident_id: {_eq: $IncidentId}}) {
    incident_id
    request_id
    timestamp
    incident_request_relationship {
      cpu_usage
      execution_time
      memory_usage
      number_of_active_requests
      request_id
      server_id
      project_id
      timestamp
    }
  }
}

"""


# GraphQL document that selects aggregated_server_metrics rows for one server
# ($ServerId) whose timestamp lies strictly between $StartTime and $EndTime
# (_gt / _lt, so rows exactly on either bound are excluded).
system_stats_query = """
query MyQuery($StartTime: DateTime64!, $EndTime: DateTime64!, $ServerId: String!) {
  aggregated_server_metrics(where: {timestamp: {_gt: $StartTime, _lt: $EndTime}, server_id: {_eq: $ServerId}}) {
    avg_cpu_usage
    avg_memory_usage
    server_id
    timestamp
    total_requests
  }
}

"""

def fetch_db_data(incident_id, duration=6):
    """Fetch an incident's request record and the surrounding server metrics.

    Runs ``request_query`` for the incident, then ``system_stats_query`` for
    the server that handled the linked request, over a window of ``duration``
    hours on each side of the incident timestamp.

    Args:
        incident_id: Value passed as the ``IncidentId`` query variable.
        duration: Half-width of the metrics window, in hours.

    Returns:
        ``None`` when the first query yields no incident; otherwise the tuple
        ``(request_data, server_stats)`` holding both raw query results.

    Note:
        Assumes the matched incident has a non-null
        ``incident_request_relationship`` -- TODO confirm against the schema.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    request_data = hcl.execute_gql(request_query, {"IncidentId": incident_id})
    incidents = request_data.get('incident')
    if not incidents:
        return None
    incident = incidents[0]

    # Keep only the leading 'YYYY-MM-DD HH:MM:SS' (19 characters) so the value
    # parses whatever fractional-second precision follows it.
    time_of_incident = datetime.strptime(incident['timestamp'][:19], timestamp_format)
    window = timedelta(hours=duration)
    server_id = incident['incident_request_relationship']['server_id']

    server_stats = hcl.execute_gql(system_stats_query, {
        "StartTime": (time_of_incident - window).strftime(timestamp_format),
        "EndTime": (time_of_incident + window).strftime(timestamp_format),
        "ServerId": server_id,
    })

    return request_data, server_stats

def generate_context_augmented_question(incident_details):
    """Build the follow-up prompt sent to the model for an incident analysis.

    Args:
        incident_details: Mapping parsed from the model's function-call
            arguments; its ``incident_id`` entry selects the incident.

    Returns:
        A prompt string. Normally this is the RCA prompt with the request data
        and server stats from ``fetch_db_data`` embedded. When the incident id
        is missing, or no data exists for it, a short prompt is returned that
        instructs the model to tell the user so, instead of raising.
    """
    prompt = """
                You are provided with request and server stats data for the incident.
                Can you run RCA and identify the root cause of the incident?

                Request data: 
                {request_data}

                Server stats data:
                {server_stats}

                Analyse the data step by step and list down your analysis in bullet.
                Share next steps to resolve the incident.
                        
            """
    raw_incident_id = incident_details.get("incident_id")
    if raw_incident_id is None or str(raw_incident_id).strip() == "":
        # The model invoked the function without extracting an id.
        return ("The user asked for an incident analysis but no incident id "
                "could be identified in the request. "
                "Ask the user to provide the incident id.")

    incident_id = str(raw_incident_id)
    db_data = fetch_db_data(incident_id)
    if db_data is None:
        # fetch_db_data signals "no such incident" with None; unpacking it
        # directly would raise TypeError.
        return ("The user asked for an analysis of incident {incident_id}, "
                "but no request or server stats data was found for that "
                "incident id. Tell the user the incident could not be found "
                "and ask them to verify the incident id."
                ).format(incident_id=incident_id)

    request_data, server_stats = db_data
    return prompt.format(request_data=request_data, server_stats=server_stats)


def call_llm(question, functions=functions, model="gpt-3.5-turbo-0613"):
    """Answer ``question``, enriching it with incident data when requested.

    The question is first sent with the function schema attached. If the model
    responds with a function call, its arguments are parsed, turned into a
    context-augmented prompt by ``generate_context_augmented_question``, and
    that prompt is sent in a second, function-free request.

    Args:
        question: The user's question.
        functions: Function schemas offered to the model on the first request.
        model: Chat model used for every request made by this call.

    Returns:
        The ``content`` of the final assistant message.
    """
    def _complete(content, **extra):
        # Single-turn chat request; returns the assistant message.
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": content}],
            **extra,
        )
        return response["choices"][0]["message"]

    message = _complete(question, functions=functions, function_call="auto")
    function_call = message.get("function_call")
    if not function_call:
        return message["content"]

    # Function-call arguments are model-generated text and are not guaranteed
    # to be valid JSON (or a JSON object).
    try:
        incident_details = json.loads(function_call["arguments"])
    except (TypeError, ValueError):
        incident_details = None
    if not isinstance(incident_details, dict):
        # Nothing usable was extracted: answer the original question directly.
        return _complete(question)["content"]

    new_question = generate_context_augmented_question(incident_details)
    return _complete(new_question)["content"]


def handle_sre_question(request):
    """Answer the question carried in an incoming request payload.

    Args:
        request: Mapping with the question at
            ``request['body']['input']['question']``.

    Returns:
        The answer produced by ``call_llm`` for that question.
    """
    question = request['body']['input']['question']

    # Return the computed answer itself, not the literal string "answer".
    return call_llm(question)

# Cell entry point: decode REQUEST from JSON, answer the question it carries,
# and print the result JSON-encoded.
# NOTE(review): REQUEST is not defined in this cell; presumably it is injected
# by whatever serves the `# POST /sre_query` endpoint -- confirm.
print(json.dumps(handle_sre_question(json.loads(REQUEST))))

In [11]:
# Ad-hoc manual run: ask for an analysis of incident 3; the cell output is the
# text returned by call_llm.
call_llm("analysis incident 3")

"Analysis:\n- The request data shows that there was one incident with incident_id 3 and request_id 6 on August 1, 2023, at 00:00:00.000000000. The incident was related to a request handled by Server_2 for Project_3.\n- The CPU usage during the incident was 64.01%, the execution time was 22.24 seconds, the memory usage was 68.56%, and there were 77 active requests at that time.\n\n- The server stats data shows the aggregated metrics for Server_2 at different timestamps on August 1, 2023. The average CPU usage varied from 49.66% to 52.62%, and the average memory usage varied from 45.88% to 54.64%.\n- The total number of requests handled by Server_2 ranged from 5575 to 6286 during the different timestamps.\n\nNext steps to resolve the incident:\n1. Identify if the incident was caused by a server overload or performance issue. This can be done by comparing the CPU and memory usage during the incident with the average usage at that timestamp. If the incident's CPU and memory usage are signi